From 5326b2706673ae2c38f241f54d0edbb59a25c8e2 Mon Sep 17 00:00:00 2001 From: Debadri Basak Date: Mon, 3 Nov 2025 11:20:48 +0000 Subject: [PATCH 001/539] Adding implementation for LifetimeSafetyAnalysis::PrintStats --- .../Analyses/LifetimeSafety/LifetimeSafety.h | 8 ++++++++ .../Analysis/Analyses/LifetimeSafety/Origins.h | 6 ++++++ .../Analysis/LifetimeSafety/LifetimeSafety.cpp | 18 ++++++++++++++++++ clang/lib/Analysis/LifetimeSafety/Origins.cpp | 17 +++++++++++++++++ clang/lib/Sema/AnalysisBasedWarnings.cpp | 3 +++ 5 files changed, 52 insertions(+) diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index 91ffbb169f947..e5ac4ca0d01c0 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -23,7 +23,10 @@ #include "clang/Analysis/Analyses/LifetimeSafety/Facts.h" #include "clang/Analysis/Analyses/LifetimeSafety/LiveOrigins.h" #include "clang/Analysis/Analyses/LifetimeSafety/LoanPropagation.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" #include "clang/Analysis/AnalysisDeclContext.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/raw_ostream.h" namespace clang::lifetimes { @@ -73,7 +76,12 @@ class LifetimeSafetyAnalysis { LiveOriginsAnalysis &getLiveOrigins() const { return *LiveOrigins; } FactManager &getFactManager() { return FactMgr; } + static void PrintStats(llvm::raw_ostream& OS); + + static void UpdateMissingOriginCount(const OriginManager& OM); + private: + static llvm::StringMap MissingOriginMap; AnalysisDeclContext &AC; LifetimeSafetyReporter *Reporter; LifetimeFactory Factory; diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h index ba138b078b379..231cc60b7e097 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h 
+++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h @@ -16,7 +16,10 @@ #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" +#include "clang/AST/TypeBase.h" #include "clang/Analysis/Analyses/LifetimeSafety/Utils.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/raw_ostream.h" namespace clang::lifetimes::internal { @@ -76,6 +79,8 @@ class OriginManager { void dump(OriginID OID, llvm::raw_ostream &OS) const; + const llvm::StringMap getMissingOrigins() const; + private: OriginID getNextOriginID() { return NextOriginID++; } @@ -85,6 +90,7 @@ class OriginManager { llvm::SmallVector AllOrigins; llvm::DenseMap DeclToOriginID; llvm::DenseMap ExprToOriginID; + llvm::StringMap ExprTypeToMissingOriginCount; }; } // namespace clang::lifetimes::internal diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index 00c7ed90503e7..5ad18ee26c174 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -23,18 +23,35 @@ #include "clang/Analysis/AnalysisDeclContext.h" #include "clang/Analysis/CFG.h" #include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/StringMap.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TimeProfiler.h" +#include "llvm/Support/raw_ostream.h" #include namespace clang::lifetimes { namespace internal { +llvm::StringMap LifetimeSafetyAnalysis::MissingOriginMap; + LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter) : AC(AC), Reporter(Reporter) {} +void LifetimeSafetyAnalysis::PrintStats(llvm::raw_ostream& OS) { + llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats (expression_type : count) :\n"; + for (const auto& [expr, count] : LifetimeSafetyAnalysis::MissingOriginMap) { + OS << expr << " : " << count << '\n'; + } +} + +void LifetimeSafetyAnalysis::UpdateMissingOriginCount(const OriginManager& 
OM) { + for (const auto& [expr, missing_origin_count] : OM.getMissingOrigins()) { + LifetimeSafetyAnalysis::MissingOriginMap[std::string(expr)] += missing_origin_count; + } +} + void LifetimeSafetyAnalysis::run() { llvm::TimeTraceScope TimeProfile("LifetimeSafetyAnalysis"); @@ -66,6 +83,7 @@ void LifetimeSafetyAnalysis::run() { LiveOrigins->dump(llvm::dbgs(), FactMgr.getTestPoints())); runLifetimeChecker(*LoanPropagation, *LiveOrigins, FactMgr, AC, Reporter); + UpdateMissingOriginCount(FactMgr.getOriginMgr()); } } // namespace internal diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp index ea51a75324e06..c8570844fe314 100644 --- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" +#include "clang/AST/TypeBase.h" +#include "llvm/ADT/StringMap.h" namespace clang::lifetimes::internal { @@ -22,6 +24,11 @@ void OriginManager::dump(OriginID OID, llvm::raw_ostream &OS) const { OS << ")"; } +const llvm::StringMap OriginManager::getMissingOrigins() const { + return ExprTypeToMissingOriginCount; +} + + Origin &OriginManager::addOrigin(OriginID ID, const clang::ValueDecl &D) { AllOrigins.emplace_back(ID, &D); return AllOrigins.back(); @@ -37,6 +44,16 @@ OriginID OriginManager::get(const Expr &E) { auto It = ExprToOriginID.find(&E); if (It != ExprToOriginID.end()) return It->second; + + // if the expression has no specific origin, increment the missing origin counter. 
+ const QualType ExprType = E.getType(); + auto CountIt = ExprTypeToMissingOriginCount.find(ExprType.getAsString()); + if (CountIt == ExprTypeToMissingOriginCount.end()) { + ExprTypeToMissingOriginCount[ExprType.getAsString()] = 1; + } else { + CountIt->second++; + } + // If the expression itself has no specific origin, and it's a reference // to a declaration, its origin is that of the declaration it refers to. // For pointer types, where we don't pre-emptively create an origin for the diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 140b709dbb651..ca74c637bb92f 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -30,6 +30,7 @@ #include "clang/Analysis/Analyses/CalledOnceCheck.h" #include "clang/Analysis/Analyses/Consumed.h" #include "clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" #include "clang/Analysis/Analyses/ReachableCode.h" #include "clang/Analysis/Analyses/ThreadSafety.h" #include "clang/Analysis/Analyses/UninitializedValues.h" @@ -53,6 +54,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include #include #include @@ -3132,6 +3134,7 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( } void clang::sema::AnalysisBasedWarnings::PrintStats() const { + clang::lifetimes::internal::LifetimeSafetyAnalysis::PrintStats(llvm::errs()); llvm::errs() << "\n*** Analysis Based Warnings Stats:\n"; unsigned NumCFGsBuilt = NumFunctionsAnalyzed - NumFunctionsWithBadCFGs; From b923858250c94c233e6b17a64bf28cca7ec4dbf2 Mon Sep 17 00:00:00 2001 From: nerix Date: Tue, 28 Oct 2025 11:43:47 +0100 Subject: [PATCH 002/539] [LLDB][PDB] Run `function-nested-block.test` with both plugins (#165364) This test passes with both plugins, but only ran with the DIA plugin. 
It was fixed with #161678, where I missed this test. --- lldb/test/Shell/SymbolFile/PDB/function-nested-block.test | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lldb/test/Shell/SymbolFile/PDB/function-nested-block.test b/lldb/test/Shell/SymbolFile/PDB/function-nested-block.test index 4a2355bf23c9a..a18955b18151f 100644 --- a/lldb/test/Shell/SymbolFile/PDB/function-nested-block.test +++ b/lldb/test/Shell/SymbolFile/PDB/function-nested-block.test @@ -1,7 +1,9 @@ REQUIRES: system-windows, lld RUN: %build --compiler=clang-cl --nodefaultlib --output=%t.exe %S/Inputs/FunctionNestedBlockTest.cpp -RUN: lldb-test symbols -find=function -file FunctionNestedBlockTest.cpp -line 4 %t.exe | FileCheck --check-prefix=CHECK-FUNCTION %s -RUN: lldb-test symbols -find=block -file FunctionNestedBlockTest.cpp -line 4 %t.exe | FileCheck --check-prefix=CHECK-BLOCK %s +RUN: env LLDB_USE_NATIVE_PDB_READER=0 lldb-test symbols -find=function -file FunctionNestedBlockTest.cpp -line 4 %t.exe | FileCheck --check-prefix=CHECK-FUNCTION %s +RUN: env LLDB_USE_NATIVE_PDB_READER=0 lldb-test symbols -find=block -file FunctionNestedBlockTest.cpp -line 4 %t.exe | FileCheck --check-prefix=CHECK-BLOCK %s +RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -find=function -file FunctionNestedBlockTest.cpp -line 4 %t.exe | FileCheck --check-prefix=CHECK-FUNCTION %s +RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -find=block -file FunctionNestedBlockTest.cpp -line 4 %t.exe | FileCheck --check-prefix=CHECK-BLOCK %s CHECK-FUNCTION: Found 1 functions: CHECK-FUNCTION: name = "main" From 0063220be959785cabee64541a4b2de47a2da064 Mon Sep 17 00:00:00 2001 From: Igor Wodiany Date: Tue, 28 Oct 2025 10:47:23 +0000 Subject: [PATCH 003/539] [mlir][spirv] Enable validation of decorations target tests (#165229) The Intel Cache Control tests are separated and not validated as `spirv-val` fails with: "ID '7' decorated with CacheControlLoadINTEL multiple times is not allowed". 
However, Intel extension does allow duplicated decoration if cache level in each annotation is different. It seems that `spirv-val` does not currently support it. --- .../decorations-intel-cache-controls.mlir | 42 ++++++++++ mlir/test/Target/SPIRV/decorations.mlir | 77 +++++-------------- 2 files changed, 61 insertions(+), 58 deletions(-) create mode 100644 mlir/test/Target/SPIRV/decorations-intel-cache-controls.mlir diff --git a/mlir/test/Target/SPIRV/decorations-intel-cache-controls.mlir b/mlir/test/Target/SPIRV/decorations-intel-cache-controls.mlir new file mode 100644 index 0000000000000..62d15de5ab03c --- /dev/null +++ b/mlir/test/Target/SPIRV/decorations-intel-cache-controls.mlir @@ -0,0 +1,42 @@ +// RUN: mlir-translate --no-implicit-module --split-input-file --test-spirv-roundtrip --verify-diagnostics %s | FileCheck %s + +// CHECK-LABEL: spirv.module Logical GLSL450 requires #spirv.vce { +spirv.module Logical GLSL450 requires #spirv.vce { + spirv.func @cache_controls() "None" { + // CHECK: spirv.Variable {cache_control_load_intel = [#spirv.cache_control_load_intel, #spirv.cache_control_load_intel, #spirv.cache_control_load_intel]} : !spirv.ptr + %0 = spirv.Variable {cache_control_load_intel = [#spirv.cache_control_load_intel, #spirv.cache_control_load_intel, #spirv.cache_control_load_intel]} : !spirv.ptr + // CHECK: spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel, #spirv.cache_control_store_intel, #spirv.cache_control_store_intel]} : !spirv.ptr + %1 = spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel, #spirv.cache_control_store_intel, #spirv.cache_control_store_intel]} : !spirv.ptr + spirv.Return + } +} + +// ----- + +spirv.module Logical GLSL450 requires #spirv.vce { + spirv.func @cache_controls_invalid_type() "None" { + // expected-error@below {{expecting array attribute of CacheControlLoadINTEL for CacheControlLoadINTEL}} + %0 = spirv.Variable {cache_control_load_intel = 
#spirv.cache_control_load_intel} : !spirv.ptr + spirv.Return + } +} + +// ----- + +spirv.module Logical GLSL450 requires #spirv.vce { + spirv.func @cache_controls_invalid_type() "None" { + // expected-error@below {{expecting array attribute of CacheControlStoreINTEL for CacheControlStoreINTEL}} + %0 = spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel, 0 : i32]} : !spirv.ptr + spirv.Return + } +} + +// ----- + +spirv.module Logical GLSL450 requires #spirv.vce { + spirv.func @cache_controls_invalid_type() "None" { + // expected-error@below {{expecting non-empty array attribute of CacheControlStoreINTEL for CacheControlStoreINTEL}} + %0 = spirv.Variable {cache_control_store_intel = []} : !spirv.ptr + spirv.Return + } +} diff --git a/mlir/test/Target/SPIRV/decorations.mlir b/mlir/test/Target/SPIRV/decorations.mlir index 90ba690e50b73..712fd17623402 100644 --- a/mlir/test/Target/SPIRV/decorations.mlir +++ b/mlir/test/Target/SPIRV/decorations.mlir @@ -1,27 +1,32 @@ -// RUN: mlir-translate -no-implicit-module -split-input-file -test-spirv-roundtrip -verify-diagnostics %s | FileCheck %s +// RUN: mlir-translate --no-implicit-module --split-input-file --test-spirv-roundtrip %s | FileCheck %s -spirv.module Logical GLSL450 requires #spirv.vce { +// RUN: %if spirv-tools %{ rm -rf %t %} +// RUN: %if spirv-tools %{ mkdir %t %} +// RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv --split-input-file --spirv-save-validation-files-with-prefix=%t/module %s %} +// RUN: %if spirv-tools %{ spirv-val %t %} + +spirv.module Logical GLSL450 requires #spirv.vce { // CHECK: location = 0 : i32 spirv.GlobalVariable @var {location = 0 : i32} : !spirv.ptr, Input> } // ----- -spirv.module Logical GLSL450 requires #spirv.vce { +spirv.module Logical GLSL450 requires #spirv.vce { // CHECK: no_perspective spirv.GlobalVariable @var {no_perspective} : !spirv.ptr, Input> } // ----- -spirv.module Logical GLSL450 requires #spirv.vce { +spirv.module 
Logical GLSL450 requires #spirv.vce { // CHECK: flat spirv.GlobalVariable @var {flat} : !spirv.ptr } // ----- -spirv.module Logical GLSL450 requires #spirv.vce { +spirv.module Logical GLSL450 requires #spirv.vce { // CHECK: aliased // CHECK: aliased spirv.GlobalVariable @var1 bind(0, 0) {aliased} : !spirv.ptr[0])>, StorageBuffer> @@ -30,28 +35,28 @@ spirv.module Logical GLSL450 requires #spirv.vce { // ----- -spirv.module Logical GLSL450 requires #spirv.vce { +spirv.module Logical GLSL450 requires #spirv.vce { // CHECK: non_readable spirv.GlobalVariable @var bind(0, 0) {non_readable} : !spirv.ptr[0])>, StorageBuffer> } // ----- -spirv.module Logical GLSL450 requires #spirv.vce { +spirv.module Logical GLSL450 requires #spirv.vce { // CHECK: non_writable spirv.GlobalVariable @var bind(0, 0) {non_writable} : !spirv.ptr[0])>, StorageBuffer> } // ----- -spirv.module Logical GLSL450 requires #spirv.vce { +spirv.module Logical GLSL450 requires #spirv.vce { // CHECK: restrict spirv.GlobalVariable @var bind(0, 0) {restrict} : !spirv.ptr[0])>, StorageBuffer> } // ----- -spirv.module Logical GLSL450 requires #spirv.vce { +spirv.module Logical GLSL450 requires #spirv.vce { // CHECK: relaxed_precision spirv.GlobalVariable @var {location = 0 : i32, relaxed_precision} : !spirv.ptr, Output> } @@ -84,7 +89,7 @@ spirv.module Logical GLSL450 requires #spirv.vce { // ----- -spirv.module Logical GLSL450 requires #spirv.vce { +spirv.module Logical OpenCL requires #spirv.vce { spirv.func @iadd_decorations(%arg: i32) -> i32 "None" { // CHECK: spirv.IAdd %{{.*}}, %{{.*}} {no_signed_wrap, no_unsigned_wrap} %0 = spirv.IAdd %arg, %arg {no_signed_wrap, no_unsigned_wrap} : i32 @@ -94,7 +99,7 @@ spirv.func @iadd_decorations(%arg: i32) -> i32 "None" { // ----- -spirv.module Logical GLSL450 requires #spirv.vce { +spirv.module Logical OpenCL requires #spirv.vce { spirv.func @fadd_decorations(%arg: f32) -> f32 "None" { // CHECK: spirv.FAdd %{{.*}}, %{{.*}} {fp_fast_math_mode = #spirv.fastmath_mode} 
%0 = spirv.FAdd %arg, %arg {fp_fast_math_mode = #spirv.fastmath_mode} : f32 @@ -104,7 +109,7 @@ spirv.func @fadd_decorations(%arg: f32) -> f32 "None" { // ----- -spirv.module Logical GLSL450 requires #spirv.vce { +spirv.module Logical GLSL450 requires #spirv.vce { spirv.func @fmul_decorations(%arg: f32) -> f32 "None" { // CHECK: spirv.FMul %{{.*}}, %{{.*}} {no_contraction} %0 = spirv.FMul %arg, %arg {no_contraction} : f32 @@ -114,7 +119,7 @@ spirv.func @fmul_decorations(%arg: f32) -> f32 "None" { // ----- -spirv.module Logical GLSL450 requires #spirv.vce { +spirv.module Logical OpenCL requires #spirv.vce { spirv.func @fp_rounding_mode(%arg: f32) -> f16 "None" { // CHECK: spirv.FConvert %arg0 {fp_rounding_mode = #spirv.fp_rounding_mode} : f32 to f16 %0 = spirv.FConvert %arg {fp_rounding_mode = #spirv.fp_rounding_mode} : f32 to f16 @@ -124,51 +129,7 @@ spirv.func @fp_rounding_mode(%arg: f32) -> f16 "None" { // ----- -// CHECK-LABEL: spirv.module Logical GLSL450 requires #spirv.vce { - -spirv.module Logical GLSL450 requires #spirv.vce { - spirv.func @cache_controls() "None" { - // CHECK: spirv.Variable {cache_control_load_intel = [#spirv.cache_control_load_intel, #spirv.cache_control_load_intel, #spirv.cache_control_load_intel]} : !spirv.ptr - %0 = spirv.Variable {cache_control_load_intel = [#spirv.cache_control_load_intel, #spirv.cache_control_load_intel, #spirv.cache_control_load_intel]} : !spirv.ptr - // CHECK: spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel, #spirv.cache_control_store_intel, #spirv.cache_control_store_intel]} : !spirv.ptr - %1 = spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel, #spirv.cache_control_store_intel, #spirv.cache_control_store_intel]} : !spirv.ptr - spirv.Return - } -} - -// ----- - -spirv.module Logical GLSL450 requires #spirv.vce { - spirv.func @cache_controls_invalid_type() "None" { - // expected-error@below {{expecting array attribute of CacheControlLoadINTEL for 
CacheControlLoadINTEL}} - %0 = spirv.Variable {cache_control_load_intel = #spirv.cache_control_load_intel} : !spirv.ptr - spirv.Return - } -} - -// ----- - -spirv.module Logical GLSL450 requires #spirv.vce { - spirv.func @cache_controls_invalid_type() "None" { - // expected-error@below {{expecting array attribute of CacheControlStoreINTEL for CacheControlStoreINTEL}} - %0 = spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel, 0 : i32]} : !spirv.ptr - spirv.Return - } -} - -// ----- - -spirv.module Logical GLSL450 requires #spirv.vce { - spirv.func @cache_controls_invalid_type() "None" { - // expected-error@below {{expecting non-empty array attribute of CacheControlStoreINTEL for CacheControlStoreINTEL}} - %0 = spirv.Variable {cache_control_store_intel = []} : !spirv.ptr - spirv.Return - } -} - -// ----- - -spirv.module Logical GLSL450 requires #spirv.vce { +spirv.module Logical GLSL450 requires #spirv.vce { // CHECK: spirv.func @relaxed_precision_arg({{%.*}}: !spirv.ptr {spirv.decoration = #spirv.decoration}) "None" attributes {relaxed_precision} { spirv.func @relaxed_precision_arg(%arg0: !spirv.ptr {spirv.decoration = #spirv.decoration}) -> () "None" attributes {relaxed_precision} { spirv.Return From 4950af594f5a568691443f33b42a6f5568427ff7 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 28 Oct 2025 11:59:36 +0100 Subject: [PATCH 004/539] [InstCombine] Support ptrtoaddr when converting to align assume bundle ptrtoaddr can be treated the same way as ptrtoint here. 
--- .../InstCombine/InstCombineCalls.cpp | 2 +- llvm/test/Transforms/InstCombine/assume.ll | 28 +++++++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 8d9933bfab938..92fca90ddb88a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3496,7 +3496,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { if (isPowerOf2_64(AlignMask + 1)) { uint64_t Offset = 0; match(A, m_Add(m_Value(A), m_ConstantInt(Offset))); - if (match(A, m_PtrToInt(m_Value(A)))) { + if (match(A, m_PtrToIntOrAddr(m_Value(A)))) { /// Note: this doesn't preserve the offset information but merges /// offset and alignment. /// TODO: we can generate a GEP instead of merging the alignment with diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll index 7b0b871513513..cc87d6542fa12 100644 --- a/llvm/test/Transforms/InstCombine/assume.ll +++ b/llvm/test/Transforms/InstCombine/assume.ll @@ -10,8 +10,8 @@ declare void @llvm.assume(i1) #1 ; Check that the assume has not been removed: -define i32 @foo1(ptr %a) #0 { -; DEFAULT-LABEL: @foo1( +define i32 @align_to_bundle(ptr %a) #0 { +; DEFAULT-LABEL: @align_to_bundle( ; DEFAULT-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4 ; DEFAULT-NEXT: [[PTRINT:%.*]] = ptrtoint ptr [[A]] to i64 ; DEFAULT-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 @@ -19,7 +19,7 @@ define i32 @foo1(ptr %a) #0 { ; DEFAULT-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) ; DEFAULT-NEXT: ret i32 [[T0]] ; -; BUNDLES-LABEL: @foo1( +; BUNDLES-LABEL: @align_to_bundle( ; BUNDLES-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4 ; BUNDLES-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 32) ] ; BUNDLES-NEXT: ret i32 [[T0]] @@ -32,6 +32,28 @@ define i32 @foo1(ptr %a) #0 { ret i32 %t0 } +define i32 
@align_to_bundle_ptrtoaddr(ptr %a) #0 { +; DEFAULT-LABEL: @align_to_bundle_ptrtoaddr( +; DEFAULT-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4 +; DEFAULT-NEXT: [[PTRINT:%.*]] = ptrtoaddr ptr [[A]] to i64 +; DEFAULT-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 +; DEFAULT-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; DEFAULT-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; DEFAULT-NEXT: ret i32 [[T0]] +; +; BUNDLES-LABEL: @align_to_bundle_ptrtoaddr( +; BUNDLES-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4 +; BUNDLES-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 32) ] +; BUNDLES-NEXT: ret i32 [[T0]] +; + %t0 = load i32, ptr %a, align 4 + %ptrint = ptrtoaddr ptr %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) + ret i32 %t0 +} + define i32 @align_assume_trunc_cond(ptr %a) #0 { ; DEFAULT-LABEL: @align_assume_trunc_cond( ; DEFAULT-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4 From 1501304607377b3b430ff8dd971ab1e20b9edd78 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 28 Oct 2025 12:22:19 +0100 Subject: [PATCH 005/539] [InstCombine] Support ptrtoaddr of ptrmask fold For now not trying to share the code with ptrtoint, as there's very little code. Also fix IRBuilder::CreatePtrToAddr to actually create a PtrToAddr instruction... 
--- llvm/include/llvm/IR/IRBuilder.h | 2 +- .../InstCombine/InstCombineCasts.cpp | 12 ++++++++++ llvm/test/Transforms/InstCombine/ptrtoaddr.ll | 24 +++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index dacda0afc7f03..972a253344ddf 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -2191,7 +2191,7 @@ class IRBuilderBase { FMFSource); } Value *CreatePtrToAddr(Value *V, const Twine &Name = "") { - return CreateCast(Instruction::PtrToInt, V, + return CreateCast(Instruction::PtrToAddr, V, BB->getDataLayout().getAddressType(V->getType()), Name); } Value *CreatePtrToInt(Value *V, Type *DestTy, diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index f939e7aa78c33..6d704e1dc893b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2228,6 +2228,18 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) { } Instruction *InstCombinerImpl::visitPtrToAddr(PtrToAddrInst &CI) { + Value *SrcOp = CI.getPointerOperand(); + Type *Ty = CI.getType(); + + // (ptrtoaddr (ptrmask P, M)) + // -> (and (ptrtoaddr P), M) + // This is generally beneficial as `and` is better supported than `ptrmask`. + Value *Ptr, *Mask; + if (match(SrcOp, m_OneUse(m_Intrinsic(m_Value(Ptr), + m_Value(Mask)))) && + Mask->getType() == Ty) + return BinaryOperator::CreateAnd(Builder.CreatePtrToAddr(Ptr), Mask); + // FIXME: Implement variants of ptrtoint folds. 
return commonCastTransforms(CI); } diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll index a7434a28c4164..671a16d28f11a 100644 --- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll +++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll @@ -237,3 +237,27 @@ define ptr addrspace(1) @gep_sub_ptrtoaddr_different_obj_addrsize(ptr addrspace( call void @use.i32(i32 %addr) ret ptr addrspace(1) %gep } + +define i64 @ptrtoaddr_of_ptrmask(ptr %p, i64 %mask) { +; CHECK-LABEL: define i64 @ptrtoaddr_of_ptrmask( +; CHECK-SAME: ptr [[P:%.*]], i64 [[MASK:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoaddr ptr [[P]] to i64 +; CHECK-NEXT: [[ADDR:%.*]] = and i64 [[MASK]], [[TMP1]] +; CHECK-NEXT: ret i64 [[ADDR]] +; + %masked = call ptr @llvm.ptrmask(ptr %p, i64 %mask) + %addr = ptrtoaddr ptr %masked to i64 + ret i64 %addr +} + +define i32 @ptrtoaddr_of_ptrmask_addrsize(ptr addrspace(1) %p, i32 %mask) { +; CHECK-LABEL: define i32 @ptrtoaddr_of_ptrmask_addrsize( +; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[MASK:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoaddr ptr addrspace(1) [[P]] to i32 +; CHECK-NEXT: [[ADDR:%.*]] = and i32 [[MASK]], [[TMP1]] +; CHECK-NEXT: ret i32 [[ADDR]] +; + %masked = call ptr addrspace(1) @llvm.ptrmask(ptr addrspace(1) %p, i32 %mask) + %addr = ptrtoaddr ptr addrspace(1) %masked to i32 + ret i32 %addr +} From 69be9f0878d52bc554072891e914f7455a2dbd57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gergely=20B=C3=A1lint?= Date: Tue, 28 Oct 2025 12:43:52 +0100 Subject: [PATCH 006/539] [BOLT] Fix thread-safety of MarkRAStates (#165368) The pass calls setIgnored() on functions in parallel, but setIgnored is not thread safe. This patch adds a std::mutex to guard setIgnored calls. 
Fixes: #165362 --- bolt/include/bolt/Passes/MarkRAStates.h | 5 +++++ bolt/lib/Passes/MarkRAStates.cpp | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/bolt/include/bolt/Passes/MarkRAStates.h b/bolt/include/bolt/Passes/MarkRAStates.h index 675ab9727142b..202f1dda2aad8 100644 --- a/bolt/include/bolt/Passes/MarkRAStates.h +++ b/bolt/include/bolt/Passes/MarkRAStates.h @@ -13,11 +13,16 @@ #define BOLT_PASSES_MARK_RA_STATES #include "bolt/Passes/BinaryPasses.h" +#include namespace llvm { namespace bolt { class MarkRAStates : public BinaryFunctionPass { + // setIgnored() is not thread-safe, but the pass is running on functions in + // parallel. + std::mutex IgnoreMutex; + public: explicit MarkRAStates() : BinaryFunctionPass(false) {} diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp index af6a5ca7e31e5..b262d66732b7d 100644 --- a/bolt/lib/Passes/MarkRAStates.cpp +++ b/bolt/lib/Passes/MarkRAStates.cpp @@ -43,10 +43,11 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) { // Not all functions have .cfi_negate_ra_state in them. But if one does, // we expect psign/pauth instructions to have the hasNegateRAState // annotation. 
- BF.setIgnored(); BC.outs() << "BOLT-INFO: inconsistent RAStates in function " << BF.getPrintName() << ": ptr sign/auth inst without .cfi_negate_ra_state\n"; + std::lock_guard Lock(IgnoreMutex); + BF.setIgnored(); return false; } } @@ -67,6 +68,7 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) { BC.outs() << "BOLT-INFO: inconsistent RAStates in function " << BF.getPrintName() << ": ptr signing inst encountered in Signed RA state\n"; + std::lock_guard Lock(IgnoreMutex); BF.setIgnored(); return false; } @@ -80,6 +82,7 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) { << BF.getPrintName() << ": ptr authenticating inst encountered in Unsigned RA " "state\n"; + std::lock_guard Lock(IgnoreMutex); BF.setIgnored(); return false; } From 646663e622df02a6f87798f4f1a8d59e2092e431 Mon Sep 17 00:00:00 2001 From: ddubov100 <155631080+ddubov100@users.noreply.github.com> Date: Tue, 28 Oct 2025 13:44:33 +0200 Subject: [PATCH 007/539] Added RecursiveMemoryEffects to ExecuteRegionOp (#164390) Added RecursiveMemoryEffects to ExecuteRegionOp to be aligned to other ops with region and get appropriate support in all appropriate passes, which need RecursiveMemoryEffects. The added test in dealloc-memoryeffect-interface.mlir fails with error 'ops with unknown memory side effects are not supported' without RecursiveMemoryEffects. The updated test in one-shot-module-bufferize.mlir gets cleaned by DCE once the interface is added. Added func.call @foo():()->() which has effect to keep execute_region from being removed. 
--------- Co-authored-by: Mehdi Amini --- mlir/include/mlir/Dialect/SCF/IR/SCFOps.td | 2 +- .../dealloc-memoryeffect-interface.mlir | 21 +++++++++++++++++++ .../Transforms/one-shot-module-bufferize.mlir | 20 +++++++++++------- 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td index fadd3fc10bfc4..66174ce0f7928 100644 --- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td +++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td @@ -77,7 +77,7 @@ def ConditionOp : SCF_Op<"condition", [ //===----------------------------------------------------------------------===// def ExecuteRegionOp : SCF_Op<"execute_region", [ - DeclareOpInterfaceMethods]> { + DeclareOpInterfaceMethods, RecursiveMemoryEffects]> { let summary = "operation that executes its region exactly once"; let description = [{ The `scf.execute_region` operation is used to allow multiple blocks within SCF diff --git a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-memoryeffect-interface.mlir b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-memoryeffect-interface.mlir index 40a57b90c6e99..e8bb0c0f2eff6 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-memoryeffect-interface.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-memoryeffect-interface.mlir @@ -156,3 +156,24 @@ func.func @manual_deallocation(%c: i1, %f: f32, %idx: index) -> f32 { // CHECK: cf.assert %[[true]], "expected that the block does not have ownership" // CHECK: memref.dealloc %[[manual_alloc]] // CHECK: bufferization.dealloc (%[[managed_alloc]] : memref<5xf32>) if (%[[true]]) + +// ----- + +// CHECK-LABEL: func.func private @properly_creates_deallocations_in_execute_region( +// CHECK: %[[true:.*]] = arith.constant true +// CHECK: scf.execute_region no_inline { +// CHECK: %[[alloc:.*]] = 
memref.alloc() {alignment = 64 : i64} : memref<1x63x378x16xui8> +// CHECK: bufferization.dealloc (%[[alloc]] : memref<1x63x378x16xui8>) if (%[[true]]) + +func.func private @properly_creates_deallocations_in_execute_region(%arg1: memref<1x16x252x380xui8> ) -> (memref<1x250x378x16xui8> ) { + %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x250x378x16xui8> + scf.execute_region no_inline { + %subview = memref.subview %arg1[0, 0, 0, 0] [1, 16, 65, 380] [1, 1, 1, 1] : memref<1x16x252x380xui8> to memref<1x16x65x380xui8, strided<[1532160, 95760, 380, 1]>> + %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<1x63x378x16xui8> + test.buffer_based in(%subview: memref<1x16x65x380xui8, strided<[1532160, 95760, 380, 1]>>) out(%alloc_3: memref<1x63x378x16xui8>) + %subview_7 = memref.subview %alloc[0, 0, 0, 0] [1, 63, 378, 16] [1, 1, 1, 1] : memref<1x250x378x16xui8> to memref<1x63x378x16xui8, strided<[1512000, 6048, 16, 1]>> + test.copy(%alloc_3, %subview_7) : (memref<1x63x378x16xui8>, memref<1x63x378x16xui8, strided<[1512000, 6048, 16, 1]>>) + scf.yield + } + return %alloc : memref<1x250x378x16xui8> +} diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir index d5f834bce9b83..8db1ebb87a1e5 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir @@ -381,15 +381,19 @@ func.func private @execute_region_test(%t1 : tensor) // ----- // CHECK-LABEL: func @no_inline_execute_region_not_canonicalized -func.func @no_inline_execute_region_not_canonicalized() { - %c = arith.constant 42 : i32 - // CHECK: scf.execute_region - // CHECK-SAME: no_inline - %v = scf.execute_region -> i32 no_inline { - scf.yield %c : i32 +module { + func.func private @foo()->() + func.func @no_inline_execute_region_not_canonicalized() { + %c = arith.constant 42 : i32 + // CHECK: 
scf.execute_region + // CHECK-SAME: no_inline + %v = scf.execute_region -> i32 no_inline { + func.call @foo():()->() + scf.yield %c : i32 + } + // CHECK: return + return } - // CHECK: return - return } // ----- From 1b955366b1e65770661ccf336f0da0cc00f05f11 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 28 Oct 2025 12:44:55 +0100 Subject: [PATCH 008/539] [MLIR] Fix test after ptrtoaddr change b6bbc4b1940006884c49bad7c93b2a949928fe4c fixed IRBuilder::CreatePtrToAddr to produce the correct instruction. Update the test for ptr_diff lowering accordingly. --- mlir/test/Target/LLVMIR/ptr.mlir | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mlir/test/Target/LLVMIR/ptr.mlir b/mlir/test/Target/LLVMIR/ptr.mlir index 473ac0598e9ce..94b6628772634 100644 --- a/mlir/test/Target/LLVMIR/ptr.mlir +++ b/mlir/test/Target/LLVMIR/ptr.mlir @@ -284,8 +284,8 @@ llvm.func @ptr_add_cst() -> !ptr.ptr<#llvm.address_space<0>> { // CHECK-LABEL: define i64 @ptr_diff_scalar // CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) { -// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64 -// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64 +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoaddr ptr %[[PTR1]] to i64 +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoaddr ptr %[[PTR2]] to i64 // CHECK-NEXT: %[[DIFF:.*]] = sub i64 %[[P1INT]], %[[P2INT]] // CHECK-NEXT: ret i64 %[[DIFF]] // CHECK-NEXT: } @@ -296,8 +296,8 @@ llvm.func @ptr_diff_scalar(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr2: !ptr. 
// CHECK-LABEL: define i32 @ptr_diff_scalar_i32 // CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) { -// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64 -// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64 +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoaddr ptr %[[PTR1]] to i64 +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoaddr ptr %[[PTR2]] to i64 // CHECK-NEXT: %[[DIFF:.*]] = sub i64 %[[P1INT]], %[[P2INT]] // CHECK-NEXT: %[[TRUNC:.*]] = trunc i64 %[[DIFF]] to i32 // CHECK-NEXT: ret i32 %[[TRUNC]] @@ -309,8 +309,8 @@ llvm.func @ptr_diff_scalar_i32(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr2: ! // CHECK-LABEL: define <4 x i64> @ptr_diff_vector // CHECK-SAME: (<4 x ptr> %[[PTRS1:.*]], <4 x ptr> %[[PTRS2:.*]]) { -// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint <4 x ptr> %[[PTRS1]] to <4 x i64> -// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint <4 x ptr> %[[PTRS2]] to <4 x i64> +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoaddr <4 x ptr> %[[PTRS1]] to <4 x i64> +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoaddr <4 x ptr> %[[PTRS2]] to <4 x i64> // CHECK-NEXT: %[[DIFF:.*]] = sub <4 x i64> %[[P1INT]], %[[P2INT]] // CHECK-NEXT: ret <4 x i64> %[[DIFF]] // CHECK-NEXT: } @@ -321,8 +321,8 @@ llvm.func @ptr_diff_vector(%ptrs1: vector<4x!ptr.ptr<#llvm.address_space<0>>>, % // CHECK-LABEL: define <8 x i32> @ptr_diff_vector_i32 // CHECK-SAME: (<8 x ptr> %[[PTRS1:.*]], <8 x ptr> %[[PTRS2:.*]]) { -// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint <8 x ptr> %[[PTRS1]] to <8 x i64> -// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint <8 x ptr> %[[PTRS2]] to <8 x i64> +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoaddr <8 x ptr> %[[PTRS1]] to <8 x i64> +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoaddr <8 x ptr> %[[PTRS2]] to <8 x i64> // CHECK-NEXT: %[[DIFF:.*]] = sub <8 x i64> %[[P1INT]], %[[P2INT]] // CHECK-NEXT: %[[TRUNC:.*]] = trunc <8 x i64> %[[DIFF]] to <8 x i32> // CHECK-NEXT: ret <8 x i32> %[[TRUNC]] @@ -344,8 +344,8 @@ llvm.func @ptr_diff_with_constants() -> i64 { // CHECK-LABEL: define i64 @ptr_diff_with_flags_nsw // CHECK-SAME: (ptr 
%[[PTR1:.*]], ptr %[[PTR2:.*]]) { -// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64 -// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64 +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoaddr ptr %[[PTR1]] to i64 +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoaddr ptr %[[PTR2]] to i64 // CHECK-NEXT: %[[DIFF:.*]] = sub nsw i64 %[[P1INT]], %[[P2INT]] // CHECK-NEXT: ret i64 %[[DIFF]] // CHECK-NEXT: } @@ -356,8 +356,8 @@ llvm.func @ptr_diff_with_flags_nsw(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr // CHECK-LABEL: define i64 @ptr_diff_with_flags_nuw // CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) { -// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64 -// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64 +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoaddr ptr %[[PTR1]] to i64 +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoaddr ptr %[[PTR2]] to i64 // CHECK-NEXT: %[[DIFF:.*]] = sub nuw i64 %[[P1INT]], %[[P2INT]] // CHECK-NEXT: ret i64 %[[DIFF]] // CHECK-NEXT: } @@ -368,8 +368,8 @@ llvm.func @ptr_diff_with_flags_nuw(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr // CHECK-LABEL: define i64 @ptr_diff_with_flags_nsw_nuw // CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) { -// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64 -// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64 +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoaddr ptr %[[PTR1]] to i64 +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoaddr ptr %[[PTR2]] to i64 // CHECK-NEXT: %[[DIFF:.*]] = sub nuw nsw i64 %[[P1INT]], %[[P2INT]] // CHECK-NEXT: ret i64 %[[DIFF]] // CHECK-NEXT: } From 2666a409e97ac7d304adc6f84ad5eb7a376f9764 Mon Sep 17 00:00:00 2001 From: Dan Blackwell Date: Tue, 28 Oct 2025 11:49:23 +0000 Subject: [PATCH 009/539] [TSan] Fix warning when compiling with -Wmissing-designated-field-initializers (#163401) Currently we receive a warning when initializing a ThreadEventCallbacks when compiling with this flag: ``` llvm-project/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp:252:3: warning: missing field 'start' 
initializer [-Wmissing-designated-field-initializers] 252 | }; | ^ ``` This patch explicitly initializes the missing fields to null, fixing the warning. rdar://162074310 --- compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp index 62ab0554df08e..7fa5e017d3985 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp @@ -259,7 +259,9 @@ void InitializePlatform() { ThreadEventCallbacks callbacks = { .create = ThreadCreateCallback, + .start = nullptr, .terminate = ThreadTerminateCallback, + .destroy = nullptr, }; InstallPthreadIntrospectionHook(callbacks); #endif From ffc92372813a64833ee81b019c0dc30f11cee84b Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 28 Oct 2025 12:42:50 +0100 Subject: [PATCH 010/539] [InstCombine] Support ptrtoaddr of gep fold This fold can be directly reused for ptrtoaddr. One caveat is that for an inttoptr base, it currently won't work for pointers with non-address bits. It's possible to support this case. --- .../InstCombine/InstCombineCasts.cpp | 7 ++- .../InstCombine/InstCombineInternal.h | 2 +- llvm/test/Transforms/InstCombine/ptrtoaddr.ll | 48 +++++++++++++++++++ 3 files changed, 54 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 6d704e1dc893b..614c6ebd63be6 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2148,7 +2148,7 @@ Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) { return nullptr; } -Value *InstCombinerImpl::foldPtrToIntOfGEP(Type *IntTy, Value *Ptr) { +Value *InstCombinerImpl::foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr) { // Look through chain of one-use GEPs. 
Type *PtrTy = Ptr->getType(); SmallVector GEPs; @@ -2210,7 +2210,7 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) { Mask->getType() == Ty) return BinaryOperator::CreateAnd(Builder.CreatePtrToInt(Ptr, Ty), Mask); - if (Value *V = foldPtrToIntOfGEP(Ty, SrcOp)) + if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp)) return replaceInstUsesWith(CI, V); Value *Vec, *Scalar, *Index; @@ -2240,6 +2240,9 @@ Instruction *InstCombinerImpl::visitPtrToAddr(PtrToAddrInst &CI) { Mask->getType() == Ty) return BinaryOperator::CreateAnd(Builder.CreatePtrToAddr(Ptr), Mask); + if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp)) + return replaceInstUsesWith(CI, V); + // FIXME: Implement variants of ptrtoint folds. return commonCastTransforms(CI); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 9c75d9a6711b9..d85e4f7590197 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -700,7 +700,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final /// folded operation. 
void PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN); - Value *foldPtrToIntOfGEP(Type *IntTy, Value *Ptr); + Value *foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr); Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, CmpPredicate Cond, Instruction &I); Instruction *foldSelectICmp(CmpPredicate Pred, SelectInst *SI, Value *RHS, diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll index 671a16d28f11a..adf3aa12623b9 100644 --- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll +++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll @@ -261,3 +261,51 @@ define i32 @ptrtoaddr_of_ptrmask_addrsize(ptr addrspace(1) %p, i32 %mask) { %addr = ptrtoaddr ptr addrspace(1) %masked to i32 ret i32 %addr } + +define i64 @ptrtoaddr_of_gep_of_inttoptr(i64 %int, i64 %offset) { +; CHECK-LABEL: define i64 @ptrtoaddr_of_gep_of_inttoptr( +; CHECK-SAME: i64 [[INT:%.*]], i64 [[OFFSET:%.*]]) { +; CHECK-NEXT: [[ADDR:%.*]] = add i64 [[INT]], [[OFFSET]] +; CHECK-NEXT: ret i64 [[ADDR]] +; + %ptr = inttoptr i64 %int to ptr + %gep = getelementptr i8, ptr %ptr, i64 %offset + %addr = ptrtoaddr ptr %gep to i64 + ret i64 %addr +} + +; FIXME: This could be supported by truncating %int before performing the +; arithmetic. 
+define i32 @ptrtoaddr_of_gep_of_inttoptr_addrsize(i64 %int, i32 %offset) { +; CHECK-LABEL: define i32 @ptrtoaddr_of_gep_of_inttoptr_addrsize( +; CHECK-SAME: i64 [[INT:%.*]], i32 [[OFFSET:%.*]]) { +; CHECK-NEXT: [[PTR:%.*]] = inttoptr i64 [[INT]] to ptr addrspace(1) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PTR]], i32 [[OFFSET]] +; CHECK-NEXT: [[ADDR:%.*]] = ptrtoaddr ptr addrspace(1) [[GEP]] to i32 +; CHECK-NEXT: ret i32 [[ADDR]] +; + %ptr = inttoptr i64 %int to ptr addrspace(1) + %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset + %addr = ptrtoaddr ptr addrspace(1) %gep to i32 + ret i32 %addr +} + +define i64 @ptrtoaddr_of_gep_of_null(i64 %offset) { +; CHECK-LABEL: define i64 @ptrtoaddr_of_gep_of_null( +; CHECK-SAME: i64 [[OFFSET:%.*]]) { +; CHECK-NEXT: ret i64 [[OFFSET]] +; + %gep = getelementptr i8, ptr null, i64 %offset + %addr = ptrtoaddr ptr %gep to i64 + ret i64 %addr +} + +define i32 @ptrtoaddr_of_gep_of_null_addrsize(i32 %offset) { +; CHECK-LABEL: define i32 @ptrtoaddr_of_gep_of_null_addrsize( +; CHECK-SAME: i32 [[OFFSET:%.*]]) { +; CHECK-NEXT: ret i32 [[OFFSET]] +; + %gep = getelementptr i8, ptr addrspace(1) null, i32 %offset + %addr = ptrtoaddr ptr addrspace(1) %gep to i32 + ret i32 %addr +} From 756fc6d2454986b3b90b479378539428b665609f Mon Sep 17 00:00:00 2001 From: tcottin Date: Tue, 28 Oct 2025 13:02:31 +0100 Subject: [PATCH 011/539] [clangd] Fix regression regarding new line handling for hover/signature help content (#162029) Fix clangd/clangd#2513 This regression was introduced with #140498. The issue is that with #140498 the extraction of the documentation comment changed from line based to paragraph based. This also removed some required line breaks inside paragraphs, which used to be added before the change. This PR adds the missing line breaks again. 
--- clang-tools-extra/clangd/support/Markup.cpp | 190 +++++++++++------- clang-tools-extra/clangd/support/Markup.h | 81 +++++++- .../clangd/unittests/HoverTests.cpp | 46 +++-- .../unittests/SymbolDocumentationTests.cpp | 8 +- .../clangd/unittests/support/MarkupTests.cpp | 106 +++++++++- 5 files changed, 330 insertions(+), 101 deletions(-) diff --git a/clang-tools-extra/clangd/support/Markup.cpp b/clang-tools-extra/clangd/support/Markup.cpp index 304917de252bf..9ba993a04709c 100644 --- a/clang-tools-extra/clangd/support/Markup.cpp +++ b/clang-tools-extra/clangd/support/Markup.cpp @@ -475,31 +475,61 @@ std::string Block::asPlainText() const { return llvm::StringRef(OS.str()).trim().str(); } +void Paragraph::renderNewlinesMarkdown(llvm::raw_ostream &OS, + llvm::StringRef ParagraphText) const { + llvm::StringRef Line, Rest; + + for (std::tie(Line, Rest) = ParagraphText.ltrim("\n").rtrim().split('\n'); + !(Line.empty() && Rest.empty()); + std::tie(Line, Rest) = Rest.split('\n')) { + + if (Line.empty()) { + // Blank lines are preserved in markdown. + OS << '\n'; + continue; + } + + OS << Line; + + if (!Rest.empty() && isHardLineBreakAfter(Line, Rest, /*IsMarkdown=*/true)) + // In markdown, 2 spaces before a line break forces a line break. 
+ OS << " "; + OS << '\n'; + } +} + void Paragraph::renderEscapedMarkdown(llvm::raw_ostream &OS) const { bool NeedsSpace = false; bool HasChunks = false; + std::string ParagraphText; + ParagraphText.reserve(EstimatedStringSize); + llvm::raw_string_ostream ParagraphTextOS(ParagraphText); for (auto &C : Chunks) { if (C.SpaceBefore || NeedsSpace) - OS << " "; + ParagraphTextOS << " "; switch (C.Kind) { case ChunkKind::PlainText: - OS << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/true); + ParagraphTextOS << renderText(C.Contents, !HasChunks, + /*EscapeMarkdown=*/true); break; case ChunkKind::InlineCode: - OS << renderInlineBlock(C.Contents); + ParagraphTextOS << renderInlineBlock(C.Contents); break; case ChunkKind::Bold: - OS << renderText("**" + C.Contents + "**", !HasChunks, - /*EscapeMarkdown=*/true); + ParagraphTextOS << renderText("**" + C.Contents + "**", !HasChunks, + /*EscapeMarkdown=*/true); break; case ChunkKind::Emphasized: - OS << renderText("*" + C.Contents + "*", !HasChunks, - /*EscapeMarkdown=*/true); + ParagraphTextOS << renderText("*" + C.Contents + "*", !HasChunks, + /*EscapeMarkdown=*/true); break; } HasChunks = true; NeedsSpace = C.SpaceAfter; } + + renderNewlinesMarkdown(OS, ParagraphText); + // A paragraph in markdown is separated by a blank line. 
OS << "\n\n"; } @@ -507,28 +537,39 @@ void Paragraph::renderEscapedMarkdown(llvm::raw_ostream &OS) const { void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const { bool NeedsSpace = false; bool HasChunks = false; + std::string ParagraphText; + ParagraphText.reserve(EstimatedStringSize); + llvm::raw_string_ostream ParagraphTextOS(ParagraphText); for (auto &C : Chunks) { if (C.SpaceBefore || NeedsSpace) - OS << " "; + ParagraphTextOS << " "; switch (C.Kind) { case ChunkKind::PlainText: - OS << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/false); + ParagraphTextOS << renderText(C.Contents, !HasChunks, + /*EscapeMarkdown=*/false); break; case ChunkKind::InlineCode: - OS << renderInlineBlock(C.Contents); + ParagraphTextOS << renderInlineBlock(C.Contents); break; case ChunkKind::Bold: - OS << "**" << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/false) - << "**"; + ParagraphTextOS << "**" + << renderText(C.Contents, !HasChunks, + /*EscapeMarkdown=*/false) + << "**"; break; case ChunkKind::Emphasized: - OS << "*" << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/false) - << "*"; + ParagraphTextOS << "*" + << renderText(C.Contents, !HasChunks, + /*EscapeMarkdown=*/false) + << "*"; break; } HasChunks = true; NeedsSpace = C.SpaceAfter; } + + renderNewlinesMarkdown(OS, ParagraphText); + // A paragraph in markdown is separated by a blank line. OS << "\n\n"; } @@ -537,8 +578,6 @@ std::unique_ptr Paragraph::clone() const { return std::make_unique(*this); } -/// Choose a marker to delimit `Text` from a prioritized list of options. -/// This is more readable than escaping for plain-text. llvm::StringRef Paragraph::chooseMarker(llvm::ArrayRef Options, llvm::StringRef Text) const { // Prefer a delimiter whose characters don't appear in the text. 
@@ -548,23 +587,36 @@ llvm::StringRef Paragraph::chooseMarker(llvm::ArrayRef Options, return Options.front(); } -bool Paragraph::punctuationIndicatesLineBreak(llvm::StringRef Line) const { +bool Paragraph::punctuationIndicatesLineBreak(llvm::StringRef Line, + bool IsMarkdown) const { constexpr llvm::StringLiteral Punctuation = R"txt(.:,;!?)txt"; + if (!IsMarkdown && Line.ends_with(" ")) + return true; + Line = Line.rtrim(); return !Line.empty() && Punctuation.contains(Line.back()); } -bool Paragraph::isHardLineBreakIndicator(llvm::StringRef Rest) const { +bool Paragraph::isHardLineBreakIndicator(llvm::StringRef Rest, + bool IsMarkdown) const { + // Plaintext indicators: // '-'/'*' md list, '@'/'\' documentation command, '>' md blockquote, - // '#' headings, '`' code blocks, two spaces (markdown force newline) - constexpr llvm::StringLiteral LinebreakIndicators = R"txt(-*@\>#`)txt"; + // '#' headings, '`' code blocks + constexpr llvm::StringLiteral LinebreakIndicatorsPlainText = + R"txt(-*@\>#`)txt"; + // Markdown indicators: + // Only '@' and '\' documentation commands/escaped markdown syntax. + constexpr llvm::StringLiteral LinebreakIndicatorsMarkdown = R"txt(@\)txt"; Rest = Rest.ltrim(" \t"); if (Rest.empty()) return false; - if (LinebreakIndicators.contains(Rest.front())) + if (IsMarkdown) + return LinebreakIndicatorsMarkdown.contains(Rest.front()); + + if (LinebreakIndicatorsPlainText.contains(Rest.front())) return true; if (llvm::isDigit(Rest.front())) { @@ -575,64 +627,18 @@ bool Paragraph::isHardLineBreakIndicator(llvm::StringRef Rest) const { return false; } -bool Paragraph::isHardLineBreakAfter(llvm::StringRef Line, - llvm::StringRef Rest) const { - // In Markdown, 2 spaces before a line break forces a line break. - // Add a line break for plaintext in this case too. +bool Paragraph::isHardLineBreakAfter(llvm::StringRef Line, llvm::StringRef Rest, + bool IsMarkdown) const { // Should we also consider whether Line is short? 
- return Line.ends_with(" ") || punctuationIndicatesLineBreak(Line) || - isHardLineBreakIndicator(Rest); + return punctuationIndicatesLineBreak(Line, IsMarkdown) || + isHardLineBreakIndicator(Rest, IsMarkdown); } -void Paragraph::renderPlainText(llvm::raw_ostream &OS) const { - bool NeedsSpace = false; - std::string ConcatenatedText; - ConcatenatedText.reserve(EstimatedStringSize); - - llvm::raw_string_ostream ConcatenatedOS(ConcatenatedText); - - for (auto &C : Chunks) { - - if (C.Kind == ChunkKind::PlainText) { - if (C.SpaceBefore || NeedsSpace) - ConcatenatedOS << ' '; - - ConcatenatedOS << C.Contents; - NeedsSpace = llvm::isSpace(C.Contents.back()) || C.SpaceAfter; - continue; - } - - if (C.SpaceBefore || NeedsSpace) - ConcatenatedOS << ' '; - llvm::StringRef Marker = ""; - if (C.Preserve && C.Kind == ChunkKind::InlineCode) - Marker = chooseMarker({"`", "'", "\""}, C.Contents); - else if (C.Kind == ChunkKind::Bold) - Marker = "**"; - else if (C.Kind == ChunkKind::Emphasized) - Marker = "*"; - ConcatenatedOS << Marker << C.Contents << Marker; - NeedsSpace = C.SpaceAfter; - } - - // We go through the contents line by line to handle the newlines - // and required spacing correctly. - // - // Newlines are added if: - // - the line ends with 2 spaces and a newline follows - // - the line ends with punctuation that indicates a line break (.:,;!?) - // - the next line starts with a hard line break indicator (-@>#`, or a digit - // followed by '.' or ')'), ignoring leading whitespace. - // - // Otherwise, newlines in the input are replaced with a single space. - // - // Multiple spaces are collapsed into a single space. - // - // Lines containing only whitespace are ignored. 
+void Paragraph::renderNewlinesPlaintext(llvm::raw_ostream &OS, + llvm::StringRef ParagraphText) const { llvm::StringRef Line, Rest; - for (std::tie(Line, Rest) = - llvm::StringRef(ConcatenatedText).trim().split('\n'); + for (std::tie(Line, Rest) = ParagraphText.trim().split('\n'); !(Line.empty() && Rest.empty()); std::tie(Line, Rest) = Rest.split('\n')) { @@ -653,7 +659,7 @@ void Paragraph::renderPlainText(llvm::raw_ostream &OS) const { OS << canonicalizeSpaces(Line); - if (isHardLineBreakAfter(Line, Rest)) + if (isHardLineBreakAfter(Line, Rest, /*IsMarkdown=*/false)) OS << '\n'; else if (!Rest.empty()) // Since we removed any trailing whitespace from the input using trim(), @@ -661,6 +667,40 @@ void Paragraph::renderPlainText(llvm::raw_ostream &OS) const { // Therefore, we can add a space without worrying about trailing spaces. OS << ' '; } +} + +void Paragraph::renderPlainText(llvm::raw_ostream &OS) const { + bool NeedsSpace = false; + std::string ParagraphText; + ParagraphText.reserve(EstimatedStringSize); + + llvm::raw_string_ostream ParagraphTextOS(ParagraphText); + + for (auto &C : Chunks) { + + if (C.Kind == ChunkKind::PlainText) { + if (C.SpaceBefore || NeedsSpace) + ParagraphTextOS << ' '; + + ParagraphTextOS << C.Contents; + NeedsSpace = llvm::isSpace(C.Contents.back()) || C.SpaceAfter; + continue; + } + + if (C.SpaceBefore || NeedsSpace) + ParagraphTextOS << ' '; + llvm::StringRef Marker = ""; + if (C.Preserve && C.Kind == ChunkKind::InlineCode) + Marker = chooseMarker({"`", "'", "\""}, C.Contents); + else if (C.Kind == ChunkKind::Bold) + Marker = "**"; + else if (C.Kind == ChunkKind::Emphasized) + Marker = "*"; + ParagraphTextOS << Marker << C.Contents << Marker; + NeedsSpace = C.SpaceAfter; + } + + renderNewlinesPlaintext(OS, ParagraphText); // Paragraphs are separated by a blank line. 
OS << "\n\n"; diff --git a/clang-tools-extra/clangd/support/Markup.h b/clang-tools-extra/clangd/support/Markup.h index eea6328f69a12..219a7dad1e175 100644 --- a/clang-tools-extra/clangd/support/Markup.h +++ b/clang-tools-extra/clangd/support/Markup.h @@ -92,9 +92,84 @@ class Paragraph : public Block { llvm::StringRef chooseMarker(llvm::ArrayRef Options, llvm::StringRef Text) const; - bool punctuationIndicatesLineBreak(llvm::StringRef Line) const; - bool isHardLineBreakIndicator(llvm::StringRef Rest) const; - bool isHardLineBreakAfter(llvm::StringRef Line, llvm::StringRef Rest) const; + + /// Checks if the given line ends with punctuation that indicates a line break + /// (.:,;!?). + /// + /// If \p IsMarkdown is false, lines ending with 2 spaces are also considered + /// as indicating a line break. This is not needed for markdown because the + /// client renderer will handle this case. + bool punctuationIndicatesLineBreak(llvm::StringRef Line, + bool IsMarkdown) const; + + /// Checks if the given line starts with a hard line break indicator. + /// + /// If \p IsMarkdown is true, only '@' and '\' are considered as indicators. + /// Otherwise, '-', '*', '@', '\', '>', '#', '`' and a digit followed by '.' + /// or ')' are also considered as indicators. + bool isHardLineBreakIndicator(llvm::StringRef Rest, bool IsMarkdown) const; + + /// Checks if a hard line break should be added after the given line. + bool isHardLineBreakAfter(llvm::StringRef Line, llvm::StringRef Rest, + bool IsMarkdown) const; + + /// \brief Go through the contents line by line to handle the newlines + /// and required spacing correctly for markdown rendering. + /// + /// Newlines are added if: + /// - the line ends with punctuation that indicates a line break (.:,;!?) 
+ /// - the next line starts with a hard line break indicator \\ (escaped + /// markdown/doxygen command) or @ (doxygen command) + /// + /// This newline handling is only used when the client requests markdown + /// for hover/signature help content. + /// Markdown does not add any newlines inside paragraphs unless the user + /// explicitly adds them. For hover/signature help content, we still want to + /// add newlines in some cases to improve readability, especially when doxygen + /// parsing is disabled or not implemented (like for signature help). + /// Therefore we add newlines in the above mentioned cases. + /// + /// In addition to that, we need to consider that the user can configure + /// clangd to treat documentation comments as plain text, while the client + /// requests markdown. + /// In this case, all markdown syntax is escaped and will + /// not be rendered as expected by markdown. + /// Examples are lists starting with '-' or headings starting with '#'. + /// With the above next line heuristics, these cases are also covered by the + /// '\\' new line indicator. + /// + /// FIXME: The heuristic fails e.g. for lists starting with '*' because it is + /// also used for emphasis in markdown and should not be treated as a newline. + /// + /// \param OS The stream to render to. + /// \param ParagraphText The text of the paragraph to render. + void renderNewlinesMarkdown(llvm::raw_ostream &OS, + llvm::StringRef ParagraphText) const; + + /// \brief Go through the contents line by line to handle the newlines + /// and required spacing correctly for plain text rendering. + /// + /// Newlines are added if: + /// - the line ends with 2 spaces and a newline follows + /// - the line ends with punctuation that indicates a line break (.:,;!?) + /// - the next line starts with a hard line break indicator (-@>#`\\ or a + /// digit followed by '.' or ')'), ignoring leading whitespace. + /// + /// Otherwise, newlines in the input are replaced with a single space. 
+ /// + /// Multiple spaces are collapsed into a single space. + /// + /// Lines containing only whitespace are ignored. + /// + /// This newline handling is only used when the client requests plain + /// text for hover/signature help content. + /// Therefore with this approach we mimic the behavior of markdown rendering + /// for these clients. + /// + /// \param OS The stream to render to. + /// \param ParagraphText The text of the paragraph to render. + void renderNewlinesPlaintext(llvm::raw_ostream &OS, + llvm::StringRef ParagraphText) const; }; /// Represents a sequence of one or more documents. Knows how to print them in a diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp index 718c1bc5f355a..eb858ff616e90 100644 --- a/clang-tools-extra/clangd/unittests/HoverTests.cpp +++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp @@ -4087,16 +4087,16 @@ As well as warnings @brief brief doc -longer doc +longer doc @note this is a note -As you see, notes are "inlined". +As you see, notes are "inlined". @warning this is a warning -As well as warnings -@param a this is a param -@return it returns something -@retval 0 if successful +As well as warnings +@param a this is a param +@return it returns something +@retval 0 if successful @retval 1 if failed --- @@ -4166,9 +4166,9 @@ As well as warnings)"}, @brief brief doc -longer doc -@param a this is a param -@param b does not exist +longer doc +@param a this is a param +@param b does not exist @return it returns something --- @@ -4315,19 +4315,19 @@ TEST(Hover, ParseDocumentation) { }, { "foo.\nbar", - "foo.\nbar", - "foo.\nbar", + "foo. \nbar", + "foo. \nbar", "foo.\nbar", }, { "foo. \nbar", - "foo. \nbar", - "foo. \nbar", + "foo. \nbar", + "foo. 
\nbar", "foo.\nbar", }, { "foo\n*bar", - "foo\n\\*bar", + "foo \n\\*bar", "foo\n*bar", "foo\n*bar", }, @@ -4354,6 +4354,24 @@ TEST(Hover, ParseDocumentation) { "\\`not\nparsed\\`", "`not\nparsed`", "`not parsed`", + }, + { + R"(@brief this is a typical use case +@param x this is x +\param y this is y +@return something)", + R"(@brief this is a typical use case +@param x this is x +\\param y this is y +@return something)", + R"(@brief this is a typical use case +@param x this is x +\param y this is y +@return something)", + R"(@brief this is a typical use case +@param x this is x +\param y this is y +@return something)", }}; for (const auto &C : Cases) { diff --git a/clang-tools-extra/clangd/unittests/SymbolDocumentationTests.cpp b/clang-tools-extra/clangd/unittests/SymbolDocumentationTests.cpp index b3185cc10dd5a..676f7dfc74483 100644 --- a/clang-tools-extra/clangd/unittests/SymbolDocumentationTests.cpp +++ b/clang-tools-extra/clangd/unittests/SymbolDocumentationTests.cpp @@ -195,10 +195,10 @@ More description documentation)", normal textthis is an italic text this is a code block)", R"(\this is a bold text\ -normal text\this is an italic text\ +normal text\this is an italic text\ \this is a code block\)", R"(\this is a bold text\ -normal text\this is an italic text\ +normal text\this is an italic text\ \this is a code block\)", "this is a bold text normal textthis is an italic text " "this is a code block", @@ -712,10 +712,10 @@ TEST(SymbolDocumentation, MarkdownCodeSpans) { line \c span`)", R"(\`multi -line +line \\c span\`)", R"(`multi -line +line \c span`)", R"(`multi line \c span`)"}, diff --git a/clang-tools-extra/clangd/unittests/support/MarkupTests.cpp b/clang-tools-extra/clangd/unittests/support/MarkupTests.cpp index 5f91f31557176..af4782c07ae52 100644 --- a/clang-tools-extra/clangd/unittests/support/MarkupTests.cpp +++ b/clang-tools-extra/clangd/unittests/support/MarkupTests.cpp @@ -304,9 +304,9 @@ TEST(Paragraph, SeparationOfChunks) { 
P.appendSpace().appendCode("code").appendText(".\n newline"); EXPECT_EQ(P.asEscapedMarkdown(), - "after `foobar` bat`no` `space` text `code`.\n newline"); + "after `foobar` bat`no` `space` text `code`. \n newline"); EXPECT_EQ(P.asMarkdown(), - "after `foobar` bat`no` `space` text `code`.\n newline"); + "after `foobar` bat`no` `space` text `code`. \n newline"); EXPECT_EQ(P.asPlainText(), "after foobar batno space text code.\nnewline"); } @@ -371,21 +371,117 @@ TEST(Paragraph, SeparationOfChunks3) { EXPECT_EQ(P.asPlainText(), "after\nfoobar"); P.appendText("- bat\n"); - EXPECT_EQ(P.asEscapedMarkdown(), "after \n foobar\n\\- bat"); + EXPECT_EQ(P.asEscapedMarkdown(), "after \n foobar \n\\- bat"); EXPECT_EQ(P.asMarkdown(), "after \n foobar\n- bat"); EXPECT_EQ(P.asPlainText(), "after\nfoobar\n- bat"); P.appendText("- baz"); - EXPECT_EQ(P.asEscapedMarkdown(), "after \n foobar\n\\- bat\n\\- baz"); + EXPECT_EQ(P.asEscapedMarkdown(), "after \n foobar \n\\- bat \n\\- baz"); EXPECT_EQ(P.asMarkdown(), "after \n foobar\n- bat\n- baz"); EXPECT_EQ(P.asPlainText(), "after\nfoobar\n- bat\n- baz"); P.appendText(" faz "); - EXPECT_EQ(P.asEscapedMarkdown(), "after \n foobar\n\\- bat\n\\- baz faz"); + EXPECT_EQ(P.asEscapedMarkdown(), + "after \n foobar \n\\- bat \n\\- baz faz"); EXPECT_EQ(P.asMarkdown(), "after \n foobar\n- bat\n- baz faz"); EXPECT_EQ(P.asPlainText(), "after\nfoobar\n- bat\n- baz faz"); } +TEST(Paragraph, PunctuationLineBreaks) { + + struct { + std::string Text; + std::string EscapedMarkdown; + std::string Markdown; + std::string PlainText; + } Cases[] = { + {"Line ending with dot.\nForces a visual linebreak.", + "Line ending with dot. \nForces a visual linebreak.", + "Line ending with dot. 
\nForces a visual linebreak.", + "Line ending with dot.\nForces a visual linebreak."}, + {"Line ending with colon:\nForces a visual linebreak.", + "Line ending with colon: \nForces a visual linebreak.", + "Line ending with colon: \nForces a visual linebreak.", + "Line ending with colon:\nForces a visual linebreak."}, + {"Line ending with semicolon:\nForces a visual linebreak.", + "Line ending with semicolon: \nForces a visual linebreak.", + "Line ending with semicolon: \nForces a visual linebreak.", + "Line ending with semicolon:\nForces a visual linebreak."}, + {"Line ending with comma,\nForces a visual linebreak.", + "Line ending with comma, \nForces a visual linebreak.", + "Line ending with comma, \nForces a visual linebreak.", + "Line ending with comma,\nForces a visual linebreak."}, + {"Line ending with exclamation mark!\nForces a visual linebreak.", + "Line ending with exclamation mark! \nForces a visual linebreak.", + "Line ending with exclamation mark! \nForces a visual linebreak.", + "Line ending with exclamation mark!\nForces a visual linebreak."}, + {"Line ending with question mark?\nForces a visual linebreak.", + "Line ending with question mark? \nForces a visual linebreak.", + "Line ending with question mark? 
\nForces a visual linebreak.", + "Line ending with question mark?\nForces a visual linebreak."}, + }; + + for (const auto &C : Cases) { + Paragraph P; + P.appendText(C.Text); + EXPECT_EQ(P.asEscapedMarkdown(), C.EscapedMarkdown); + EXPECT_EQ(P.asMarkdown(), C.Markdown); + EXPECT_EQ(P.asPlainText(), C.PlainText); + } +} + +TEST(Paragraph, LineBreakIndicators) { + + struct { + std::string Text; + std::string EscapedMarkdown; + std::string Markdown; + std::string PlainText; + } Cases[] = { + {"Visual linebreak for\n- list items\n- and so on", + "Visual linebreak for \n\\- list items \n\\- and so on", + "Visual linebreak for\n- list items\n- and so on", + "Visual linebreak for\n- list items\n- and so on"}, + {"Visual linebreak for\n* list items\n* and so on", + "Visual linebreak for \n\\* list items \n\\* and so on", + "Visual linebreak for\n* list items\n* and so on", + "Visual linebreak for\n* list items\n* and so on"}, + {"Visual linebreak for\n@command any doxygen command\n\\other other " + "doxygen command", + "Visual linebreak for \n@command any doxygen command \n\\\\other " + "other doxygen command", + "Visual linebreak for \n@command any doxygen command \n\\other other " + "doxygen command", + "Visual linebreak for\n@command any doxygen command\n\\other other " + "doxygen command"}, + {"Visual linebreak for\n>blockquoute line 1\n> blockquoute line 2", + "Visual linebreak for \n\\>blockquoute line 1 \n\\> blockquoute line " + "2", + "Visual linebreak for\n>blockquoute line 1\n> blockquoute line 2", + "Visual linebreak for\n>blockquoute line 1\n> blockquoute line 2"}, + {"Visual linebreak for\n# Heading 1\ntext under heading\n## Heading " + "2\ntext under heading 2", + "Visual linebreak for \n\\# Heading 1\ntext under heading \n\\## " + "Heading 2\ntext under heading 2", + "Visual linebreak for\n# Heading 1\ntext under heading\n## Heading " + "2\ntext under heading 2", + "Visual linebreak for\n# Heading 1 text under heading\n## Heading 2 " + "text under heading 
2"}, + {"Visual linebreak for\n`inline code`", + "Visual linebreak for \n\\`inline code\\`", + "Visual linebreak for\n`inline code`", + "Visual linebreak for\n`inline code`"}, + }; + + for (const auto &C : Cases) { + Paragraph P; + P.appendText(C.Text); + EXPECT_EQ(P.asEscapedMarkdown(), C.EscapedMarkdown); + EXPECT_EQ(P.asMarkdown(), C.Markdown); + EXPECT_EQ(P.asPlainText(), C.PlainText); + } +} + TEST(Paragraph, ExtraSpaces) { // Make sure spaces inside chunks are preserved for markdown // and dropped for plain text. From c6fa0e855a59e7b4cf1082c51ebc0cbc1ec4e1c6 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 28 Oct 2025 13:05:52 +0000 Subject: [PATCH 012/539] [X86] Attempt to fold trunc(srl(load(p),amt) -> load(p+amt/8) (#165266) As reported on #164853 - we only attempt to reduce shifted loads for constant shift amounts, but we could do more with non-constant values if value tracking can confirm basic alignments. This patch determines if a truncated shifted load of scalar integer shifts by a byte aligned amount and replaces the non-constant shift amount with a pointer offset instead. I had hoped to make this a generic DAG fold, but reduceLoadWidth isn't ready to be converted to a KnownBits value tracking mechanism, and other targets don't have complex address math like X86. 
Fixes #164853 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 34 + llvm/test/CodeGen/X86/bfloat-calling-conv.ll | 6 +- llvm/test/CodeGen/X86/trunc-srl-load.ll | 1652 ++--------------- ...ad-of-small-alloca-with-zero-upper-half.ll | 50 +- .../CodeGen/X86/widen-load-of-small-alloca.ll | 53 +- 5 files changed, 177 insertions(+), 1618 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 410f20edc6281..f514621094f13 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -54634,6 +54634,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); SDLoc DL(N); // Attempt to pre-truncate inputs to arithmetic ops instead. @@ -54652,6 +54653,39 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget)) return V; + // Fold trunc(srl(load(p),amt)) -> load(p+amt/8) + // If we're shifting down byte aligned bit chunks from a larger load for + // truncation, see if we can convert the shift into a pointer offset instead. + // Limit this to normal (non-ext) scalar integer loads. + if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL && + Src.hasOneUse() && Src.getOperand(0).hasOneUse() && + ISD::isNormalLoad(Src.getOperand(0).getNode())) { + auto *Ld = cast(Src.getOperand(0)); + if (Ld->isSimple() && VT.isByteSized() && + isPowerOf2_64(VT.getSizeInBits())) { + SDValue ShAmt = Src.getOperand(1); + KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + // Check the shift amount is byte aligned. + // Check the truncation doesn't use any shifted in (zero) top bits. 
+ if (KnownAmt.countMinTrailingZeros() >= 3 && + KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() - + VT.getSizeInBits())) { + EVT PtrVT = Ld->getBasePtr().getValueType(); + SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT); + SDValue PtrByteOfs = + DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs, + DAG.getShiftAmountConstant(3, PtrVT, DL)); + SDValue NewPtr = DAG.getMemBasePlusOffset( + Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap); + SDValue NewLoad = + DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1), + NewLoad.getValue(1)); + return NewLoad; + } + } + } + // The bitcast source is a direct mmx result. // Detect bitcasts between i32 to x86mmx if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) { diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll index ea4d32bae9ccb..d08749174f85c 100644 --- a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll +++ b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll @@ -660,8 +660,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 { ; SSE2-LABEL: call_ret_v3bf16: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rax -; SSE2-NEXT: movl 4(%rdi), %eax -; SSE2-NEXT: pinsrw $0, %eax, %xmm1 +; SSE2-NEXT: pinsrw $0, 4(%rdi), %xmm1 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: callq returns_v3bf16@PLT @@ -725,8 +724,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 { ; AVXNECONVERT-LABEL: call_ret_v3bf16: ; AVXNECONVERT: # %bb.0: ; AVXNECONVERT-NEXT: pushq %rax -; AVXNECONVERT-NEXT: movl 4(%rdi), %eax -; AVXNECONVERT-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVXNECONVERT-NEXT: vpinsrw $0, 4(%rdi), %xmm0, %xmm0 ; AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVXNECONVERT-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero ; AVXNECONVERT-NEXT: callq returns_v3bf16@PLT diff --git 
a/llvm/test/CodeGen/X86/trunc-srl-load.ll b/llvm/test/CodeGen/X86/trunc-srl-load.ll index 4dae1433b2196..d9c21d3a3f570 100644 --- a/llvm/test/CodeGen/X86/trunc-srl-load.ll +++ b/llvm/test/CodeGen/X86/trunc-srl-load.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE -; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE -; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64 ; Tests showing for the analysis of non-constant shift amounts to improve load address math @@ -12,42 +12,20 @@ define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind { ; X86-LABEL: extractSub64_16: ; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movb {{[0-9]+}}(%esp), %ch ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl 4(%eax), %esi -; X86-NEXT: movb %ch, %cl -; X86-NEXT: andb $16, %cl -; X86-NEXT: movl %esi, %eax -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: shrdl %cl, %esi, %edx -; X86-NEXT: testb $32, %ch -; X86-NEXT: jne .LBB0_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %eax -; X86-NEXT: .LBB0_2: -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: popl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $48, %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: movzwl 
(%eax,%ecx), %eax ; X86-NEXT: retl ; -; SSE-LABEL: extractSub64_16: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: andb $48, %cl -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shrq %cl, %rax -; SSE-NEXT: # kill: def $ax killed $ax killed $rax -; SSE-NEXT: retq -; -; AVX-LABEL: extractSub64_16: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: andb $48, %sil -; AVX-NEXT: shrxq %rsi, (%rdi), %rax -; AVX-NEXT: # kill: def $ax killed $ax killed $rax -; AVX-NEXT: retq +; X64-LABEL: extractSub64_16: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: andl $48, %esi +; X64-NEXT: shrl $3, %esi +; X64-NEXT: movzwl (%rdi,%rsi), %eax +; X64-NEXT: retq %idx_bounds = and i32 %idx, 63 %idx_align = and i32 %idx_bounds, -16 %sh = zext nneg i32 %idx_align to i64 @@ -60,67 +38,20 @@ define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind { define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind { ; X86-LABEL: extractSub128_16: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $32, %esp -; X86-NEXT: movzbl 12(%ebp), %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: movl 8(%ecx), %edi -; X86-NEXT: movl 12(%ecx), %ecx -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andb $16, %cl -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: movzbl %al, %edx -; X86-NEXT: movl (%esp,%edx), %eax -; X86-NEXT: movl 4(%esp,%edx), %edx -; X86-NEXT: shrdl %cl, %edx, %eax -; X86-NEXT: # kill: def 
$ax killed $ax killed $eax -; X86-NEXT: leal -8(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $112, %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: movzwl (%eax,%ecx), %eax ; X86-NEXT: retl ; -; SSE-LABEL: extractSub128_16: -; SSE: # %bb.0: -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andb $48, %cl -; SSE-NEXT: movq %rdx, %rdi -; SSE-NEXT: shrq %cl, %rdi -; SSE-NEXT: shrdq %cl, %rdx, %rax -; SSE-NEXT: testb $64, %sil -; SSE-NEXT: cmovneq %rdi, %rax -; SSE-NEXT: # kill: def $ax killed $ax killed $rax -; SSE-NEXT: retq -; -; AVX-LABEL: extractSub128_16: -; AVX: # %bb.0: -; AVX-NEXT: movq (%rdi), %rdx -; AVX-NEXT: movq 8(%rdi), %rax -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: andb $48, %cl -; AVX-NEXT: shrdq %cl, %rax, %rdx -; AVX-NEXT: shrxq %rcx, %rax, %rax -; AVX-NEXT: testb $64, %sil -; AVX-NEXT: cmoveq %rdx, %rax -; AVX-NEXT: # kill: def $ax killed $ax killed $rax -; AVX-NEXT: retq +; X64-LABEL: extractSub128_16: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: andl $112, %esi +; X64-NEXT: shrl $3, %esi +; X64-NEXT: movzwl (%rdi,%rsi), %eax +; X64-NEXT: retq %idx_bounds = and i32 %idx, 127 %idx_align = and i32 %idx_bounds, -16 %sh = zext nneg i32 %idx_align to i128 @@ -133,62 +64,20 @@ define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind { define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind { ; X86-LABEL: extractSub128_32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $32, %esp -; X86-NEXT: movzbl 12(%ebp), %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: movl 8(%ecx), %edi -; X86-NEXT: movl 12(%ecx), %ecx -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; 
X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: andb $96, %al -; X86-NEXT: shrb $3, %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl (%esp,%eax), %eax -; X86-NEXT: leal -8(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $96, %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: movl (%eax,%ecx), %eax ; X86-NEXT: retl ; -; SSE-LABEL: extractSub128_32: -; SSE: # %bb.0: -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andb $32, %cl -; SSE-NEXT: movq %rdx, %rdi -; SSE-NEXT: shrq %cl, %rdi -; SSE-NEXT: shrdq %cl, %rdx, %rax -; SSE-NEXT: testb $64, %sil -; SSE-NEXT: cmovneq %rdi, %rax -; SSE-NEXT: # kill: def $eax killed $eax killed $rax -; SSE-NEXT: retq -; -; AVX-LABEL: extractSub128_32: -; AVX: # %bb.0: -; AVX-NEXT: movq (%rdi), %rdx -; AVX-NEXT: movq 8(%rdi), %rax -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: andb $32, %cl -; AVX-NEXT: shrdq %cl, %rax, %rdx -; AVX-NEXT: shrxq %rcx, %rax, %rax -; AVX-NEXT: testb $64, %sil -; AVX-NEXT: cmoveq %rdx, %rax -; AVX-NEXT: # kill: def $eax killed $eax killed $rax -; AVX-NEXT: retq +; X64-LABEL: extractSub128_32: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: andl $96, %esi +; X64-NEXT: shrl $3, %esi +; X64-NEXT: movl (%rdi,%rsi), %eax +; X64-NEXT: retq %idx_bounds = and i32 %idx, 127 %idx_align = and i32 %idx_bounds, -32 %sh = zext nneg i32 %idx_align to i128 @@ -201,46 +90,20 @@ define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind { define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind { ; X86-LABEL: extractSub128_64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl 
%esp, %ebp -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $32, %esp -; X86-NEXT: movzbl 12(%ebp), %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: movl 8(%ecx), %edi -; X86-NEXT: movl 12(%ecx), %ecx -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: andb $64, %al -; X86-NEXT: shrb $3, %al -; X86-NEXT: movzbl %al, %ecx -; X86-NEXT: movl (%esp,%ecx), %eax -; X86-NEXT: movl 4(%esp,%ecx), %edx -; X86-NEXT: leal -8(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl $64, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%ecx,%edx), %eax +; X86-NEXT: movl 4(%ecx,%edx), %edx ; X86-NEXT: retl ; ; X64-LABEL: extractSub128_64: ; X64: # %bb.0: -; X64-NEXT: testb $64, %sil -; X64-NEXT: je .LBB3_1 -; X64-NEXT: # %bb.2: -; X64-NEXT: movq 8(%rdi), %rax -; X64-NEXT: retq -; X64-NEXT: .LBB3_1: -; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: andl $64, %esi +; X64-NEXT: shrl $3, %esi +; X64-NEXT: movq (%rdi,%rsi), %rax ; X64-NEXT: retq %idx_bounds = and i32 %idx, 127 %idx_align = and i32 %idx_bounds, -64 @@ -254,185 +117,20 @@ define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind { define i8 @extractSub512_8(ptr %word, i32 %idx) nounwind { ; X86-LABEL: extractSub512_8: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $192, %esp -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl (%eax), %ecx -; 
X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%eax), %ebx -; X86-NEXT: movl 44(%eax), %edi -; X86-NEXT: movl 48(%eax), %esi -; X86-NEXT: movl 52(%eax), %edx -; X86-NEXT: movl 56(%eax), %ecx -; X86-NEXT: movl 60(%eax), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl 12(%ebp), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: andl $24, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrl $3, %edx -; X86-NEXT: andl $60, %edx -; X86-NEXT: movl 48(%esp,%edx), %eax -; X86-NEXT: movl 52(%esp,%edx), %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shrdl %cl, %edx, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: andl $63, %ecx +; X86-NEXT: movzbl (%eax,%ecx), %eax ; X86-NEXT: retl ; -; SSE-LABEL: extractSub512_8: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: # kill: def $esi killed $esi def $rsi -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: movups 16(%rdi), %xmm1 -; SSE-NEXT: movups 32(%rdi), %xmm2 -; SSE-NEXT: movups 
48(%rdi), %xmm3 -; SSE-NEXT: xorps %xmm4, %xmm4 -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $56, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: movq -128(%rsp,%rsi), %rdx -; SSE-NEXT: shrq %cl, %rdx -; SSE-NEXT: movl -120(%rsp,%rsi), %eax -; SSE-NEXT: addl %eax, %eax -; SSE-NEXT: notl %ecx -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: orl %edx, %eax -; SSE-NEXT: # kill: def $al killed $al killed $rax -; SSE-NEXT: popq %rcx -; SSE-NEXT: retq -; -; AVX2-LABEL: extractSub512_8: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rax -; AVX2-NEXT: # kill: def $esi killed $esi def $rsi -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups 32(%rdi), %ymm1 -; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $56, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: notl %ecx -; AVX2-NEXT: movl -120(%rsp,%rsi), %edx -; AVX2-NEXT: addl %edx, %edx -; AVX2-NEXT: shlxq %rcx, %rdx, %rcx -; AVX2-NEXT: orl %ecx, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $rax -; AVX2-NEXT: popq %rcx -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: extractSub512_8: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rax -; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovups 32(%rdi), %ymm1 -; 
AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: # kill: def $esi killed $esi def $rsi -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $56, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rax -; AVX512-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX512-NEXT: notl %ecx -; AVX512-NEXT: movl -120(%rsp,%rsi), %edx -; AVX512-NEXT: addl %edx, %edx -; AVX512-NEXT: shlxq %rcx, %rdx, %rcx -; AVX512-NEXT: orl %ecx, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $rax -; AVX512-NEXT: popq %rcx -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: extractSub512_8: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: shrl $3, %esi +; X64-NEXT: andl $63, %esi +; X64-NEXT: movzbl (%rdi,%rsi), %eax +; X64-NEXT: retq %idx_bounds = and i32 %idx, 511 %idx_align = and i32 %idx_bounds, -8 %ld = load i512, ptr %word, align 8 @@ -445,152 +143,21 @@ define i8 @extractSub512_8(ptr %word, i32 %idx) nounwind { define i64 @extractSub512_64(ptr %word, i32 %idx) nounwind { ; X86-LABEL: extractSub512_64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $192, %esp -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl (%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%eax), %ecx -; X86-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%eax), %ebx -; X86-NEXT: movl 44(%eax), %edi -; X86-NEXT: movl 48(%eax), %esi -; X86-NEXT: movl 52(%eax), %edx -; X86-NEXT: movl 56(%eax), %ecx -; X86-NEXT: movl 60(%eax), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrl $3, %ecx -; X86-NEXT: andl $56, %ecx -; X86-NEXT: movl 48(%esp,%ecx), %eax -; X86-NEXT: movl 52(%esp,%ecx), %edx -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $56, %edx +; X86-NEXT: movl (%ecx,%edx), %eax +; X86-NEXT: movl 4(%ecx,%edx), %edx ; X86-NEXT: retl ; -; SSE-LABEL: extractSub512_64: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: # kill: def $esi killed $esi def $rsi -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: movups 16(%rdi), %xmm1 -; SSE-NEXT: movups 32(%rdi), %xmm2 -; SSE-NEXT: movups 48(%rdi), %xmm3 -; SSE-NEXT: xorps %xmm4, %xmm4 -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: movq -128(%rsp,%rsi), %rax -; SSE-NEXT: popq %rcx -; SSE-NEXT: retq -; -; AVX2-LABEL: 
extractSub512_64: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rax -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups 32(%rdi), %ymm1 -; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: # kill: def $esi killed $esi def $rsi -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: movq -128(%rsp,%rsi), %rax -; AVX2-NEXT: popq %rcx -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: extractSub512_64: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rax -; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovups 32(%rdi), %ymm1 -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: # kill: def $esi killed $esi def $rsi -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: movq -128(%rsp,%rsi), %rax -; AVX512-NEXT: popq %rcx -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: extractSub512_64: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: shrl $3, %esi +; X64-NEXT: andl $56, %esi +; X64-NEXT: movq (%rdi,%rsi), %rax +; X64-NEXT: retq %idx_bounds = and i32 %idx, 511 %idx_align = and i32 %idx_bounds, -64 %sh = zext nneg i32 %idx_align to i512 @@ -603,143 +170,35 @@ define i64 @extractSub512_64(ptr %word, i32 %idx) nounwind { define i128 @extractSub512_128(ptr %word, i32 %idx) nounwind { ; X86-LABEL: extractSub512_128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $192, %esp -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl (%eax), %ecx -; X86-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%eax), %ebx -; X86-NEXT: movl 44(%eax), %edi -; X86-NEXT: movl 48(%eax), %esi -; X86-NEXT: movl 52(%eax), %edx -; X86-NEXT: movl 56(%eax), %ecx -; X86-NEXT: movl 60(%eax), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; 
X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl 16(%ebp), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrl $3, %edi -; X86-NEXT: andl $48, %edi -; X86-NEXT: movl 48(%esp,%edi), %ecx -; X86-NEXT: movl 52(%esp,%edi), %edx -; X86-NEXT: movl 56(%esp,%edi), %esi -; X86-NEXT: movl 60(%esp,%edi), %edi -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $48, %edx +; X86-NEXT: movl (%ecx,%edx), %esi +; X86-NEXT: movl 4(%ecx,%edx), %edi +; X86-NEXT: movl 8(%ecx,%edx), %ebx +; X86-NEXT: movl 12(%ecx,%edx), %ecx +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ebx, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx 
-; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; -; SSE-LABEL: extractSub512_128: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: # kill: def $esi killed $esi def $rsi -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: movups 16(%rdi), %xmm1 -; SSE-NEXT: movups 32(%rdi), %xmm2 -; SSE-NEXT: movups 48(%rdi), %xmm3 -; SSE-NEXT: xorps %xmm4, %xmm4 -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $48, %esi -; SSE-NEXT: movq -128(%rsp,%rsi), %rax -; SSE-NEXT: movq -120(%rsp,%rsi), %rdx -; SSE-NEXT: popq %rcx -; SSE-NEXT: retq -; -; AVX-LABEL: extractSub512_128: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rax -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: vmovups 32(%rdi), %ymm1 -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: shrl $3, %esi -; AVX-NEXT: andl $48, %esi -; AVX-NEXT: movq -128(%rsp,%rsi), %rax -; AVX-NEXT: movq -120(%rsp,%rsi), %rdx -; AVX-NEXT: popq %rcx -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; X64-LABEL: extractSub512_128: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: shrl $3, %esi +; X64-NEXT: andl $48, %esi +; X64-NEXT: movq (%rdi,%rsi), %rax +; X64-NEXT: movq 8(%rdi,%rsi), %rdx +; X64-NEXT: retq %idx_bounds = and i32 %idx, 511 %idx_align = and i32 %idx_bounds, -128 %sh = zext nneg i32 %idx_align to i512 @@ -752,916 +211,21 @@ define i128 @extractSub512_128(ptr %word, i32 %idx) nounwind { define i64 @extractSub4096_64(ptr %word, i32 %idx) 
nounwind { ; X86-LABEL: extractSub4096_64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $1536, %esp # imm = 0x600 -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 64(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 72(%eax), %ecx -; X86-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 76(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 80(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 84(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 88(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 92(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 96(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 100(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 104(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 108(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 112(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 116(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 120(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 124(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 128(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 132(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 136(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 140(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 144(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 148(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 152(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-NEXT: movl 156(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 160(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 164(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 168(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 172(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 176(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 180(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 184(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 188(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 192(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 196(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 200(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 204(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 208(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 212(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 216(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 220(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 224(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 228(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 232(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 236(%eax), %ecx -; 
X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 240(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 244(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 248(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 252(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 256(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 260(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 264(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 268(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 272(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 276(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 280(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 284(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 288(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 292(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 296(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 300(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 304(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 308(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 312(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 316(%eax), %ecx -; X86-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 320(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 324(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 328(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 332(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 336(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 340(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 344(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 348(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 352(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 356(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 360(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 364(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 368(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 372(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 376(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 380(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 384(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 388(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 392(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-NEXT: movl 396(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 400(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 404(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 408(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 412(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 416(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 420(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 424(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 428(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 432(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 436(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 440(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 444(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 448(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 452(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 456(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 460(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 464(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 468(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 472(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 476(%eax), %ecx -; 
X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 480(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 484(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 488(%eax), %ebx -; X86-NEXT: movl 492(%eax), %edi -; X86-NEXT: movl 496(%eax), %esi -; X86-NEXT: movl 500(%eax), %edx -; X86-NEXT: movl 504(%eax), %ecx -; X86-NEXT: movl 508(%eax), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; 
X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl $4032, %ecx # imm = 0xFC0 -; X86-NEXT: andl 12(%ebp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; 
X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: 
movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrl $3, %ecx -; X86-NEXT: movl 496(%esp,%ecx), %eax -; X86-NEXT: movl 500(%esp,%ecx), %edx -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $4032, %edx # imm = 0xFC0 +; X86-NEXT: andl {{[0-9]+}}(%esp), %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%ecx,%edx), %eax +; X86-NEXT: movl 4(%ecx,%edx), %edx ; X86-NEXT: retl ; -; SSE-LABEL: extractSub4096_64: -; SSE: # %bb.0: -; SSE-NEXT: subq $1176, %rsp # imm = 0x498 -; SSE-NEXT: # kill: def $esi killed $esi def $rsi -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 16(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 32(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 48(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 64(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 80(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: movups 96(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 112(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 128(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movups 144(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 160(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 176(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 192(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 208(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 224(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 240(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 256(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 272(%rdi), %xmm15 -; SSE-NEXT: movups 288(%rdi), %xmm14 -; SSE-NEXT: movups 304(%rdi), %xmm13 -; SSE-NEXT: movups 320(%rdi), %xmm12 -; SSE-NEXT: movups 336(%rdi), %xmm11 -; SSE-NEXT: movups 352(%rdi), %xmm10 -; SSE-NEXT: movups 368(%rdi), %xmm9 -; SSE-NEXT: movups 384(%rdi), %xmm8 -; SSE-NEXT: movups 400(%rdi), %xmm7 -; SSE-NEXT: movups 416(%rdi), %xmm6 -; SSE-NEXT: movups 432(%rdi), %xmm5 -; SSE-NEXT: movups 448(%rdi), %xmm4 -; SSE-NEXT: movups 464(%rdi), %xmm3 -; SSE-NEXT: movups 480(%rdi), %xmm2 -; SSE-NEXT: movups 496(%rdi), %xmm1 -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) 
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; 
SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: andl $4032, %esi # imm = 0xFC0 -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: movq 144(%rsp,%rsi), %rax -; SSE-NEXT: addq $1176, %rsp # imm = 0x498 -; SSE-NEXT: retq -; -; AVX2-LABEL: extractSub4096_64: -; AVX2: # %bb.0: -; AVX2-NEXT: subq $936, %rsp # imm = 0x3A8 -; 
AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups 32(%rdi), %ymm1 -; AVX2-NEXT: vmovups 64(%rdi), %ymm2 -; AVX2-NEXT: vmovups 96(%rdi), %ymm3 -; AVX2-NEXT: vmovups 128(%rdi), %ymm4 -; AVX2-NEXT: vmovups 160(%rdi), %ymm5 -; AVX2-NEXT: vmovups 192(%rdi), %ymm6 -; AVX2-NEXT: vmovups 224(%rdi), %ymm7 -; AVX2-NEXT: vmovups 256(%rdi), %ymm8 -; AVX2-NEXT: vmovups 288(%rdi), %ymm9 -; AVX2-NEXT: vmovups 320(%rdi), %ymm10 -; AVX2-NEXT: vmovups 352(%rdi), %ymm11 -; AVX2-NEXT: vmovups 384(%rdi), %ymm12 -; AVX2-NEXT: vmovups 416(%rdi), %ymm13 -; AVX2-NEXT: vmovups 448(%rdi), %ymm14 -; AVX2-NEXT: vmovups 480(%rdi), %ymm15 -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm15, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm14, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm13, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm12, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm11, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm10, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm9, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm8, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm6, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm5, {{[0-9]+}}(%rsp) -; AVX2-NEXT: 
vmovups %ymm4, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm3, (%rsp) -; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: # kill: def $esi killed $esi def $rsi -; AVX2-NEXT: andl $4032, %esi # imm = 0xFC0 -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: movq -96(%rsp,%rsi), %rax -; AVX2-NEXT: addq $936, %rsp # imm = 0x3A8 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: extractSub4096_64: -; AVX512: # %bb.0: -; AVX512-NEXT: subq $904, %rsp # imm = 0x388 -; AVX512-NEXT: # kill: def $esi killed $esi def $rsi -; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovups 32(%rdi), %ymm1 -; AVX512-NEXT: vmovups 64(%rdi), %ymm2 -; AVX512-NEXT: vmovups 96(%rdi), %ymm3 -; AVX512-NEXT: vmovups 128(%rdi), %ymm4 -; AVX512-NEXT: vmovups 160(%rdi), %ymm5 -; AVX512-NEXT: vmovups 192(%rdi), %ymm6 -; AVX512-NEXT: vmovups 224(%rdi), %ymm7 -; AVX512-NEXT: vmovups 256(%rdi), %ymm8 -; AVX512-NEXT: vmovups 288(%rdi), %ymm9 -; AVX512-NEXT: vmovups 320(%rdi), %ymm10 -; AVX512-NEXT: vmovups 352(%rdi), %ymm11 -; AVX512-NEXT: vmovups 384(%rdi), %ymm12 -; AVX512-NEXT: vmovups 416(%rdi), %ymm13 -; AVX512-NEXT: andl $4032, %esi # imm = 0xFC0 -; AVX512-NEXT: vmovups 448(%rdi), %ymm14 -; AVX512-NEXT: vmovups 480(%rdi), %ymm15 -; AVX512-NEXT: vxorps %xmm16, %xmm16, %xmm16 -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: 
vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm15, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm14, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm13, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm12, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm11, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm10, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm9, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm8, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm6, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm5, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm4, (%rsp) -; AVX512-NEXT: vmovups %ymm3, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: movq -128(%rsp,%rsi), %rax -; AVX512-NEXT: addq $904, %rsp # imm = 0x388 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: extractSub4096_64: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: andl $4032, %esi # imm = 0xFC0 +; X64-NEXT: shrl $3, %esi +; X64-NEXT: movq (%rdi,%rsi), %rax +; X64-NEXT: retq %idx_bounds = and i32 %idx, 4095 %idx_align = and i32 %idx_bounds, -64 %sh = zext nneg i32 %idx_align to i4096 diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll index 81c4d5d71084c..c3054a365c466 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -962,39 +962,22 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 } define void 
@load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: -; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1 -; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx -; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movl %ecx, %eax -; X64-NO-BMI2-NEXT: shrb $6, %al -; X64-NO-BMI2-NEXT: movzbl %al, %eax -; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax -; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NO-BMI2-NEXT: shrq %cl, %rax -; X64-NO-BMI2-NEXT: movb %al, (%rdx) -; X64-NO-BMI2-NEXT: retq -; -; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: -; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-BMI2-NEXT: xorps %xmm1, %xmm1 -; X64-BMI2-NEXT: shll $3, %esi -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movl %esi, %eax -; X64-BMI2-NEXT: shrb $6, %al -; X64-BMI2-NEXT: movzbl %al, %eax -; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax -; X64-BMI2-NEXT: movb %al, (%rdx) -; X64-BMI2-NEXT: retq +; X64-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X64: # %bb.0: +; X64-NEXT: movups (%rdi), %xmm0 +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: leal (,%rsi,8), %eax +; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: shrb $6, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: leaq -72(%rsp,%rax,8), %rax +; X64-NEXT: andl $7, %esi +; X64-NEXT: movzbl 
(%rsi,%rax), %eax +; X64-NEXT: movb %al, (%rdx) +; X64-NEXT: retq ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: @@ -3417,7 +3400,6 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; ALL: {{.*}} -; X64: {{.*}} ; X64-NO-SHLD: {{.*}} ; X86: {{.*}} ; X86-HAVE-BMI2-HAVE-SHLD: {{.*}} diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll index 8d36eef952a2b..84c2cc6d5ec31 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -1220,41 +1220,23 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; no @load_16byte_chunk_of_16byte_alloca define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca: -; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1 -; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx -; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2 -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movl %ecx, %eax -; X64-NO-BMI2-NEXT: shrb $6, %al -; X64-NO-BMI2-NEXT: movzbl %al, %eax -; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax -; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NO-BMI2-NEXT: shrq %cl, %rax -; X64-NO-BMI2-NEXT: movb %al, (%rdx) -; X64-NO-BMI2-NEXT: retq -; -; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca: -; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-BMI2-NEXT: movups 16(%rdi), %xmm1 -; X64-BMI2-NEXT: shll $3, %esi -; X64-BMI2-NEXT: xorps 
%xmm2, %xmm2 -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movl %esi, %eax -; X64-BMI2-NEXT: shrb $6, %al -; X64-BMI2-NEXT: movzbl %al, %eax -; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax -; X64-BMI2-NEXT: movb %al, (%rdx) -; X64-BMI2-NEXT: retq +; X64-LABEL: load_1byte_chunk_of_32byte_alloca: +; X64: # %bb.0: +; X64-NEXT: movups (%rdi), %xmm0 +; X64-NEXT: movups 16(%rdi), %xmm1 +; X64-NEXT: leal (,%rsi,8), %eax +; X64-NEXT: xorps %xmm2, %xmm2 +; X64-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: shrb $6, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: leaq -72(%rsp,%rax,8), %rax +; X64-NEXT: andl $7, %esi +; X64-NEXT: movzbl (%rsi,%rax), %eax +; X64-NEXT: movb %al, (%rdx) +; X64-NEXT: retq ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca: ; X86-NO-BMI2-NO-SHLD: # %bb.0: @@ -2156,7 +2138,6 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; no @load_32byte_chunk_of_32byte_alloca ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; ALL: {{.*}} -; X64: {{.*}} ; X64-NO-SHLD: {{.*}} ; X86: {{.*}} ; X86-NO-SHLD: {{.*}} From 95783bcd5650534a785f0d6033244d830bb4116d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Pettersson?= Date: Tue, 28 Oct 2025 14:33:04 +0100 Subject: [PATCH 013/539] [lit] Fix to make "RUN: env PATH=..." work as intended (#165308) There was a bug in llvm-lit related to setting PATH using env in the internal shell. The new PATH wasn't used when looking up the command to be executed. So when doing things like this in a test case RUN: mkdir %t RUN: env PATH=%t program ... 
the internal shell would search for "program" using the orignal PATH and not the PATH set by env when preceeding the command. It seems like this was a simple mistake in commit 57782eff31e9d454, since the logic to pick a PATH from the cmd_shenv instead of shenv actually was added in that patch, but the resulting path wasn't used. --- llvm/utils/lit/lit/TestRunner.py | 2 +- llvm/utils/lit/tests/Inputs/shtest-env-path/lit.cfg | 8 ++++++++ .../utils/lit/tests/Inputs/shtest-env-path/path.txt | 8 ++++++++ llvm/utils/lit/tests/Inputs/shtest-env-path/test.sh | 4 ++++ llvm/utils/lit/tests/shtest-env-path.py | 13 +++++++++++++ 5 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 llvm/utils/lit/tests/Inputs/shtest-env-path/lit.cfg create mode 100644 llvm/utils/lit/tests/Inputs/shtest-env-path/path.txt create mode 100755 llvm/utils/lit/tests/Inputs/shtest-env-path/test.sh create mode 100644 llvm/utils/lit/tests/shtest-env-path.py diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index f88314547bb3f..9fba96a1471a0 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -945,7 +945,7 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): path = ( cmd_shenv.env["PATH"] if "PATH" in cmd_shenv.env else shenv.env["PATH"] ) - executable = lit.util.which(args[0], shenv.env["PATH"]) + executable = lit.util.which(args[0], path) if not executable: raise InternalShellError(j, "%r: command not found" % args[0]) diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-path/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-env-path/lit.cfg new file mode 100644 index 0000000000000..36517f998530b --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-path/lit.cfg @@ -0,0 +1,8 @@ +import lit.formats + +config.name = "shtest-env-path" +config.suffixes = [".txt"] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None +config.substitutions.append(("%{python}", '"%s"' % 
(sys.executable))) diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-path/path.txt b/llvm/utils/lit/tests/Inputs/shtest-env-path/path.txt new file mode 100644 index 0000000000000..b36e861ec5632 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-path/path.txt @@ -0,0 +1,8 @@ +## Tests env command for setting the PATH variable. + +## Check that test.sh can be found using the configured PATH. +# +# RUN: env PATH=%S test.sh | FileCheck --check-prefix=CHECK %s +# + +# CHECK: TEST-ENV-PATH-123 diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-path/test.sh b/llvm/utils/lit/tests/Inputs/shtest-env-path/test.sh new file mode 100755 index 0000000000000..a1e46fc210d49 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-path/test.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "TEST-ENV-PATH-123" + diff --git a/llvm/utils/lit/tests/shtest-env-path.py b/llvm/utils/lit/tests/shtest-env-path.py new file mode 100644 index 0000000000000..bf459ae53fbc0 --- /dev/null +++ b/llvm/utils/lit/tests/shtest-env-path.py @@ -0,0 +1,13 @@ +## Tests env command for setting the PATH variable. + +# The test is using /bin/sh. Limit to system known to have /bin/sh. +# REQUIRES: system-linux + +# RUN: %{lit} -a -v %{inputs}/shtest-env-path/path.txt \ +# RUN: | FileCheck -match-full-lines %s +# +# END. + +# CHECK: -- Testing: 1 tests{{.*}} +# CHECK: PASS: shtest-env-path :: path.txt (1 of 1) +# CHECK: -- From 15626896c98289e36b2c4090ae93d59f9abb2d1b Mon Sep 17 00:00:00 2001 From: Ebuka Ezike Date: Tue, 28 Oct 2025 13:33:22 +0000 Subject: [PATCH 014/539] [lldb][test] When an external stdlib is specified do not link to the system stdlib (#164462) On linux if you specify the an external libc++ and clang will still link to the system's libc++. This patch fixes that. 
Fixes https://github.com/llvm/llvm-project/issues/116040 --- lldb/packages/Python/lldbsuite/test/make/Makefile.rules | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index e72ffd1f030ec..09939e29e5b75 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -386,7 +386,9 @@ ifeq (,$(filter 1, $(USE_LIBSTDCPP) $(USE_LIBCPP) $(USE_SYSTEM_STDLIB))) ifneq "$(LIBCPP_INCLUDE_TARGET_DIR)" "" CXXFLAGS += -cxx-isystem $(LIBCPP_INCLUDE_TARGET_DIR) endif - LDFLAGS += -L$(LIBCPP_LIBRARY_DIR) -Wl,-rpath,$(LIBCPP_LIBRARY_DIR) -lc++ + + # If `-nostdlib++` is not passed, clang will link to the system's stdlib. + LDFLAGS += -nostdlib++ -L$(LIBCPP_LIBRARY_DIR) -Wl,-rpath,$(LIBCPP_LIBRARY_DIR) -lc++ else USE_SYSTEM_STDLIB := 1 endif @@ -407,7 +409,8 @@ ifeq (1,$(USE_LIBCPP)) ifneq "$(LIBCPP_INCLUDE_TARGET_DIR)" "" CXXFLAGS += -cxx-isystem $(LIBCPP_INCLUDE_TARGET_DIR) endif - LDFLAGS += -L$(LIBCPP_LIBRARY_DIR) -Wl,-rpath,$(LIBCPP_LIBRARY_DIR) -lc++ + # If `-nostdlib++` is not passed, clang will link to the system's stdlib. + LDFLAGS += -nostdlib++ -L$(LIBCPP_LIBRARY_DIR) -Wl,-rpath,$(LIBCPP_LIBRARY_DIR) -lc++ else ifeq "$(OS)" "Android" # Nothing to do, this is already handled in From 1fb35099a73751314017ca8d7cfbdd77657000e2 Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Tue, 28 Oct 2025 14:38:34 +0100 Subject: [PATCH 015/539] Revert "[nsan] More unit tests for `float128`. (#165248)" (#165391) This reverts commit 2f869c427b6c800f37147458ac03d1fa6f9ad9d3. 
Breaks build on some configurations --- compiler-rt/lib/nsan/tests/NSanUnitTest.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/compiler-rt/lib/nsan/tests/NSanUnitTest.cpp b/compiler-rt/lib/nsan/tests/NSanUnitTest.cpp index 73b59671fe07a..d121292c36682 100644 --- a/compiler-rt/lib/nsan/tests/NSanUnitTest.cpp +++ b/compiler-rt/lib/nsan/tests/NSanUnitTest.cpp @@ -43,8 +43,8 @@ template void TestFT() { ASSERT_EQ(GetULPDiff(-X, -Y), 3); // Values with larger differences. - static constexpr const __uint128_t MantissaSize = - __uint128_t{1} << FTInfo::kMantissaBits; + static constexpr const __sanitizer::u64 MantissaSize = + __sanitizer::u64{1} << FTInfo::kMantissaBits; ASSERT_EQ(GetULPDiff(1.0, next(2.0, 1.0)), MantissaSize - 1); ASSERT_EQ(GetULPDiff(1.0, 2.0), MantissaSize); ASSERT_EQ(GetULPDiff(1.0, next(2.0, 3.0)), MantissaSize + 1); @@ -57,6 +57,11 @@ TEST(NSanTest, Double) { TestFT(nextafter)>(); } -TEST(NSanTest, Float128) { TestFT<__float128, nextafterf128>(); } +TEST(NSanTest, Float128) { + // Very basic tests. FIXME: improve when we have nextafter<__float128>. + ASSERT_EQ(GetULPDiff<__float128>(0.0, 0.0), 0); + ASSERT_EQ(GetULPDiff<__float128>(-0.0, 0.0), 0); + ASSERT_NE(GetULPDiff<__float128>(-0.01, 0.01), kMaxULPDiff); +} } // end namespace __nsan From e322b9fd5ac920eb42266c77ba5d0b927371873f Mon Sep 17 00:00:00 2001 From: Kunqiu Chen Date: Tue, 28 Oct 2025 21:45:17 +0800 Subject: [PATCH 016/539] [AbstractCallSite] Handle Indirect Calls Properly (#163003) AbstractCallSite handles three types of calls (direct, indirect, and callback). This patch fixes the handling of indirect calls in some methods, which incorrectly assumed that non-direct calls are always callback calls. Moreover, this PR adds 2 unit tests for direct call type and indirect call type. The aforementioned misassumption leads to the following problem: --- ## Problem When the underlying call is **indirect**, some APIs of `AbstractCallSite` behave unexpectedly. 
E.g., `AbstractCallSite::getCalledFunction()` currently triggers an **assertion failure**, instead of returning `nullptr` as documented: ```cpp /// Return the function being called if this is a direct call, otherwise /// return null (if it's an indirect call). Function *getCalledFunction() const; ``` Actual unexpected assertion failure: ``` AbstractCallSite.h:197: int llvm::AbstractCallSite::getCallArgOperandNoForCallee() const: Assertion `isCallbackCall()' failed. ``` This is because `AbstractCallSite` mistakenly entered the branch that handles Callback Calls as its guard condition (`!isDirectCall()`) does not take into account the case of indirect calls --- llvm/include/llvm/IR/AbstractCallSite.h | 10 +-- llvm/unittests/IR/AbstractCallSiteTest.cpp | 94 +++++++++++++++++++++- 2 files changed, 98 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/IR/AbstractCallSite.h b/llvm/include/llvm/IR/AbstractCallSite.h index 9e24ae7d1b431..f431e1d8a38ef 100644 --- a/llvm/include/llvm/IR/AbstractCallSite.h +++ b/llvm/include/llvm/IR/AbstractCallSite.h @@ -137,7 +137,7 @@ class AbstractCallSite { /// Return true if @p U is the use that defines the callee of this ACS. bool isCallee(const Use *U) const { - if (isDirectCall()) + if (!isCallbackCall()) return CB->isCallee(U); assert(!CI.ParameterEncoding.empty() && @@ -154,7 +154,7 @@ class AbstractCallSite { /// Return the number of parameters of the callee. unsigned getNumArgOperands() const { - if (isDirectCall()) + if (!isCallbackCall()) return CB->arg_size(); // Subtract 1 for the callee encoding. return CI.ParameterEncoding.size() - 1; @@ -169,7 +169,7 @@ class AbstractCallSite { /// Return the operand index of the underlying instruction associated with /// the function parameter number @p ArgNo or -1 if there is none. int getCallArgOperandNo(unsigned ArgNo) const { - if (isDirectCall()) + if (!isCallbackCall()) return ArgNo; // Add 1 for the callee encoding. 
return CI.ParameterEncoding[ArgNo + 1]; @@ -183,7 +183,7 @@ class AbstractCallSite { /// Return the operand of the underlying instruction associated with the /// function parameter number @p ArgNo or nullptr if there is none. Value *getCallArgOperand(unsigned ArgNo) const { - if (isDirectCall()) + if (!isCallbackCall()) return CB->getArgOperand(ArgNo); // Add 1 for the callee encoding. return CI.ParameterEncoding[ArgNo + 1] >= 0 @@ -210,7 +210,7 @@ class AbstractCallSite { /// Return the pointer to function that is being called. Value *getCalledOperand() const { - if (isDirectCall()) + if (!isCallbackCall()) return CB->getCalledOperand(); return CB->getArgOperand(getCallArgOperandNoForCallee()); } diff --git a/llvm/unittests/IR/AbstractCallSiteTest.cpp b/llvm/unittests/IR/AbstractCallSiteTest.cpp index ddb10911ad028..623d1b36e1c03 100644 --- a/llvm/unittests/IR/AbstractCallSiteTest.cpp +++ b/llvm/unittests/IR/AbstractCallSiteTest.cpp @@ -6,8 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/AsmParser/Parser.h" #include "llvm/IR/AbstractCallSite.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/IR/Argument.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/Support/SourceMgr.h" @@ -51,5 +52,96 @@ TEST(AbstractCallSite, CallbackCall) { EXPECT_TRUE(ACS); EXPECT_TRUE(ACS.isCallbackCall()); EXPECT_TRUE(ACS.isCallee(CallbackUse)); + EXPECT_EQ(ACS.getCalleeUseForCallback(), *CallbackUse); EXPECT_EQ(ACS.getCalledFunction(), Callback); + + // The callback metadata {CallbackNo, Arg0No, ..., isVarArg} = {1, -1, true} + EXPECT_EQ(ACS.getCallArgOperandNoForCallee(), 1); + // Though the callback metadata only specifies ONE unfixed argument No, the + // callback callee is vararg, hence the third arg is also considered as + // another arg for the callback. 
+ EXPECT_EQ(ACS.getNumArgOperands(), 2u); + Argument *Param0 = Callback->getArg(0), *Param1 = Callback->getArg(1); + ASSERT_TRUE(Param0 && Param1); + EXPECT_EQ(ACS.getCallArgOperandNo(*Param0), -1); + EXPECT_EQ(ACS.getCallArgOperandNo(*Param1), 2); +} + +TEST(AbstractCallSite, DirectCall) { + LLVMContext C; + + const char *IR = "declare void @bar(i32 %x, i32 %y)\n" + "define void @foo() {\n" + " call void @bar(i32 1, i32 2)\n" + " ret void\n" + "}\n"; + + std::unique_ptr M = parseIR(C, IR); + ASSERT_TRUE(M); + + Function *Callee = M->getFunction("bar"); + ASSERT_NE(Callee, nullptr); + + const Use *DirectCallUse = Callee->getSingleUndroppableUse(); + ASSERT_NE(DirectCallUse, nullptr); + + AbstractCallSite ACS(DirectCallUse); + EXPECT_TRUE(ACS); + EXPECT_TRUE(ACS.isDirectCall()); + EXPECT_TRUE(ACS.isCallee(DirectCallUse)); + EXPECT_EQ(ACS.getCalledFunction(), Callee); + EXPECT_EQ(ACS.getNumArgOperands(), 2u); + Argument *ArgX = Callee->getArg(0); + ASSERT_NE(ArgX, nullptr); + Value *CAO1 = ACS.getCallArgOperand(*ArgX); + Value *CAO2 = ACS.getCallArgOperand(0); + ASSERT_NE(CAO2, nullptr); + // The two call arg operands should be the same object, since they are both + // the first argument of the call. 
+ EXPECT_EQ(CAO2, CAO1); + + ConstantInt *FirstArgInt = dyn_cast(CAO2); + ASSERT_NE(FirstArgInt, nullptr); + EXPECT_EQ(FirstArgInt->getZExtValue(), 1ull); + + EXPECT_EQ(ACS.getCallArgOperandNo(*ArgX), 0); + EXPECT_EQ(ACS.getCallArgOperandNo(0), 0); + EXPECT_EQ(ACS.getCallArgOperandNo(1), 1); +} + +TEST(AbstractCallSite, IndirectCall) { + LLVMContext C; + + const char *IR = "define void @foo(ptr %0) {\n" + " call void %0(i32 1, i32 2)\n" + " ret void\n" + "}\n"; + + std::unique_ptr M = parseIR(C, IR); + ASSERT_TRUE(M); + + Function *Fun = M->getFunction("foo"); + ASSERT_NE(Fun, nullptr); + + Argument *ArgAsCallee = Fun->getArg(0); + ASSERT_NE(ArgAsCallee, nullptr); + + const Use *IndCallUse = ArgAsCallee->getSingleUndroppableUse(); + ASSERT_NE(IndCallUse, nullptr); + + AbstractCallSite ACS(IndCallUse); + EXPECT_TRUE(ACS); + EXPECT_TRUE(ACS.isIndirectCall()); + EXPECT_TRUE(ACS.isCallee(IndCallUse)); + EXPECT_EQ(ACS.getCalledFunction(), nullptr); + EXPECT_EQ(ACS.getCalledOperand(), ArgAsCallee); + EXPECT_EQ(ACS.getNumArgOperands(), 2u); + Value *CalledOperand = ACS.getCallArgOperand(0); + ASSERT_NE(CalledOperand, nullptr); + ConstantInt *FirstArgInt = dyn_cast(CalledOperand); + ASSERT_NE(FirstArgInt, nullptr); + EXPECT_EQ(FirstArgInt->getZExtValue(), 1ull); + + EXPECT_EQ(ACS.getCallArgOperandNo(0), 0); + EXPECT_EQ(ACS.getCallArgOperandNo(1), 1); } From ca4f91e6cb25ba2dd82b1b4bb87a3037b04aaee8 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Tue, 28 Oct 2025 13:44:57 +0000 Subject: [PATCH 017/539] [NFC][Clang] Regenerate CHECKs - CodeGen/AArch64/neon-across.c --- clang/test/CodeGen/AArch64/neon-across.c | 56 ++++++++++++------------ 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/clang/test/CodeGen/AArch64/neon-across.c b/clang/test/CodeGen/AArch64/neon-across.c index d365975593559..aa0387d89dfef 100644 --- a/clang/test/CodeGen/AArch64/neon-across.c +++ b/clang/test/CodeGen/AArch64/neon-across.c @@ -49,7 +49,7 @@ uint32_t test_vaddlv_u16(uint16x4_t a) { 
} // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s8 -// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16 @@ -60,7 +60,7 @@ int16_t test_vaddlvq_s8(int8x16_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s16 -// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> [[A]]) // CHECK-NEXT: ret i32 [[VADDLV_I]] @@ -70,7 +70,7 @@ int32_t test_vaddlvq_s16(int16x8_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s32 -// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i64 [[VADDLVQ_S32_I]] @@ -80,7 +80,7 @@ int64_t test_vaddlvq_s32(int32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u8 -// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16 @@ -91,7 +91,7 @@ uint16_t test_vaddlvq_u8(uint8x16_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u16 -// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> [[A]]) // CHECK-NEXT: ret i32 [[VADDLV_I]] @@ -101,7 +101,7 @@ uint32_t test_vaddlvq_u16(uint16x8_t a) { } // CHECK-LABEL: 
define {{[^@]+}}@test_vaddlvq_u32 -// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i64 [[VADDLVQ_U32_I]] @@ -155,7 +155,7 @@ uint16_t test_vmaxv_u16(uint16x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s8 -// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8 @@ -166,7 +166,7 @@ int8_t test_vmaxvq_s8(int8x16_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s16 -// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16 @@ -177,7 +177,7 @@ int16_t test_vmaxvq_s16(int16x8_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s32 -// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VMAXVQ_S32_I]] @@ -187,7 +187,7 @@ int32_t test_vmaxvq_s32(int32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u8 -// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8 @@ -198,7 +198,7 @@ uint8_t test_vmaxvq_u8(uint8x16_t a) { } // CHECK-LABEL: define 
{{[^@]+}}@test_vmaxvq_u16 -// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16 @@ -209,7 +209,7 @@ uint16_t test_vmaxvq_u16(uint16x8_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u32 -// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VMAXVQ_U32_I]] @@ -263,7 +263,7 @@ uint16_t test_vminv_u16(uint16x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vminvq_s8 -// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8 @@ -274,7 +274,7 @@ int8_t test_vminvq_s8(int8x16_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vminvq_s16 -// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16 @@ -285,7 +285,7 @@ int16_t test_vminvq_s16(int16x8_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vminvq_s32 -// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VMINVQ_S32_I]] @@ -295,7 +295,7 @@ int32_t test_vminvq_s32(int32x4_t a) { } // CHECK-LABEL: define 
{{[^@]+}}@test_vminvq_u8 -// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8 @@ -306,7 +306,7 @@ uint8_t test_vminvq_u8(uint8x16_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vminvq_u16 -// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16 @@ -317,7 +317,7 @@ uint16_t test_vminvq_u16(uint16x8_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vminvq_u32 -// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VMINVQ_U32_I]] @@ -371,7 +371,7 @@ uint16_t test_vaddv_u16(uint16x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s8 -// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8 @@ -382,7 +382,7 @@ int8_t test_vaddvq_s8(int8x16_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s16 -// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16 @@ -393,7 +393,7 @@ int16_t test_vaddvq_s16(int16x8_t a) { } // CHECK-LABEL: define 
{{[^@]+}}@test_vaddvq_s32 -// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VADDVQ_S32_I]] @@ -403,7 +403,7 @@ int32_t test_vaddvq_s32(int32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u8 -// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8 @@ -414,7 +414,7 @@ uint8_t test_vaddvq_u8(uint8x16_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u16 -// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16 @@ -425,7 +425,7 @@ uint16_t test_vaddvq_u16(uint16x8_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u32 -// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VADDVQ_U32_I]] @@ -435,7 +435,7 @@ uint32_t test_vaddvq_u32(uint32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_f32 -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> [[A]]) // CHECK-NEXT: ret float [[VMAXVQ_F32_I]] @@ -445,7 +445,7 @@ float32_t test_vmaxvq_f32(float32x4_t a) { } // CHECK-LABEL: define 
{{[^@]+}}@test_vminvq_f32 -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> [[A]]) // CHECK-NEXT: ret float [[VMINVQ_F32_I]] @@ -455,7 +455,7 @@ float32_t test_vminvq_f32(float32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vmaxnmvq_f32 -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> [[A]]) // CHECK-NEXT: ret float [[VMAXNMVQ_F32_I]] @@ -465,7 +465,7 @@ float32_t test_vmaxnmvq_f32(float32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vminnmvq_f32 -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> [[A]]) // CHECK-NEXT: ret float [[VMINNMVQ_F32_I]] From b561124f927a5f54fd0ee78bb06be73e60b56b6d Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Tue, 28 Oct 2025 08:50:51 -0500 Subject: [PATCH 018/539] [test][DebugInfo] Fix location of test build artifacts (#165349) The test added in #161067 writes artifacts to the current dir, i.e. `test.o` / `test.dwo` / `test.dwp`, which might not be writeable. Tests should use `%t` for test artifact location, i.e. `%t.o` / `%t.dwo` / `%t.dwp` However, since `"test.dwo"` is part of the assembly source file used as a test input, and that's not something lit will substitute, that typical approach doesn't work. We can instead ensure the output is in a good location by running `cd %t` (after setting it up). 
--- .../test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s b/llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s index becd9d1b55693..519edf043be5d 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s +++ b/llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s @@ -1,6 +1,12 @@ ## This test uses TU index for type parsing in dwp and makes sure the DWARF4 type is ## successfully retrieved. +## cd to a unique dir so we can refer to the file as just "test.dwo" in the +## assembly test input below. +# RUN: rm -rf %t +# RUN: mkdir %t +# RUN: cd %t + # RUN: llvm-mc %s --split-dwarf-file=test.dwo -filetype obj -triple x86_64 -o test.o # RUN: llvm-dwp -e test.o -o test.dwp # RUN: llvm-dwarfdump test.dwp | FileCheck %s From 5e66db0811fe282ff1fd33160b37ebccae8fd7dd Mon Sep 17 00:00:00 2001 From: Kunqiu Chen Date: Tue, 28 Oct 2025 22:00:54 +0800 Subject: [PATCH 019/539] [UTC] Indent switch cases (#165212) LLVM prints switch cases indented by 2 additional spaces, as follows: ```LLVM switch i32 %x, label %default [ i32 0, label %phi i32 1, label %phi ] ``` Since this only changes the output IR of update_test_checks.py and does not change the logic of the File Check Pattern, there seems to be no need to update the existing test cases. 
--- .../update_test_checks/Inputs/switch_case.ll | 54 +++++++++ .../Inputs/switch_case.ll.expected | 106 ++++++++++++++++++ .../update_test_checks/switch_case.test | 3 + llvm/utils/UpdateTestChecks/common.py | 2 + llvm/utils/update_test_checks.py | 12 +- 5 files changed, 175 insertions(+), 2 deletions(-) create mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll create mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected create mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/switch_case.test diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll new file mode 100644 index 0000000000000..a804225a380c8 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll @@ -0,0 +1,54 @@ +; RUN: opt < %s -S | FileCheck %s + +; Test whether the UTC format the switch-cases correctly, which requires TWO extra spaces. 
+ +define i8 @testi8(i8 %x) { + switch i8 %x, label %default [ + i8 0, label %case1 + i8 1, label %case2 + i8 2, label %case3 + i8 3, label %case3 + ] +default: + ret i8 0 +case1: + ret i8 1 +case2: + ret i8 2 +case3: + ret i8 3 +} + +define i32 @testi32(i32 %x) { + switch i32 %x, label %default [ + i32 0, label %case1 + i32 1, label %case2 + i32 2, label %case3 + i32 3, label %case3 + ] +default: + ret i32 0 +case1: + ret i32 1 +case2: + ret i32 2 +case3: + ret i32 3 +} + +define i128 @testi128(i128 %x) { + switch i128 %x, label %default [ + i128 0, label %case1 + i128 1, label %case2 + i128 2, label %case3 + i128 3, label %case3 + ] +default: + ret i128 0 +case1: + ret i128 1 +case2: + ret i128 2 +case3: + ret i128 3 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected new file mode 100644 index 0000000000000..b1977e7ae2ee2 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected @@ -0,0 +1,106 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 7 +; RUN: opt < %s -S | FileCheck %s + +; Test whether the UTC format the switch-cases correctly, which requires TWO extra spaces. 
+ +define i8 @testi8(i8 %x) { +; CHECK-LABEL: define i8 @testi8( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: switch i8 [[X]], label %[[DEFAULT:.*]] [ +; CHECK-NEXT: i8 0, label %[[CASE1:.*]] +; CHECK-NEXT: i8 1, label %[[CASE2:.*]] +; CHECK-NEXT: i8 2, label %[[CASE3:.*]] +; CHECK-NEXT: i8 3, label %[[CASE3]] +; CHECK-NEXT: ] +; CHECK: [[DEFAULT]]: +; CHECK-NEXT: ret i8 0 +; CHECK: [[CASE1]]: +; CHECK-NEXT: ret i8 1 +; CHECK: [[CASE2]]: +; CHECK-NEXT: ret i8 2 +; CHECK: [[CASE3]]: +; CHECK-NEXT: ret i8 3 +; + switch i8 %x, label %default [ + i8 0, label %case1 + i8 1, label %case2 + i8 2, label %case3 + i8 3, label %case3 + ] +default: + ret i8 0 +case1: + ret i8 1 +case2: + ret i8 2 +case3: + ret i8 3 +} + +define i32 @testi32(i32 %x) { +; CHECK-LABEL: define i32 @testi32( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: switch i32 [[X]], label %[[DEFAULT:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE1:.*]] +; CHECK-NEXT: i32 1, label %[[CASE2:.*]] +; CHECK-NEXT: i32 2, label %[[CASE3:.*]] +; CHECK-NEXT: i32 3, label %[[CASE3]] +; CHECK-NEXT: ] +; CHECK: [[DEFAULT]]: +; CHECK-NEXT: ret i32 0 +; CHECK: [[CASE1]]: +; CHECK-NEXT: ret i32 1 +; CHECK: [[CASE2]]: +; CHECK-NEXT: ret i32 2 +; CHECK: [[CASE3]]: +; CHECK-NEXT: ret i32 3 +; + switch i32 %x, label %default [ + i32 0, label %case1 + i32 1, label %case2 + i32 2, label %case3 + i32 3, label %case3 + ] +default: + ret i32 0 +case1: + ret i32 1 +case2: + ret i32 2 +case3: + ret i32 3 +} + +define i128 @testi128(i128 %x) { +; CHECK-LABEL: define i128 @testi128( +; CHECK-SAME: i128 [[X:%.*]]) { +; CHECK-NEXT: switch i128 [[X]], label %[[DEFAULT:.*]] [ +; CHECK-NEXT: i128 0, label %[[CASE1:.*]] +; CHECK-NEXT: i128 1, label %[[CASE2:.*]] +; CHECK-NEXT: i128 2, label %[[CASE3:.*]] +; CHECK-NEXT: i128 3, label %[[CASE3]] +; CHECK-NEXT: ] +; CHECK: [[DEFAULT]]: +; CHECK-NEXT: ret i128 0 +; CHECK: [[CASE1]]: +; CHECK-NEXT: ret i128 1 +; CHECK: [[CASE2]]: +; CHECK-NEXT: ret i128 2 +; CHECK: [[CASE3]]: +; CHECK-NEXT: ret i128 3 
+; + switch i128 %x, label %default [ + i128 0, label %case1 + i128 1, label %case2 + i128 2, label %case3 + i128 3, label %case3 + ] +default: + ret i128 0 +case1: + ret i128 1 +case2: + ret i128 2 +case3: + ret i128 3 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/switch_case.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/switch_case.test new file mode 100644 index 0000000000000..891dbe06bbf59 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/switch_case.test @@ -0,0 +1,3 @@ +## switch_case test checking that update_test_checks.py works correctly +# RUN: cp -f %S/Inputs/switch_case.ll %t.ll && %update_test_checks %t.ll --version 7 +# RUN: diff -u %t.ll %S/Inputs/switch_case.ll.expected diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index a5e3c39bfdecd..8cd200c93a482 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -29,6 +29,7 @@ 'none' and 'all'. 'smart' is the default. 5: Basic block labels are matched by FileCheck expressions 6: The semantics of TBAA checks has been incorporated in the check lines. +7: Indent switch-cases correctly. """ DEFAULT_VERSION = 6 @@ -606,6 +607,7 @@ def invoke_tool(exe, cmd_args, ir, preprocess_cmd=None, verbose=False): DEBUG_ONLY_ARG_RE = re.compile(r"-debug-only[= ]([^ ]+)") IS_DEBUG_RECORD_RE = re.compile(r"^(\s+)#dbg_") +IS_SWITCH_CASE_RE = re.compile(r"^\s+i\d+ \d+, label %\w+") SCRUB_LEADING_WHITESPACE_RE = re.compile(r"^(\s+)") SCRUB_WHITESPACE_RE = re.compile(r"(?!^(| \w))[ \t]+", flags=re.M) diff --git a/llvm/utils/update_test_checks.py b/llvm/utils/update_test_checks.py index 3b562fbc54f78..42227b20fca76 100755 --- a/llvm/utils/update_test_checks.py +++ b/llvm/utils/update_test_checks.py @@ -260,9 +260,17 @@ def update_test(ti: common.TestInfo): skip_same_checks=dropped_previous_line, ): # This input line of the function body will go as-is into the output. 
- # Except make leading whitespace uniform: 2 spaces. 4 for debug records. + # Except make leading whitespace uniform: 2 spaces. 4 for debug records/switch cases. indent = ( - " " if not common.IS_DEBUG_RECORD_RE.match(input_line) else " " + " " * 4 + if ( + common.IS_DEBUG_RECORD_RE.match(input_line) + or ( + ti.args.version > 6 + and common.IS_SWITCH_CASE_RE.match(input_line) + ) + ) + else " " * 2 ) input_line = common.SCRUB_LEADING_WHITESPACE_RE.sub(indent, input_line) output_lines.append(input_line) From e7fa94ad57846c9d4f683a21c14a2aaad5fa4b8a Mon Sep 17 00:00:00 2001 From: Aleksei Nurmukhametov Date: Tue, 28 Oct 2025 14:01:15 +0000 Subject: [PATCH 020/539] [mlir][complex] Fix exp accuracy (#164952) This ports openxla/stablehlo#2682 implementation by @pearu. Three tests were added to `Integration/Dialect/Complex/CPU/correctness.mlir`. I also verified accuracy using XLA's complex_unary_op_test and its MLIR emitters. --- .../ComplexToStandard/ComplexToStandard.cpp | 54 ++++++++++++++----- .../convert-to-standard.mlir | 40 +++++++++++--- .../Dialect/Complex/CPU/correctness.mlir | 32 +++++++++++ 3 files changed, 107 insertions(+), 19 deletions(-) diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp index 0fe72394b61d6..9e46b7d78baca 100644 --- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp +++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp @@ -313,25 +313,53 @@ struct DivOpConversion : public OpConversionPattern { struct ExpOpConversion : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; + // exp(x+I*y) = exp(x)*(cos(y)+I*sin(y)) + // Handle special cases as StableHLO implementation does: + // 1. When b == 0, set imag(exp(z)) = 0 + // 2. 
When exp(x) == inf, use exp(x/2)*(cos(y)+I*sin(y))*exp(x/2) LogicalResult matchAndRewrite(complex::ExpOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto loc = op.getLoc(); auto type = cast(adaptor.getComplex().getType()); - auto elementType = cast(type.getElementType()); - arith::FastMathFlagsAttr fmf = op.getFastMathFlagsAttr(); - - Value real = - complex::ReOp::create(rewriter, loc, elementType, adaptor.getComplex()); - Value imag = - complex::ImOp::create(rewriter, loc, elementType, adaptor.getComplex()); - Value expReal = math::ExpOp::create(rewriter, loc, real, fmf.getValue()); - Value cosImag = math::CosOp::create(rewriter, loc, imag, fmf.getValue()); + auto ET = cast(type.getElementType()); + arith::FastMathFlags fmf = op.getFastMathFlagsAttr().getValue(); + const auto &floatSemantics = ET.getFloatSemantics(); + ImplicitLocOpBuilder b(loc, rewriter); + + Value x = complex::ReOp::create(b, ET, adaptor.getComplex()); + Value y = complex::ImOp::create(b, ET, adaptor.getComplex()); + Value zero = arith::ConstantOp::create(b, ET, b.getZeroAttr(ET)); + Value half = arith::ConstantOp::create(b, ET, b.getFloatAttr(ET, 0.5)); + Value inf = arith::ConstantOp::create( + b, ET, b.getFloatAttr(ET, APFloat::getInf(floatSemantics))); + + Value exp = math::ExpOp::create(b, x, fmf); + Value xHalf = arith::MulFOp::create(b, x, half, fmf); + Value expHalf = math::ExpOp::create(b, xHalf, fmf); + Value cos = math::CosOp::create(b, y, fmf); + Value sin = math::SinOp::create(b, y, fmf); + + Value expIsInf = + arith::CmpFOp::create(b, arith::CmpFPredicate::OEQ, exp, inf, fmf); + Value yIsZero = + arith::CmpFOp::create(b, arith::CmpFPredicate::OEQ, y, zero); + + // Real path: select between exp(x)*cos(y) and exp(x/2)*cos(y)*exp(x/2) + Value realNormal = arith::MulFOp::create(b, exp, cos, fmf); + Value expHalfCos = arith::MulFOp::create(b, expHalf, cos, fmf); + Value realOverflow = arith::MulFOp::create(b, expHalfCos, expHalf, fmf); Value resultReal 
= - arith::MulFOp::create(rewriter, loc, expReal, cosImag, fmf.getValue()); - Value sinImag = math::SinOp::create(rewriter, loc, imag, fmf.getValue()); - Value resultImag = - arith::MulFOp::create(rewriter, loc, expReal, sinImag, fmf.getValue()); + arith::SelectOp::create(b, expIsInf, realOverflow, realNormal); + + // Imaginary part: if y == 0 return 0 else select between exp(x)*sin(y) and + // exp(x/2)*sin(y)*exp(x/2) + Value imagNormal = arith::MulFOp::create(b, exp, sin, fmf); + Value expHalfSin = arith::MulFOp::create(b, expHalf, sin, fmf); + Value imagOverflow = arith::MulFOp::create(b, expHalfSin, expHalf, fmf); + Value imagNonZero = + arith::SelectOp::create(b, expIsInf, imagOverflow, imagNormal); + Value resultImag = arith::SelectOp::create(b, yIsZero, zero, imagNonZero); rewriter.replaceOpWithNewOp(op, type, resultReal, resultImag); diff --git a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir index dec62f92c7b2e..7a82236b0656e 100644 --- a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir +++ b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir @@ -211,11 +211,25 @@ func.func @complex_exp(%arg: complex) -> complex { } // CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex // CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex -// CHECK-DAG: %[[COS_IMAG:.*]] = math.cos %[[IMAG]] : f32 +// CHECK-DAG: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-DAG: %[[HALF:.*]] = arith.constant 5.000000e-01 : f32 +// CHECK-DAG: %[[INF:.*]] = arith.constant 0x7F800000 : f32 // CHECK-DAG: %[[EXP_REAL:.*]] = math.exp %[[REAL]] : f32 -// CHECK-DAG: %[[RESULT_REAL:.]] = arith.mulf %[[EXP_REAL]], %[[COS_IMAG]] : f32 +// CHECK-DAG: %[[REAL_HALF:.*]] = arith.mulf %[[REAL]], %[[HALF]] : f32 +// CHECK-DAG: %[[EXP_HALF:.*]] = math.exp %[[REAL_HALF]] : f32 +// CHECK-DAG: %[[COS_IMAG:.*]] = math.cos %[[IMAG]] : f32 // CHECK-DAG: %[[SIN_IMAG:.*]] = math.sin %[[IMAG]] 
: f32 -// CHECK-DAG: %[[RESULT_IMAG:.*]] = arith.mulf %[[EXP_REAL]], %[[SIN_IMAG]] : f32 +// CHECK-DAG: %[[IS_INF:.*]] = arith.cmpf oeq, %[[EXP_REAL]], %[[INF]] : f32 +// CHECK-DAG: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32 +// CHECK-DAG: %[[REAL_NORMAL:.*]] = arith.mulf %[[EXP_REAL]], %[[COS_IMAG]] : f32 +// CHECK-DAG: %[[EXP_HALF_COS:.*]] = arith.mulf %[[EXP_HALF]], %[[COS_IMAG]] : f32 +// CHECK-DAG: %[[REAL_OVERFLOW:.*]] = arith.mulf %[[EXP_HALF_COS]], %[[EXP_HALF]] : f32 +// CHECK: %[[RESULT_REAL:.*]] = arith.select %[[IS_INF]], %[[REAL_OVERFLOW]], %[[REAL_NORMAL]] : f32 +// CHECK-DAG: %[[IMAG_NORMAL:.*]] = arith.mulf %[[EXP_REAL]], %[[SIN_IMAG]] : f32 +// CHECK-DAG: %[[EXP_HALF_SIN:.*]] = arith.mulf %[[EXP_HALF]], %[[SIN_IMAG]] : f32 +// CHECK-DAG: %[[IMAG_OVERFLOW:.*]] = arith.mulf %[[EXP_HALF_SIN]], %[[EXP_HALF]] : f32 +// CHECK-DAG: %[[IMAG_NONZERO:.*]] = arith.select %[[IS_INF]], %[[IMAG_OVERFLOW]], %[[IMAG_NORMAL]] : f32 +// CHECK: %[[RESULT_IMAG:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[ZERO]], %[[IMAG_NONZERO]] : f32 // CHECK: %[[RESULT:.*]] = complex.create %[[RESULT_REAL]], %[[RESULT_IMAG]] : complex // CHECK: return %[[RESULT]] : complex @@ -832,11 +846,25 @@ func.func @complex_exp_with_fmf(%arg: complex) -> complex { } // CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex // CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex -// CHECK-DAG: %[[COS_IMAG:.*]] = math.cos %[[IMAG]] fastmath : f32 +// CHECK-DAG: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-DAG: %[[HALF:.*]] = arith.constant 5.000000e-01 : f32 +// CHECK-DAG: %[[INF:.*]] = arith.constant 0x7F800000 : f32 // CHECK-DAG: %[[EXP_REAL:.*]] = math.exp %[[REAL]] fastmath : f32 -// CHECK-DAG: %[[RESULT_REAL:.]] = arith.mulf %[[EXP_REAL]], %[[COS_IMAG]] fastmath : f32 +// CHECK-DAG: %[[REAL_HALF:.*]] = arith.mulf %[[REAL]], %[[HALF]] fastmath : f32 +// CHECK-DAG: %[[EXP_HALF:.*]] = math.exp %[[REAL_HALF]] fastmath : f32 +// CHECK-DAG: %[[COS_IMAG:.*]] = 
math.cos %[[IMAG]] fastmath : f32 // CHECK-DAG: %[[SIN_IMAG:.*]] = math.sin %[[IMAG]] fastmath : f32 -// CHECK-DAG: %[[RESULT_IMAG:.*]] = arith.mulf %[[EXP_REAL]], %[[SIN_IMAG]] fastmath : f32 +// CHECK-DAG: %[[IS_INF:.*]] = arith.cmpf oeq, %[[EXP_REAL]], %[[INF]] fastmath : f32 +// CHECK-DAG: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32 +// CHECK-DAG: %[[REAL_NORMAL:.*]] = arith.mulf %[[EXP_REAL]], %[[COS_IMAG]] fastmath : f32 +// CHECK-DAG: %[[EXP_HALF_COS:.*]] = arith.mulf %[[EXP_HALF]], %[[COS_IMAG]] fastmath : f32 +// CHECK-DAG: %[[REAL_OVERFLOW:.*]] = arith.mulf %[[EXP_HALF_COS]], %[[EXP_HALF]] fastmath : f32 +// CHECK: %[[RESULT_REAL:.*]] = arith.select %[[IS_INF]], %[[REAL_OVERFLOW]], %[[REAL_NORMAL]] : f32 +// CHECK-DAG: %[[IMAG_NORMAL:.*]] = arith.mulf %[[EXP_REAL]], %[[SIN_IMAG]] fastmath : f32 +// CHECK-DAG: %[[EXP_HALF_SIN:.*]] = arith.mulf %[[EXP_HALF]], %[[SIN_IMAG]] fastmath : f32 +// CHECK-DAG: %[[IMAG_OVERFLOW:.*]] = arith.mulf %[[EXP_HALF_SIN]], %[[EXP_HALF]] fastmath : f32 +// CHECK-DAG: %[[IMAG_NONZERO:.*]] = arith.select %[[IS_INF]], %[[IMAG_OVERFLOW]], %[[IMAG_NORMAL]] : f32 +// CHECK: %[[RESULT_IMAG:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[ZERO]], %[[IMAG_NONZERO]] : f32 // CHECK: %[[RESULT:.*]] = complex.create %[[RESULT_REAL]], %[[RESULT_IMAG]] : complex // CHECK: return %[[RESULT]] : complex diff --git a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir index 1bcef0a0df316..ea587e92674d7 100644 --- a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir +++ b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir @@ -49,6 +49,11 @@ func.func @conj(%arg: complex) -> complex { func.return %conj : complex } +func.func @exp(%arg: complex) -> complex { + %exp = complex.exp %arg : complex + func.return %exp : complex +} + // %input contains pairs of lhs, rhs, i.e. [lhs_0, rhs_0, lhs_1, rhs_1,...] 
func.func @test_binary(%input: tensor>, %func: (complex, complex) -> complex) { @@ -353,5 +358,32 @@ func.func @entry() { call @test_element_f64(%abs_test_cast, %abs_func) : (tensor>, (complex) -> f64) -> () + // complex.exp test + %exp_test = arith.constant dense<[ + (1.0, 2.0), + // CHECK: -1.1312 + // CHECK-NEXT: 2.4717 + + // The first case to consider is overflow of exp(real_part). If computed + // directly, this yields inf * 0 = NaN, which is incorrect. + (500.0, 0.0), + // CHECK-NEXT: inf + // CHECK-NOT: nan + // CHECK-NEXT: 0 + + // In this case, the overflow of exp(real_part) is compensated when + // sin(imag_part) is close to zero, yielding a finite imaginary part. + (90.0238094, 5.900613e-39) + // CHECK-NEXT: inf + // CHECK-NOT: inf + // CHECK-NEXT: 7.3746 + ]> : tensor<3xcomplex> + %exp_test_cast = tensor.cast %exp_test + : tensor<3xcomplex> to tensor> + + %exp_func = func.constant @exp : (complex) -> complex + call @test_unary(%exp_test_cast, %exp_func) + : (tensor>, (complex) -> complex) -> () + func.return } From 5c922c2838be94fb3fc7463400c512c74ea96655 Mon Sep 17 00:00:00 2001 From: Fateme Hosseini Date: Tue, 28 Oct 2025 09:20:59 -0500 Subject: [PATCH 021/539] Bug fixes for ISelLowering for HVX (#164416) 1. createHvxPrefixPred was computing an invalid byte count for small predicate types, leading to a crash during instruction selection. 2. HexagonTargetLowering::SplitHvxMemOp assumed the memory vector type is always simple. This patch adds a guard to avoid processing non-simple vector types, which can lead to failure. 
Patch By: Fateme Hosseini Co-authored-by: pavani karveti Co-authored-by: Sergei Larin Co-authored-by: Pavani Karveti --- .../Target/Hexagon/HexagonISelLoweringHVX.cpp | 29 +++++- .../CodeGen/Hexagon/inst_masked_store_bug1.ll | 94 +++++++++++++++++++ 2 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/Hexagon/inst_masked_store_bug1.ll diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 54c89721bc1f0..0573f64084d6f 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -1061,8 +1061,11 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl, SDValue W0 = isUndef(PredV) ? DAG.getUNDEF(MVT::i64) : DAG.getNode(HexagonISD::P2D, dl, MVT::i64, PredV); - Words[IdxW].push_back(HiHalf(W0, DAG)); - Words[IdxW].push_back(LoHalf(W0, DAG)); + if (Bytes < BitBytes) { + Words[IdxW].push_back(HiHalf(W0, DAG)); + Words[IdxW].push_back(LoHalf(W0, DAG)); + } else + Words[IdxW].push_back(W0); while (Bytes < BitBytes) { IdxW ^= 1; @@ -1083,7 +1086,26 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl, Bytes *= 2; } + while (Bytes > BitBytes) { + IdxW ^= 1; + Words[IdxW].clear(); + + if (Bytes <= 4) { + for (const SDValue &W : Words[IdxW ^ 1]) { + SDValue T = contractPredicate(W, dl, DAG); + Words[IdxW].push_back(T); + } + } else { + for (const SDValue &W : Words[IdxW ^ 1]) { + Words[IdxW].push_back(W); + } + } + Bytes /= 2; + } + assert(Bytes == BitBytes); + if (BitBytes == 1 && PredTy == MVT::v2i1) + ByteTy = MVT::getVectorVT(MVT::i16, HwLen); SDValue Vec = ZeroFill ? 
getZero(dl, ByteTy, DAG) : DAG.getUNDEF(ByteTy); SDValue S4 = DAG.getConstant(HwLen-4, dl, MVT::i32); @@ -3157,6 +3179,9 @@ SDValue HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const { auto *MemN = cast(Op.getNode()); + if (!MemN->getMemoryVT().isSimple()) + return Op; + MVT MemTy = MemN->getMemoryVT().getSimpleVT(); if (!isHvxPairTy(MemTy)) return Op; diff --git a/llvm/test/CodeGen/Hexagon/inst_masked_store_bug1.ll b/llvm/test/CodeGen/Hexagon/inst_masked_store_bug1.ll new file mode 100644 index 0000000000000..fcf124699e8e7 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/inst_masked_store_bug1.ll @@ -0,0 +1,94 @@ +;; REQUIRES: asserts +;; RUN: llc --mtriple=hexagon -mattr=+hvxv79,+hvx-length128b %s -o - | FileCheck %s +;; Sanity check for lowering masked scatter without assertion errors. + +define void @outer_product(ptr %aptr, ptr %bptr, ptr %cptr, i32 %T, i32 %W) { +entry: + %W.ripple.bcast.splatinsert = insertelement <8 x i32> poison, i32 %W, i64 0 + %W.ripple.bcast.splat = shufflevector <8 x i32> %W.ripple.bcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer + %div1194 = lshr i32 %T, 3 + %cmp84.not = icmp ult i32 %T, 8 + br i1 %cmp84.not, label %for.end49, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %div10195 = lshr i32 %W, 3 + %cmp1782.not = icmp ult i32 %W, 8 + %arrayidx27.ripple.LS.dim.slope = mul <8 x i32> %W.ripple.bcast.splat, + %arrayidx27.ripple.LS.dim.slope.ripple.bcast = shufflevector <8 x i32> %arrayidx27.ripple.LS.dim.slope, <8 x i32> poison, <64 x i32> + %arrayidx27.ripple.LS.slope = add <64 x i32> %arrayidx27.ripple.LS.dim.slope.ripple.bcast, + %invariant.gep196 = getelementptr i8, ptr %cptr, <64 x i32> %arrayidx27.ripple.LS.slope + br label %for.body + +for.body: ; preds = %for.end, %for.body.preheader + %ripple.par.iv.085 = phi i32 [ %add48, %for.end ], [ 0, %for.body.preheader ] + %mul2 = shl i32 %ripple.par.iv.085, 3 + br i1 %cmp1782.not, label %for.end, label %for.body18.lr.ph + 
+for.body18.lr.ph: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw i8, ptr %aptr, i32 %mul2 + %mul25 = mul i32 %mul2, %W + %gep197 = getelementptr i8, <64 x ptr> %invariant.gep196, i32 %mul25 + br label %for.body18 + +for.body18: ; preds = %for.body18, %for.body18.lr.ph + %ripple.par.iv15.083 = phi i32 [ 0, %for.body18.lr.ph ], [ %add28, %for.body18 ] + %mul19 = shl i32 %ripple.par.iv15.083, 3 + %.ripple.LS.instance184 = load <8 x i8>, ptr %arrayidx, align 1 + %.ripple.LS.instance184.ripple.bcast = shufflevector <8 x i8> %.ripple.LS.instance184, <8 x i8> poison, <64 x i32> + %arrayidx21 = getelementptr inbounds nuw i8, ptr %bptr, i32 %mul19 + %.ripple.LS.instance = load <8 x i8>, ptr %arrayidx21, align 1 + %.ripple.LS.instance.ripple.bcast = shufflevector <8 x i8> %.ripple.LS.instance, <8 x i8> poison, <64 x i32> + %mul23.ripple.LS.instance = mul <64 x i8> %.ripple.LS.instance.ripple.bcast, %.ripple.LS.instance184.ripple.bcast + %gep = getelementptr i8, <64 x ptr> %gep197, i32 %mul19 + tail call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %mul23.ripple.LS.instance, <64 x ptr> %gep, i32 1, <64 x i1> splat (i1 true)) + %add28 = add nuw i32 %ripple.par.iv15.083, 1 + %cmp17 = icmp ult i32 %add28, %div10195 + br i1 %cmp17, label %for.body18, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body18 + %0 = shl i32 %add28, 3 + br label %for.end + +for.end: ; preds = %for.end.loopexit, %for.body + %ripple.par.iv15.0.lcssa = phi i32 [ 0, %for.body ], [ %0, %for.end.loopexit ] + %add30.ripple.bcast.splatinsert = insertelement <8 x i32> poison, i32 %ripple.par.iv15.0.lcssa, i64 0 + %add30.ripple.bcast.splat = shufflevector <8 x i32> %add30.ripple.bcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer + %add30.ripple.LS.instance = or disjoint <8 x i32> %add30.ripple.bcast.splat, + %cmp32.ripple.LS.instance = icmp ne i32 %ripple.par.iv15.0.lcssa, %W + %cmp32.ripple.LS.instance.ripple.bcast.splatinsert = insertelement <8 x i1> poison, i1 
%cmp32.ripple.LS.instance, i64 0 + %cmp32.ripple.LS.instance.ripple.bcast.splat = shufflevector <8 x i1> %cmp32.ripple.LS.instance.ripple.bcast.splatinsert, <8 x i1> poison, <8 x i32> zeroinitializer + %cmp33.ripple.vectorized = icmp ult <8 x i32> %add30.ripple.LS.instance, %W.ripple.bcast.splat + %or.cond.ripple.LS.instance = select <8 x i1> %cmp32.ripple.LS.instance.ripple.bcast.splat, <8 x i1> %cmp33.ripple.vectorized, <8 x i1> zeroinitializer + %or.cond.ripple.LS.instance.ripple.bcast = shufflevector <8 x i1> %or.cond.ripple.LS.instance, <8 x i1> poison, <64 x i32> + %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle = shufflevector <8 x i1> %or.cond.ripple.LS.instance, <8 x i1> , <8 x i32> + %or.cond.ripple.LS.instance.ripple.reducelog2.operator = or <8 x i1> %or.cond.ripple.LS.instance, %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle + %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle189 = shufflevector <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator, <8 x i1> , <8 x i32> + %or.cond.ripple.LS.instance.ripple.reducelog2.operator190 = or <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator, %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle189 + %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle191 = shufflevector <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator190, <8 x i1> poison, <8 x i32> + %or.cond.ripple.LS.instance.ripple.reducelog2.operator192 = or <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator190, %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle191 + %ripple.red.extract.ripple.bcast.splat = shufflevector <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator192, <8 x i1> poison, <8 x i32> zeroinitializer + %arrayidx34.ripple.branch.clone = getelementptr inbounds nuw i8, ptr %aptr, i32 %mul2 + %.ripple.LS.instance188.ripple.branch.clone.ripple.masked.load = tail call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %arrayidx34.ripple.branch.clone, i32 1, <8 x i1> 
%ripple.red.extract.ripple.bcast.splat, <8 x i8> poison) + %.ripple.LS.instance188.ripple.bcast.ripple.branch.clone = shufflevector <8 x i8> %.ripple.LS.instance188.ripple.branch.clone.ripple.masked.load, <8 x i8> poison, <64 x i32> + %arrayidx36.ripple.branch.clone = getelementptr inbounds nuw i8, ptr %bptr, i32 %ripple.par.iv15.0.lcssa + %.ripple.LS.instance187.ripple.branch.clone.ripple.masked.load = tail call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %arrayidx36.ripple.branch.clone, i32 1, <8 x i1> %or.cond.ripple.LS.instance, <8 x i8> poison) + %.ripple.LS.instance187.ripple.bcast.ripple.branch.clone = shufflevector <8 x i8> %.ripple.LS.instance187.ripple.branch.clone.ripple.masked.load, <8 x i8> poison, <64 x i32> + %mul38.ripple.LS.instance.ripple.branch.clone = mul <64 x i8> %.ripple.LS.instance187.ripple.bcast.ripple.branch.clone, %.ripple.LS.instance188.ripple.bcast.ripple.branch.clone + %mul40.ripple.branch.clone = mul i32 %mul2, %W + %1 = getelementptr i8, ptr %cptr, i32 %mul40.ripple.branch.clone + %arrayidx42.ripple.branch.clone = getelementptr i8, ptr %1, i32 %ripple.par.iv15.0.lcssa + %arrayidx42.ripple.LS.instance.ripple.branch.clone = getelementptr i8, ptr %arrayidx42.ripple.branch.clone, <64 x i32> %arrayidx27.ripple.LS.slope + tail call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %mul38.ripple.LS.instance.ripple.branch.clone, <64 x ptr> %arrayidx42.ripple.LS.instance.ripple.branch.clone, i32 1, <64 x i1> %or.cond.ripple.LS.instance.ripple.bcast) + %add48 = add nuw i32 %ripple.par.iv.085, 1 + %cmp = icmp ult i32 %add48, %div1194 + br i1 %cmp, label %for.body, label %for.end49 + +for.end49: ; preds = %for.end, %entry + ret void +} + +;; CHECK: outer_product +;; CHECK: {{r[0-9]+}} = lsr({{r[0-9]+}},#3) +;; CHECK: {{q[0-9]+}} = vand({{v[0-9]+}},{{r[0-9]+}}) +;; CHECK: {{v[0-9]+}} = vmux(q0,{{v[0-9]+}},{{v[0-9]+}}) +;; CHECK: vmem{{.*}} = {{v[0-9]+}} From 59392e2e099c0c3465e37858633ad290d3092410 Mon Sep 17 00:00:00 2001 From: Connector Switch Date: 
Tue, 28 Oct 2025 22:22:26 +0800 Subject: [PATCH 022/539] [libcxx] Optimize `rng::generate_n` for segmented iterators (#165280) Part of #102817. This patch optimizes `rng::generate_n` for segmented iterators by forwarding the implementation directly to `std::generate_n`. - before ``` rng::generate_n(deque)/32 21.7 ns 22.0 ns 32000000 rng::generate_n(deque)/50 30.8 ns 30.7 ns 22400000 rng::generate_n(deque)/1024 492 ns 488 ns 1120000 rng::generate_n(deque)/8192 3938 ns 3924 ns 179200 ``` - after ``` rng::generate_n(deque)/32 11.0 ns 11.0 ns 64000000 rng::generate_n(deque)/50 16.2 ns 16.1 ns 40727273 rng::generate_n(deque)/1024 292 ns 286 ns 2240000 rng::generate_n(deque)/8192 2291 ns 2302 ns 298667 ``` --- libcxx/docs/ReleaseNotes/22.rst | 5 +++-- libcxx/include/__algorithm/generate_n.h | 16 ++++++++++++++-- libcxx/include/__algorithm/ranges_generate_n.h | 8 ++------ 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst index 25d33a9c2eb50..980390c4fe3d7 100644 --- a/libcxx/docs/ReleaseNotes/22.rst +++ b/libcxx/docs/ReleaseNotes/22.rst @@ -76,8 +76,9 @@ Improvements and New Features - The ``std::{fill, fill_n}`` and ``std::ranges::{fill, fill_n}`` algorithms have been optimized for segmented iterators, resulting in a performance improvement of at least 10x for ``std::deque`` iterators and ``std::join_view>>`` iterators. -- The ``std::generate`` and ``std::generate_n`` algorithms have been optimized for segmented iterators, resulting in a - performance improvement for ``std::deque`` and ``std::join_view>>`` iterators. +- The ``std::{generate, generate_n}`` and ``std::ranges::generate_n`` algorithms have been optimized for segmented + iterators, resulting in a performance improvement for ``std::deque`` and + ``std::join_view>>`` iterators. 
Deprecations and Removals ------------------------- diff --git a/libcxx/include/__algorithm/generate_n.h b/libcxx/include/__algorithm/generate_n.h index e9da133f0570a..23899e49e0b65 100644 --- a/libcxx/include/__algorithm/generate_n.h +++ b/libcxx/include/__algorithm/generate_n.h @@ -13,22 +13,34 @@ #include <__config> #include <__functional/identity.h> #include <__utility/forward.h> +#include <__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator -generate_n(_OutputIterator __first, _Size __orig_n, _Generator __gen) { +__generate_n(_OutputIterator __first, _Size __orig_n, _Generator& __gen) { using __iter_ref = decltype(*__first); __identity __proj; auto __f = [&](__iter_ref __element) { std::forward<__iter_ref>(__element) = __gen(); }; - return std::__for_each_n(__first, __orig_n, __f, __proj); + return std::__for_each_n(std::move(__first), __orig_n, __f, __proj); +} + +template +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +generate_n(_OutputIterator __first, _Size __orig_n, _Generator __gen) { + return std::__generate_n(std::move(__first), __orig_n, __gen); } _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_GENERATE_N_H diff --git a/libcxx/include/__algorithm/ranges_generate_n.h b/libcxx/include/__algorithm/ranges_generate_n.h index a318994d0eaf8..0cc9ce7b1193b 100644 --- a/libcxx/include/__algorithm/ranges_generate_n.h +++ b/libcxx/include/__algorithm/ranges_generate_n.h @@ -9,6 +9,7 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_GENERATE_N_H #define _LIBCPP___ALGORITHM_RANGES_GENERATE_N_H +#include <__algorithm/generate_n.h> #include <__concepts/constructible.h> #include <__concepts/invocable.h> #include <__config> @@ -38,12 +39,7 @@ struct __generate_n { requires invocable<_Func&> && 
indirectly_writable<_OutIter, invoke_result_t<_Func&>> _LIBCPP_HIDE_FROM_ABI constexpr _OutIter operator()(_OutIter __first, iter_difference_t<_OutIter> __n, _Func __gen) const { - for (; __n > 0; --__n) { - *__first = __gen(); - ++__first; - } - - return __first; + return std::__generate_n(std::move(__first), __n, __gen); } }; From 1bddc63242e18913b1965a9aa4d142a71131aac1 Mon Sep 17 00:00:00 2001 From: Shimin Cui Date: Tue, 28 Oct 2025 10:24:32 -0400 Subject: [PATCH 023/539] [PPC] Set minimum of largest number of comparisons to use bit test for switch lowering (#155910) Currently it is considered suitable to lower to a bit test for a set of switch case clusters when the number of unique destinations (`NumDests`) and the number of total comparisons (`NumCmps`) satisfy: `(NumDests == 1 && NumCmps >= 3) || (NumDests == 2 && NumCmps >= 5) || (NumDests == 3 && NumCmps >= 6)` However it is found for some cases on powerpc, for example, when NumDests is 3, and the number of comparisons for each destination is all 2, it's not profitable to lower the switch to bit test. This is to add an option to set the minimum of largest number of comparisons to use bit test for switch lowering.
--------- Co-authored-by: Shimin Cui --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 11 +- llvm/include/llvm/CodeGen/TargetLowering.h | 29 ++- llvm/lib/CodeGen/SwitchLoweringUtils.cpp | 22 +-- llvm/lib/CodeGen/TargetLoweringBase.cpp | 16 ++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 8 + llvm/test/CodeGen/PowerPC/bittest.ll | 193 ++++++++++++++++++++ 6 files changed, 260 insertions(+), 19 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/bittest.ll diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 76b6c8ec68c72..e8dbc964a943e 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -594,12 +594,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Check if suitable for a bit test if (N <= DL.getIndexSizeInBits(0u)) { - SmallPtrSet Dests; - for (auto I : SI.cases()) - Dests.insert(I.getCaseSuccessor()); + DenseMap DestMap; + for (auto I : SI.cases()) { + const BasicBlock *BB = I.getCaseSuccessor(); + ++DestMap[BB]; + } - if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal, - DL)) + if (TLI->isSuitableForBitTests(DestMap, MinCaseVal, MaxCaseVal, DL)) return 1; } diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index d6ed3a8f739b3..4058dd728e5d1 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1433,9 +1433,9 @@ class LLVM_ABI TargetLoweringBase { /// \p High as its lowest and highest case values, and expects \p NumCmps /// case value comparisons. Check if the number of destinations, comparison /// metric, and range are all suitable. 
- bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, - const APInt &Low, const APInt &High, - const DataLayout &DL) const { + bool isSuitableForBitTests( + const DenseMap &DestCmps, + const APInt &Low, const APInt &High, const DataLayout &DL) const { // FIXME: I don't think NumCmps is the correct metric: a single case and a // range of cases both require only one branch to lower. Just looking at the // number of clusters and destinations should be enough to decide whether to @@ -1446,6 +1446,20 @@ if (!rangeFitsInWord(Low, High, DL)) return false; + unsigned NumDests = DestCmps.size(); + unsigned NumCmps = 0; + unsigned int MaxBitTestEntry = 0; + for (auto &DestCmp : DestCmps) { + NumCmps += DestCmp.second; + if (DestCmp.second > MaxBitTestEntry) + MaxBitTestEntry = DestCmp.second; + } + + // Comparisons might be cheaper for small number of comparisons, which can + // be Arch Target specific. + if (MaxBitTestEntry < getMinimumBitTestCmps()) + return false; + // Decide whether it's profitable to lower this range with bit tests. Each // destination requires a bit test and branch, and there is an overall range // check branch. For a small number of clusters, separate comparisons might @@ -2055,6 +2069,9 @@ virtual bool isJumpTableRelative() const; + /// Return the minimum of largest number of comparisons in BitTest. + unsigned getMinimumBitTestCmps() const; + /// If a physical register, this specifies the register that /// llvm.savestack/llvm.restorestack should save and restore. Register getStackPointerRegisterToSaveRestore() const { @@ -2577,6 +2594,9 @@ class LLVM_ABI TargetLoweringBase { /// Set to zero to generate unlimited jump tables. void setMaximumJumpTableSize(unsigned); + /// Set the minimum of the largest number of comparisons to generate BitTest.
+ void setMinimumBitTestCmps(unsigned Val); + /// If set to a physical register, this specifies the register that /// llvm.savestack/llvm.restorestack should save and restore. void setStackPointerRegisterToSaveRestore(Register R) { @@ -3719,6 +3739,9 @@ class LLVM_ABI TargetLoweringBase { /// backend supports. unsigned MinCmpXchgSizeInBits; + /// The minimum of largest number of comparisons to use bit test for switch. + unsigned MinimumBitTestCmps; + /// This indicates if the target supports unaligned atomic operations. bool SupportsUnalignedAtomics; diff --git a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp index 038c499fe236e..3fa8243c03423 100644 --- a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp +++ b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp @@ -198,7 +198,6 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters, assert(First <= Last); auto Prob = BranchProbability::getZero(); - unsigned NumCmps = 0; std::vector Table; DenseMap JTProbs; @@ -206,12 +205,16 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters, for (unsigned I = First; I <= Last; ++I) JTProbs[Clusters[I].MBB] = BranchProbability::getZero(); + DenseMap DestMap; for (unsigned I = First; I <= Last; ++I) { assert(Clusters[I].Kind == CC_Range); Prob += Clusters[I].Prob; const APInt &Low = Clusters[I].Low->getValue(); const APInt &High = Clusters[I].High->getValue(); - NumCmps += (Low == High) ? 1 : 2; + unsigned int NumCmp = (Low == High) ? 1 : 2; + const BasicBlock *BB = Clusters[I].MBB->getBasicBlock(); + DestMap[BB] += NumCmp; + if (I != First) { // Fill the gap between this and the previous cluster. 
const APInt &PreviousHigh = Clusters[I - 1].High->getValue(); @@ -226,9 +229,7 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters, JTProbs[Clusters[I].MBB] += Clusters[I].Prob; } - unsigned NumDests = JTProbs.size(); - if (TLI->isSuitableForBitTests(NumDests, NumCmps, - Clusters[First].Low->getValue(), + if (TLI->isSuitableForBitTests(DestMap, Clusters[First].Low->getValue(), Clusters[Last].High->getValue(), *DL)) { // Clusters[First..Last] should be lowered as bit tests instead. return false; @@ -372,20 +373,19 @@ bool SwitchCG::SwitchLowering::buildBitTests(CaseClusterVector &Clusters, if (First == Last) return false; - BitVector Dests(FuncInfo.MF->getNumBlockIDs()); - unsigned NumCmps = 0; + DenseMap DestMap; for (int64_t I = First; I <= Last; ++I) { assert(Clusters[I].Kind == CC_Range); - Dests.set(Clusters[I].MBB->getNumber()); - NumCmps += (Clusters[I].Low == Clusters[I].High) ? 1 : 2; + unsigned NumCmp = (Clusters[I].Low == Clusters[I].High) ? 1 : 2; + const BasicBlock *BB = Clusters[I].MBB->getBasicBlock(); + DestMap[BB] += NumCmp; } - unsigned NumDests = Dests.count(); APInt Low = Clusters[First].Low->getValue(); APInt High = Clusters[Last].High->getValue(); assert(Low.slt(High)); - if (!TLI->isSuitableForBitTests(NumDests, NumCmps, Low, High, *DL)) + if (!TLI->isSuitableForBitTests(DestMap, Low, High, *DL)) return false; APInt LowBound; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 59798b3cf201a..f3631fab885df 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" @@ -90,6 +91,11 @@ static cl::opt OptsizeJumpTableDensity( cl::desc("Minimum density for building a jump 
table in " "an optsize function")); +static cl::opt MinimumBitTestCmpsOverride( + "min-bit-test-cmps", cl::init(2), cl::Hidden, + cl::desc("Set minimum of largest number of comparisons " + "to use bit test for switch.")); + // FIXME: This option is only to test if the strict fp operation processed // correctly by preventing mutating strict fp operation to normal fp operation // during development. When the backend supports strict float operation, this @@ -719,6 +725,8 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) MinCmpXchgSizeInBits = 0; SupportsUnalignedAtomics = false; + + MinimumBitTestCmps = MinimumBitTestCmpsOverride; } // Define the virtual destructor out-of-line to act as a key method to anchor @@ -2129,6 +2137,14 @@ bool TargetLoweringBase::isJumpTableRelative() const { return getTargetMachine().isPositionIndependent(); } +unsigned TargetLoweringBase::getMinimumBitTestCmps() const { + return MinimumBitTestCmps; +} + +void TargetLoweringBase::setMinimumBitTestCmps(unsigned Val) { + MinimumBitTestCmps = Val; +} + Align TargetLoweringBase::getPrefLoopAlignment(MachineLoop *ML) const { if (TM.Options.LoopAlignment) return Align(TM.Options.LoopAlignment); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 17f04d0fd05e8..20fc849ea4aa5 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -138,6 +138,11 @@ static cl::opt PPCMinimumJumpTableEntries( "ppc-min-jump-table-entries", cl::init(64), cl::Hidden, cl::desc("Set minimum number of entries to use a jump table on PPC")); +static cl::opt PPCMinimumBitTestCmps( + "ppc-min-bit-test-cmps", cl::init(3), cl::Hidden, + cl::desc("Set minimum of largest number of comparisons to use bit test for " + "switch on PPC.")); + static cl::opt PPCGatherAllAliasesMaxDepth( "ppc-gather-alias-max-depth", cl::init(18), cl::Hidden, cl::desc("max depth when checking alias info in GatherAllAliases()")); 
@@ -1436,6 +1441,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // Re-evaluate this value on future HWs that can do better with mtctr. setMinimumJumpTableEntries(PPCMinimumJumpTableEntries); + // The default minimum of largest number in a BitTest cluster is 3. + setMinimumBitTestCmps(PPCMinimumBitTestCmps); + setMinFunctionAlignment(Align(4)); setMinCmpXchgSizeInBits(Subtarget.hasPartwordAtomics() ? 8 : 32); diff --git a/llvm/test/CodeGen/PowerPC/bittest.ll b/llvm/test/CodeGen/PowerPC/bittest.ll new file mode 100644 index 0000000000000..cba56e3d5798f --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/bittest.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -verify-machineinstrs < %s -O3 -mcpu=ppc -mtriple powerpc-ibm-aix \ +; RUN: -ppc-asm-full-reg-names | FileCheck %s + +define i32 @foo(i32 noundef signext %x) { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stwu r1, -64(r1) +; CHECK-NEXT: stw r0, 72(r1) +; CHECK-NEXT: cmpwi r3, 8 +; CHECK-NEXT: stw r31, 60(r1) # 4-byte Folded Spill +; CHECK-NEXT: mr r31, r3 +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: ble cr0, L..BB0_4 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: cmpwi r31, 11 +; CHECK-NEXT: bge cr0, L..BB0_7 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: cmplwi r31, 9 +; CHECK-NEXT: beq cr0, L..BB0_9 +; CHECK-NEXT: # %bb.3: # %entry +; CHECK-NEXT: cmplwi r31, 10 +; CHECK-NEXT: beq cr0, L..BB0_11 +; CHECK-NEXT: b L..BB0_13 +; CHECK-NEXT: L..BB0_4: # %entry +; CHECK-NEXT: cmplwi r31, 4 +; CHECK-NEXT: beq cr0, L..BB0_12 +; CHECK-NEXT: # %bb.5: # %entry +; CHECK-NEXT: cmplwi r31, 7 +; CHECK-NEXT: beq cr0, L..BB0_11 +; CHECK-NEXT: # %bb.6: # %entry +; CHECK-NEXT: cmplwi r31, 8 +; CHECK-NEXT: beq cr0, L..BB0_10 +; CHECK-NEXT: b L..BB0_13 +; CHECK-NEXT: L..BB0_7: # %entry +; CHECK-NEXT: beq cr0, L..BB0_10 +; CHECK-NEXT: # %bb.8: # %entry +; CHECK-NEXT: cmplwi r31, 12 +; 
CHECK-NEXT: bne cr0, L..BB0_13 +; CHECK-NEXT: L..BB0_9: # %sw.bb2 +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: bl .foo3[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: b L..BB0_13 +; CHECK-NEXT: L..BB0_10: # %sw.bb1 +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: bl .foo2[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: b L..BB0_13 +; CHECK-NEXT: L..BB0_11: # %sw.bb +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: bl .foo1[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: b L..BB0_13 +; CHECK-NEXT: L..BB0_12: # %sw.bb3 +; CHECK-NEXT: li r3, 4 +; CHECK-NEXT: bl .foo4[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: li r3, 4 +; CHECK-NEXT: L..BB0_13: # %return +; CHECK-NEXT: lwz r31, 60(r1) # 4-byte Folded Reload +; CHECK-NEXT: addi r1, r1, 64 +; CHECK-NEXT: lwz r0, 8(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +entry: + switch i32 %x, label %return [ + i32 7, label %sw.bb + i32 10, label %sw.bb + i32 8, label %sw.bb1 + i32 11, label %sw.bb1 + i32 9, label %sw.bb2 + i32 12, label %sw.bb2 + i32 4, label %sw.bb3 + ] + +sw.bb: ; preds = %entry, %entry + tail call void @foo1(i32 noundef signext %x) + br label %return + +sw.bb1: ; preds = %entry, %entry + tail call void @foo2(i32 noundef signext %x) + br label %return + +sw.bb2: ; preds = %entry, %entry + tail call void @foo3(i32 noundef signext %x) + br label %return + +sw.bb3: ; preds = %entry + tail call void @foo4(i32 noundef signext 4) + br label %return + +return: ; preds = %sw.bb, %sw.bb1, %sw.bb2, %sw.bb3, %entry + %retval.0 = phi i32 [ 0, %entry ], [ 4, %sw.bb3 ], [ %x, %sw.bb2 ], [ %x, %sw.bb1 ], [ %x, %sw.bb ] + ret i32 %retval.0 +} + +define i32 @goo(i32 noundef signext %x) { +; CHECK-LABEL: goo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stwu r1, -64(r1) +; CHECK-NEXT: stw r0, 72(r1) +; CHECK-NEXT: cmplwi r3, 12 +; CHECK-NEXT: stw r31, 60(r1) # 4-byte Folded Spill +; CHECK-NEXT: mr r31, r3 +; CHECK-NEXT: bgt cr0, L..BB1_7 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: 
li r3, 1 +; CHECK-NEXT: slw r3, r3, r31 +; CHECK-NEXT: andi. r4, r3, 5632 +; CHECK-NEXT: bne cr0, L..BB1_4 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: andi. r3, r3, 2304 +; CHECK-NEXT: beq cr0, L..BB1_5 +; CHECK-NEXT: # %bb.3: # %sw.bb1 +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: bl .foo2[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: b L..BB1_9 +; CHECK-NEXT: L..BB1_4: # %sw.bb2 +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: bl .foo3[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: b L..BB1_9 +; CHECK-NEXT: L..BB1_5: # %entry +; CHECK-NEXT: cmplwi r31, 7 +; CHECK-NEXT: bne cr0, L..BB1_7 +; CHECK-NEXT: # %bb.6: # %sw.bb +; CHECK-NEXT: li r3, 7 +; CHECK-NEXT: li r31, 7 +; CHECK-NEXT: bl .foo1[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: b L..BB1_9 +; CHECK-NEXT: L..BB1_7: # %entry +; CHECK-NEXT: cmplwi r31, 4 +; CHECK-NEXT: li r31, 0 +; CHECK-NEXT: bne cr0, L..BB1_9 +; CHECK-NEXT: # %bb.8: # %sw.bb3 +; CHECK-NEXT: li r3, 4 +; CHECK-NEXT: li r31, 4 +; CHECK-NEXT: bl .foo4[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: L..BB1_9: # %return +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: lwz r31, 60(r1) # 4-byte Folded Reload +; CHECK-NEXT: addi r1, r1, 64 +; CHECK-NEXT: lwz r0, 8(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +entry: + switch i32 %x, label %return [ + i32 7, label %sw.bb + i32 8, label %sw.bb1 + i32 11, label %sw.bb1 + i32 9, label %sw.bb2 + i32 10, label %sw.bb2 + i32 12, label %sw.bb2 + i32 4, label %sw.bb3 + ] + +sw.bb: ; preds = %entry + tail call void @foo1(i32 noundef signext 7) + br label %return + +sw.bb1: ; preds = %entry, %entry + tail call void @foo2(i32 noundef signext %x) + br label %return + +sw.bb2: ; preds = %entry, %entry, %entry + tail call void @foo3(i32 noundef signext %x) + br label %return + +sw.bb3: ; preds = %entry + tail call void @foo4(i32 noundef signext 4) + br label %return + +return: ; preds = %sw.bb, %sw.bb1, %sw.bb2, %sw.bb3, %entry + %retval.0 = phi i32 [ 0, %entry ], [ 4, %sw.bb3 ], [ %x, %sw.bb2 ], [ %x, %sw.bb1 ], [ 7, %sw.bb ] + ret i32 %retval.0 +} + +declare 
void @foo1(i32 noundef signext) + +declare void @foo2(i32 noundef signext) + +declare void @foo3(i32 noundef signext) + +declare void @foo4(i32 noundef signext) From 1db4d8bf62e25dffb5539fb26f67e95142d93b1b Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Tue, 28 Oct 2025 07:46:48 -0700 Subject: [PATCH 024/539] Add switch_case.test to profcheck-xfail.txt (#165407) --- llvm/utils/profcheck-xfail.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index 3d07b16cac661..aef7c0987fda7 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -550,6 +550,7 @@ tools/UpdateTestChecks/update_test_checks/stable_ir_values5.test tools/UpdateTestChecks/update_test_checks/stable_ir_values6.test tools/UpdateTestChecks/update_test_checks/stable_ir_values_funcs.test tools/UpdateTestChecks/update_test_checks/stable_ir_values.test +tools/UpdateTestChecks/update_test_checks/switch_case.test tools/UpdateTestChecks/update_test_checks/tbaa-semantics-checks.test tools/UpdateTestChecks/update_test_checks/various_ir_values_dbgrecords.test Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll From 7db225b597e38b8ee18639e2a6ed09d8350f87bd Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Tue, 28 Oct 2025 14:53:08 +0000 Subject: [PATCH 025/539] [AArch64][SME] Disable tail calls for callees that require saving ZT0 (#165371) We may need to load ZT0 after the call, so we can't perform a tail call. 
--- .../lib/Target/AArch64/AArch64ISelLowering.cpp | 3 ++- llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d16b11686e3c1..60aa61e993b26 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9028,11 +9028,12 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( CallingConv::ID CallerCC = CallerF.getCallingConv(); // SME Streaming functions are not eligible for TCO as they may require - // the streaming mode or ZA to be restored after returning from the call. + // the streaming mode or ZA/ZT0 to be restored after returning from the call. SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI); if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState() || + CallAttrs.requiresPreservingZT0() || CallAttrs.caller().hasStreamingBody()) return false; diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 2583a93e514a2..5b81f5dafe421 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -426,3 +426,21 @@ define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwin call void %callee() ret void } + +define void @disable_tailcallopt(ptr %callee) "aarch64_inout_zt0" nounwind { +; CHECK-COMMON-LABEL: disable_tailcallopt: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: sub sp, sp, #80 +; CHECK-COMMON-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x19, sp +; CHECK-COMMON-NEXT: str zt0, [x19] +; CHECK-COMMON-NEXT: smstop za +; CHECK-COMMON-NEXT: blr x0 +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: ldr zt0, [x19] +; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; 
CHECK-COMMON-NEXT: add sp, sp, #80 +; CHECK-COMMON-NEXT: ret + tail call void %callee() + ret void +} From 76815cc858299d6ccec9069b3c97a3f893960776 Mon Sep 17 00:00:00 2001 From: Dan Blackwell Date: Tue, 28 Oct 2025 14:59:51 +0000 Subject: [PATCH 026/539] [Fuzzer][Test-Only] Increase runs for reduce-inputs.test (#165402) This test fails on some arm64 macOS runs currently. This patch bumps up the number of runs by 10x to hopefully get it passing consistently. rdar://162122184 --- compiler-rt/test/fuzzer/reduce_inputs.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/fuzzer/reduce_inputs.test b/compiler-rt/test/fuzzer/reduce_inputs.test index e65f572277297..d296fa42191af 100644 --- a/compiler-rt/test/fuzzer/reduce_inputs.test +++ b/compiler-rt/test/fuzzer/reduce_inputs.test @@ -12,5 +12,5 @@ RUN: %run %t-ShrinkControlFlowSimpleTest -runs=0 %t/C 2>&1 | FileCheck %s --chec COUNT: seed corpus: files: 4 # a bit longer test -RUN: %run %t-ShrinkControlFlowTest -exit_on_item=0eb8e4ed029b774d80f2b66408203801cb982a60 -seed=42 -runs=1000000 2>&1 | FileCheck %s +RUN: %run %t-ShrinkControlFlowTest -exit_on_item=0eb8e4ed029b774d80f2b66408203801cb982a60 -seed=42 -runs=10000000 2>&1 | FileCheck %s From 311c9cbb5e34290ff0d2f7219079978242e506a9 Mon Sep 17 00:00:00 2001 From: Dan Blackwell Date: Tue, 28 Oct 2025 15:00:21 +0000 Subject: [PATCH 027/539] [Fuzzer][Test-Only] Re-enable fuzzer-ubsan.test on Darwin (#165403) This test is now XPASSing due to a linker update on the platform. This patch removes the XFAIL from the test. 
rdar://163149345 --- compiler-rt/test/fuzzer/fuzzer-ubsan.test | 3 --- 1 file changed, 3 deletions(-) diff --git a/compiler-rt/test/fuzzer/fuzzer-ubsan.test b/compiler-rt/test/fuzzer/fuzzer-ubsan.test index d22339d72e261..6bc2c38636688 100644 --- a/compiler-rt/test/fuzzer/fuzzer-ubsan.test +++ b/compiler-rt/test/fuzzer/fuzzer-ubsan.test @@ -1,6 +1,3 @@ -// This test currently fails to compile on green.lab.llvm.org (arm) -// XFAIL: system-darwin && target=arm{{.*}} - RUN: %cpp_compiler -fsanitize=undefined -fno-sanitize-recover=all %S/SignedIntOverflowTest.cpp -o %t-SignedIntOverflowTest-Ubsan RUN: not %run %t-SignedIntOverflowTest-Ubsan 2>&1 | FileCheck %s CHECK: runtime error: signed integer overflow: 2147483647 + 1 cannot be represented in type 'int' From c68a7339e07f8d3055797aa9157c64224f14524d Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 28 Oct 2025 08:28:09 -0700 Subject: [PATCH 028/539] DAG: Consider __sincos_stret when deciding to form fsincos (#165169) --- .../include/llvm/CodeGen/RuntimeLibcallUtil.h | 4 +++ llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 13 +++++----- llvm/lib/CodeGen/TargetLoweringBase.cpp | 5 ++++ llvm/lib/Target/ARM/ARMISelLowering.cpp | 25 +++++++++---------- llvm/lib/Target/X86/X86ISelLowering.cpp | 23 +++++++++-------- 5 files changed, 40 insertions(+), 30 deletions(-) diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h index a9e53bae897ad..f980d3dc255ca 100644 --- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h +++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h @@ -84,6 +84,10 @@ LLVM_ABI Libcall getSINCOS(EVT RetVT); /// UNKNOWN_LIBCALL if there is none. LLVM_ABI Libcall getSINCOSPI(EVT RetVT); +/// Return the SINCOS_STRET_ value for the given types, or UNKNOWN_LIBCALL if +/// there is none. +LLVM_ABI Libcall getSINCOS_STRET(EVT RetVT); + /// getMODF - Return the MODF_* value for the given types, or /// UNKNOWN_LIBCALL if there is none. 
LLVM_ABI Libcall getMODF(EVT RetVT); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 5fb7e63cfb605..431a81002074f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2400,10 +2400,11 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, Results.push_back(Rem); } -/// Return true if sincos libcall is available. +/// Return true if sincos or __sincos_stret libcall is available. static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) { - RTLIB::Libcall LC = RTLIB::getSINCOS(Node->getSimpleValueType(0).SimpleTy); - return TLI.getLibcallName(LC) != nullptr; + MVT::SimpleValueType VT = Node->getSimpleValueType(0).SimpleTy; + return TLI.getLibcallImpl(RTLIB::getSINCOS(VT)) != RTLIB::Unsupported || + TLI.getLibcallImpl(RTLIB::getSINCOS_STRET(VT)) != RTLIB::Unsupported; } /// Only issue sincos libcall if both sin and cos are needed. @@ -3752,9 +3753,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { EVT VT = Node->getValueType(0); // Turn fsin / fcos into ISD::FSINCOS node if there are a pair of fsin / // fcos which share the same operand and both are used. 
- if ((TLI.isOperationLegalOrCustom(ISD::FSINCOS, VT) || - isSinCosLibcallAvailable(Node, TLI)) - && useSinCos(Node)) { + if ((TLI.isOperationLegal(ISD::FSINCOS, VT) || + isSinCosLibcallAvailable(Node, TLI)) && + useSinCos(Node)) { SDVTList VTs = DAG.getVTList(VT, VT); Tmp1 = DAG.getNode(ISD::FSINCOS, dl, VTs, Node->getOperand(0)); if (Node->getOpcode() == ISD::FCOS) diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index f3631fab885df..b3535eaca5e9d 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -434,6 +434,11 @@ RTLIB::Libcall RTLIB::getSINCOSPI(EVT RetVT) { SINCOSPI_F128, SINCOSPI_PPCF128); } +RTLIB::Libcall RTLIB::getSINCOS_STRET(EVT RetVT) { + return getFPLibCall(RetVT, SINCOS_STRET_F32, SINCOS_STRET_F64, + UNKNOWN_LIBCALL, UNKNOWN_LIBCALL, UNKNOWN_LIBCALL); +} + RTLIB::Libcall RTLIB::getMODF(EVT RetVT) { return getFPLibCall(RetVT, MODF_F32, MODF_F64, MODF_F80, MODF_F128, MODF_PPCF128); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 313ae3d68fb83..fdba45461377d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1298,12 +1298,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); } - // Use __sincos_stret if available. - if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } + setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); // FP-ARMv8 implements a lot of rounding-like FP operations. 
if (Subtarget->hasFPARMv8Base()) { @@ -9835,13 +9831,18 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { } SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetDarwin()); - // For iOS, we want to call an alternative entry point: __sincos_stret, // return values are passed via sret. SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); + RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); + RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC); + if (SincosStret == RTLIB::Unsupported) + return SDValue(); + + assert(Subtarget->isTargetDarwin()); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); auto PtrVT = getPointerTy(DAG.getDataLayout()); @@ -9871,11 +9872,9 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { Args.emplace_back(Arg, ArgTy); - RTLIB::Libcall LC = - (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = getLibcallName(LC); - CallingConv::ID CC = getLibcallCallingConv(LC); - SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); + StringRef LibcallName = getLibcallImplName(SincosStret); + CallingConv::ID CC = getLibcallImplCallingConv(SincosStret); + SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL)); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f514621094f13..b86020aa512ea 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2572,11 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Combine sin / cos into _sincos_stret if it is available. 
- if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } + setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); @@ -33067,26 +33064,30 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + bool isF64 = ArgVT == MVT::f64; + + RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; + const char *LibcallName = TLI.getLibcallName(LC); + if (!LibcallName) + return SDValue(); + assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); // For MacOSX, we want to call an alternative entry point: __sincos_stret, // which returns the values as { float, float } (in XMM0) or // { double, double } (which is returned in XMM0, XMM1). SDLoc dl(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; Args.emplace_back(Arg, ArgTy); - bool isF64 = ArgVT == MVT::f64; // Only optimize x86_64 for now. i386 is a bit messy. For f32, // the small struct {f32, f32} is returned in (eax, edx). For f64, // the results are returned via SRet in memory. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - RTLIB::Libcall LC = isF64 ? 
RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = TLI.getLibcallName(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); From 08bfb3140864c2b307fffada32303f3ca7840c63 Mon Sep 17 00:00:00 2001 From: Dmitry Vasilyev Date: Tue, 28 Oct 2025 19:37:27 +0400 Subject: [PATCH 029/539] [lldb] The test added for PR#164905 doesn't run on Windows host. (#165417) Skip the test for Windows hosts. This patch fixes the buildbot `lldb-remote-linux-win`. https://lab.llvm.org/buildbot/#/builders/197/builds/10304 --- lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py b/lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py index 13437d05557bf..a73322c78d81e 100644 --- a/lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py +++ b/lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py @@ -24,7 +24,7 @@ class TestDriverWithClosedSTDIO(TestBase): # Windows doesn't have the fcntl module, so we can't run this # test there. - @skipIf(oslist=["windows"]) + @skipIf(hostoslist=["windows"]) def test_run_lldb_and_wait(self): """This test forks, closes the stdio channels and exec's lldb. Then it waits for it to exit and asserts it did that successfully""" From 631b2c9ca117ae65797deac77ed3771acff166da Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 28 Oct 2025 16:55:29 +0100 Subject: [PATCH 030/539] [Clang][CodeGen] Implement code generation for __builtin_infer_alloc_token() (#156842) Implement code generation for `__builtin_infer_alloc_token()`. The `AllocToken` pass is now registered to run unconditionally in the optimization pipeline. This ensures that all instances of the `llvm.alloc.token.id` intrinsic are lowered to constant token IDs, regardless of whether `-fsanitize=alloc-token` is enabled. 
This guarantees that the builtin always resolves to a token value, providing a consistent and reliable mechanism for compile-time token querying. This completes `__builtin_infer_alloc_token(, ...)` to allow compile-time querying of the token ID, where the builtin arguments mirror those normally passed to any allocation function. The argument expressions are unevaluated operands. For type-based token modes, the same type inference logic is used as for untyped allocation calls. For example the ID that is passed to (with `-fsanitize=alloc-token`): some_malloc(sizeof(Type), ...) is equivalent to the token ID returned by __builtin_infer_alloc_token(sizeof(Type), ...) The builtin provides a mechanism to pass or compare token IDs in code that needs to be explicitly allocation token-aware (such as inside an allocator, or through wrapper macros). A more concrete demonstration of __builtin_infer_alloc_token's use is enabling type-aware Slab allocations in the Linux kernel: https://lore.kernel.org/all/20250825154505.1558444-1-elver@google.com/ Notably, any kind of allocation-call rewriting is a poor fit for the Linux kernel's kmalloc-family functions, which are macros that wrap (multiple) layers of inline and non-inline wrapper functions. Given the Linux kernel defines its own allocation APIs, the more explicit builtin gives the right level of control over where the type inference happens and the resulting token is passed. 
--- clang/docs/AllocToken.rst | 43 ++++++-- clang/docs/ReleaseNotes.rst | 3 + clang/lib/CodeGen/BackendUtil.cpp | 28 ++++-- clang/lib/CodeGen/CGBuiltin.cpp | 9 ++ clang/test/CodeGen/lto-newpm-pipeline.c | 8 +- clang/test/CodeGenCXX/alloc-token-builtin.cpp | 97 +++++++++++++++++++ 6 files changed, 166 insertions(+), 22 deletions(-) create mode 100644 clang/test/CodeGenCXX/alloc-token-builtin.cpp diff --git a/clang/docs/AllocToken.rst b/clang/docs/AllocToken.rst index b65e18ccfa967..1a740e5e22c29 100644 --- a/clang/docs/AllocToken.rst +++ b/clang/docs/AllocToken.rst @@ -49,6 +49,39 @@ change or removal. These may (experimentally) be selected with ``-Xclang * ``increment``: This mode assigns a simple, incrementally increasing token ID to each allocation site. +The following command-line options affect generated token IDs: + +* ``-falloc-token-max=`` + Configures the maximum number of tokens. No max by default (tokens bounded + by ``SIZE_MAX``). + +Querying Token IDs with ``__builtin_infer_alloc_token`` +======================================================= + +For use cases where the token ID must be known at compile time, Clang provides +a builtin function: + +.. code-block:: c + + size_t __builtin_infer_alloc_token(, ...); + +This builtin returns the token ID inferred from its argument expressions, which +mirror arguments normally passed to any allocation function. The argument +expressions are **unevaluated**, so it can be used with expressions that would +have side effects without any runtime impact. + +For example, it can be used as follows: + +.. code-block:: c + + struct MyType { ... }; + void *__partition_alloc(size_t size, size_t partition); + #define partition_alloc(...) 
__partition_alloc(__VA_ARGS__, __builtin_infer_alloc_token(__VA_ARGS__)) + + void foo(void) { + MyType *x = partition_alloc(sizeof(*x)); + } + Allocation Token Instrumentation ================================ @@ -70,16 +103,6 @@ example: // Instrumented: ptr = __alloc_token_malloc(size, ); -The following command-line options affect generated token IDs: - -* ``-falloc-token-max=`` - Configures the maximum number of tokens. No max by default (tokens bounded - by ``SIZE_MAX``). - - .. code-block:: console - - % clang++ -fsanitize=alloc-token -falloc-token-max=512 example.cc - Runtime Interface ----------------- diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e6e33e7a9a280..add1582344a0e 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -281,6 +281,9 @@ Non-comprehensive list of changes in this release allocator-level heap organization strategies. A feature to instrument all allocation functions with a token ID can be enabled via the ``-fsanitize=alloc-token`` flag. +- A builtin ``__builtin_infer_alloc_token(, ...)`` is provided to allow + compile-time querying of allocation token IDs, where the builtin arguments + mirror those normally passed to an allocation function. - Clang now rejects the invalid use of ``constexpr`` with ``auto`` and an explicit type in C. 
(#GH163090) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index aefc262dca17f..3c313149ca1fc 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -800,16 +800,6 @@ static void addSanitizers(const Triple &TargetTriple, MPM.addPass(DataFlowSanitizerPass(LangOpts.NoSanitizeFiles, PB.getVirtualFileSystemPtr())); } - - if (LangOpts.Sanitize.has(SanitizerKind::AllocToken)) { - if (Level == OptimizationLevel::O0) { - // The default pass builder only infers libcall function attrs when - // optimizing, so we insert it here because we need it for accurate - // memory allocation function detection. - MPM.addPass(InferFunctionAttrsPass()); - } - MPM.addPass(AllocTokenPass(getAllocTokenOptions(LangOpts, CodeGenOpts))); - } }; if (ClSanitizeOnOptimizerEarlyEP) { PB.registerOptimizerEarlyEPCallback( @@ -852,6 +842,23 @@ static void addSanitizers(const Triple &TargetTriple, } } +static void addAllocTokenPass(const Triple &TargetTriple, + const CodeGenOptions &CodeGenOpts, + const LangOptions &LangOpts, PassBuilder &PB) { + PB.registerOptimizerLastEPCallback([&](ModulePassManager &MPM, + OptimizationLevel Level, + ThinOrFullLTOPhase) { + if (Level == OptimizationLevel::O0 && + LangOpts.Sanitize.has(SanitizerKind::AllocToken)) { + // The default pass builder only infers libcall function attrs when + // optimizing, so we insert it here because we need it for accurate + // memory allocation function detection with -fsanitize=alloc-token. 
+ MPM.addPass(InferFunctionAttrsPass()); + } + MPM.addPass(AllocTokenPass(getAllocTokenOptions(LangOpts, CodeGenOpts))); + }); +} + void EmitAssemblyHelper::RunOptimizationPipeline( BackendAction Action, std::unique_ptr &OS, std::unique_ptr &ThinLinkOS, BackendConsumer *BC) { @@ -1106,6 +1113,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (!IsThinLTOPostLink) { addSanitizers(TargetTriple, CodeGenOpts, LangOpts, PB); addKCFIPass(TargetTriple, LangOpts, PB); + addAllocTokenPass(TargetTriple, CodeGenOpts, LangOpts, PB); } if (std::optional Options = diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index fd14cd6926fe2..b81e0d02da2c9 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4506,6 +4506,15 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(AI); } + case Builtin::BI__builtin_infer_alloc_token: { + llvm::MDNode *MDN = buildAllocToken(E); + llvm::Value *MDV = MetadataAsValue::get(getLLVMContext(), MDN); + llvm::Function *F = + CGM.getIntrinsic(llvm::Intrinsic::alloc_token_id, {IntPtrTy}); + llvm::CallBase *TokenID = Builder.CreateCall(F, MDV); + return RValue::get(TokenID); + } + case Builtin::BIbzero: case Builtin::BI__builtin_bzero: { Address Dest = EmitPointerWithAlignment(E->getArg(0)); diff --git a/clang/test/CodeGen/lto-newpm-pipeline.c b/clang/test/CodeGen/lto-newpm-pipeline.c index ea9784a76f923..dceaaf136ebfc 100644 --- a/clang/test/CodeGen/lto-newpm-pipeline.c +++ b/clang/test/CodeGen/lto-newpm-pipeline.c @@ -32,10 +32,12 @@ // CHECK-FULL-O0-NEXT: Running pass: AlwaysInlinerPass // CHECK-FULL-O0-NEXT: Running analysis: ProfileSummaryAnalysis // CHECK-FULL-O0-NEXT: Running pass: CoroConditionalWrapper +// CHECK-FULL-O0-NEXT: Running pass: AllocTokenPass +// CHECK-FULL-O0-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis +// CHECK-FULL-O0-NEXT: Running analysis: TargetLibraryAnalysis // CHECK-FULL-O0-NEXT: Running pass: 
CanonicalizeAliasesPass // CHECK-FULL-O0-NEXT: Running pass: NameAnonGlobalPass // CHECK-FULL-O0-NEXT: Running pass: AnnotationRemarksPass -// CHECK-FULL-O0-NEXT: Running analysis: TargetLibraryAnalysis // CHECK-FULL-O0-NEXT: Running pass: VerifierPass // CHECK-FULL-O0-NEXT: Running pass: BitcodeWriterPass @@ -46,10 +48,12 @@ // CHECK-THIN-O0-NEXT: Running pass: AlwaysInlinerPass // CHECK-THIN-O0-NEXT: Running analysis: ProfileSummaryAnalysis // CHECK-THIN-O0-NEXT: Running pass: CoroConditionalWrapper +// CHECK-THIN-O0-NEXT: Running pass: AllocTokenPass +// CHECK-THIN-O0-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis +// CHECK-THIN-O0-NEXT: Running analysis: TargetLibraryAnalysis // CHECK-THIN-O0-NEXT: Running pass: CanonicalizeAliasesPass // CHECK-THIN-O0-NEXT: Running pass: NameAnonGlobalPass // CHECK-THIN-O0-NEXT: Running pass: AnnotationRemarksPass -// CHECK-THIN-O0-NEXT: Running analysis: TargetLibraryAnalysis // CHECK-THIN-O0-NEXT: Running pass: VerifierPass // CHECK-THIN-O0-NEXT: Running pass: ThinLTOBitcodeWriterPass diff --git a/clang/test/CodeGenCXX/alloc-token-builtin.cpp b/clang/test/CodeGenCXX/alloc-token-builtin.cpp new file mode 100644 index 0000000000000..adadf7bbe4174 --- /dev/null +++ b/clang/test/CodeGenCXX/alloc-token-builtin.cpp @@ -0,0 +1,97 @@ +// To test IR generation of the builtin without evaluating the LLVM intrinsic, +// we set the mode to a stateful mode, which prohibits constant evaluation. 
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -Werror -std=c++20 -emit-llvm -falloc-token-mode=random -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-CODEGEN +// RUN: %clang_cc1 -triple x86_64-linux-gnu -Werror -std=c++20 -emit-llvm -falloc-token-max=2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LOWER + +extern "C" void *my_malloc(unsigned long, unsigned long); + +struct NoPtr { + int x; + long y; +}; + +struct WithPtr { + int a; + char *buf; +}; + +int unevaluated_fn(); + +// CHECK-LABEL: @_Z16test_builtin_intv( +// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_INT:[0-9]+]]) +// CHECK-LOWER: ret i64 0 +unsigned long test_builtin_int() { + return __builtin_infer_alloc_token(sizeof(1)); +} + +// CHECK-LABEL: @_Z16test_builtin_ptrv( +// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_PTR:[0-9]+]]) +// CHECK-LOWER: ret i64 1 +unsigned long test_builtin_ptr() { + return __builtin_infer_alloc_token(sizeof(int *)); +} + +// CHECK-LABEL: @_Z25test_builtin_struct_noptrv( +// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_NOPTR:[0-9]+]]) +// CHECK-LOWER: ret i64 0 +unsigned long test_builtin_struct_noptr() { + return __builtin_infer_alloc_token(sizeof(NoPtr)); +} + +// CHECK-LABEL: @_Z25test_builtin_struct_w_ptrv( +// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_WITHPTR:[0-9]+]]) +// CHECK-LOWER: ret i64 1 +unsigned long test_builtin_struct_w_ptr() { + return __builtin_infer_alloc_token(sizeof(WithPtr), 123); +} + +// CHECK-LABEL: @_Z24test_builtin_unevaluatedv( +// CHECK-NOT: call{{.*}}unevaluated_fn +// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_INT:[0-9]+]]) +// CHECK-LOWER: ret i64 0 +unsigned long test_builtin_unevaluated() { + return __builtin_infer_alloc_token(sizeof(int) * unevaluated_fn()); +} + +// CHECK-LABEL: @_Z36test_builtin_unsequenced_unevaluatedi( +// CHECK: add nsw +// CHECK-NOT: add nsw +// CHECK-CODEGEN: %[[REG:[0-9]+]] = 
call i64 @llvm.alloc.token.id.i64(metadata ![[META_UNKNOWN:[0-9]+]]) +// CHECK-CODEGEN: call{{.*}}@my_malloc({{.*}}, i64 noundef %[[REG]]) +// CHECK-LOWER: call{{.*}}@my_malloc({{.*}}, i64 noundef 0) +void test_builtin_unsequenced_unevaluated(int x) { + my_malloc(++x, __builtin_infer_alloc_token(++x)); +} + +// CHECK-LABEL: @_Z20test_builtin_unknownv( +// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_UNKNOWN:[0-9]+]]) +// CHECK-LOWER: ret i64 0 +unsigned long test_builtin_unknown() { + return __builtin_infer_alloc_token(4096); +} + +// Test template instantiation. +template +constexpr unsigned long get_token() { + return __builtin_infer_alloc_token(sizeof(T)); +} + +// CHECK-LABEL: @_Z13get_token_intv() +// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_INT]]) +// CHECK-LOWER: ret i64 0 +unsigned long get_token_int() { + return get_token(); +} + +// CHECK-LABEL: @_Z13get_token_ptrv() +// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_PTR]]) +// CHECK-LOWER: ret i64 1 +unsigned long get_token_ptr() { + return get_token(); +} + +// CHECK-CODEGEN: ![[META_INT]] = !{!"int", i1 false} +// CHECK-CODEGEN: ![[META_PTR]] = !{!"int *", i1 true} +// CHECK-CODEGEN: ![[META_NOPTR]] = !{!"NoPtr", i1 false} +// CHECK-CODEGEN: ![[META_WITHPTR]] = !{!"WithPtr", i1 true} +// CHECK-CODEGEN: ![[META_UNKNOWN]] = !{} From b008287a047f16a7da92e756a16c09517d865d0e Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Mon, 27 Oct 2025 17:02:46 +0000 Subject: [PATCH 031/539] Extend vector reduction constants folding tests to include scalable vectors. 
--- .../InstSimplify/ConstProp/vecreduce.ll | 418 +++++++++++++++--- 1 file changed, 361 insertions(+), 57 deletions(-) diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll index 9f9e3f9ffc070..77a7f0d4e4acf 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll @@ -1,26 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instsimplify -S | FileCheck %s -; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -S | FileCheck %s - -declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a) -declare i32 @llvm.vector.reduce.mul.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a) -declare i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a) -declare i32 @llvm.vector.reduce.or.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a) -declare i32 @llvm.vector.reduce.xor.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %a) -declare i32 @llvm.vector.reduce.smin.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %a) -declare i32 @llvm.vector.reduce.smax.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %a) -declare i32 @llvm.vector.reduce.umin.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %a) -declare i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %a) - +; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -use-constant-int-for-scalable-splat -S | FileCheck %s define i32 @add_0() { ; CHECK-LABEL: @add_0( @@ -30,6 +10,15 @@ define i32 @add_0() { ret i32 %x } +define i32 @add_0_scalable_vector() { +; CHECK-LABEL: 
@add_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.add.nxv8i32( zeroinitializer) + ret i32 %x +} + define i32 @add_1() { ; CHECK-LABEL: @add_1( ; CHECK-NEXT: ret i32 8 @@ -38,6 +27,15 @@ define i32 @add_1() { ret i32 %x } +define i32 @add_1_scalable_vector() { +; CHECK-LABEL: @add_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.add.nxv8i32( splat (i32 1)) + ret i32 %x +} + define i32 @add_inc() { ; CHECK-LABEL: @add_inc( ; CHECK-NEXT: ret i32 18 @@ -63,8 +61,17 @@ define i32 @add_undef() { ret i32 %x } -define i32 @add_undef1() { -; CHECK-LABEL: @add_undef1( +define i32 @add_undef_scalable_vector() { +; CHECK-LABEL: @add_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.add.nxv8i32( undef) + ret i32 %x +} + +define i32 @add_undef_elt() { +; CHECK-LABEL: @add_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; @@ -80,8 +87,17 @@ define i32 @add_poison() { ret i32 %x } -define i32 @add_poison1() { -; CHECK-LABEL: @add_poison1( +define i32 @add_poison_scalable_vector() { +; CHECK-LABEL: @add_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.add.nxv8i32( poison) + ret i32 %x +} + +define i32 @add_poison_elt() { +; CHECK-LABEL: @add_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> ) @@ -105,6 +121,15 @@ define i32 @mul_0() { ret i32 %x } +define i32 @mul_0_scalable_vector() { +; CHECK-LABEL: @mul_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 
@llvm.vector.reduce.mul.nxv8i32( zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.mul.nxv8i32( zeroinitializer) + ret i32 %x +} + define i32 @mul_1() { ; CHECK-LABEL: @mul_1( ; CHECK-NEXT: ret i32 1 @@ -113,6 +138,15 @@ define i32 @mul_1() { ret i32 %x } +define i32 @mul_1_scalable_vector() { +; CHECK-LABEL: @mul_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32( splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.mul.nxv8i32( splat (i32 1)) + ret i32 %x +} + define i32 @mul_inc() { ; CHECK-LABEL: @mul_inc( ; CHECK-NEXT: ret i32 40320 @@ -138,8 +172,17 @@ define i32 @mul_undef() { ret i32 %x } -define i32 @mul_undef1() { -; CHECK-LABEL: @mul_undef1( +define i32 @mul_undef_scalable_vector() { +; CHECK-LABEL: @mul_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32( undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.mul.nxv8i32( undef) + ret i32 %x +} + +define i32 @mul_undef_elt() { +; CHECK-LABEL: @mul_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; @@ -155,8 +198,17 @@ define i32 @mul_poison() { ret i32 %x } -define i32 @mul_poison1() { -; CHECK-LABEL: @mul_poison1( +define i32 @mul_poison_scalable_vector() { +; CHECK-LABEL: @mul_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32( poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.mul.nxv8i32( poison) + ret i32 %x +} + +define i32 @mul_poison_elt() { +; CHECK-LABEL: @mul_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> ) @@ -171,6 +223,15 @@ define i32 @and_0() { ret i32 %x } +define i32 @and_0_scalable_vector() { +; CHECK-LABEL: @and_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32( zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; 
+ %x = call i32 @llvm.vector.reduce.and.nxv8i32( zeroinitializer) + ret i32 %x +} + define i32 @and_1() { ; CHECK-LABEL: @and_1( ; CHECK-NEXT: ret i32 1 @@ -179,6 +240,15 @@ define i32 @and_1() { ret i32 %x } +define i32 @and_1_scalable_vector() { +; CHECK-LABEL: @and_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32( splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.and.nxv8i32( splat (i32 1)) + ret i32 %x +} + define i32 @and_inc() { ; CHECK-LABEL: @and_inc( ; CHECK-NEXT: ret i32 0 @@ -204,8 +274,17 @@ define i32 @and_undef() { ret i32 %x } -define i32 @and_undef1() { -; CHECK-LABEL: @and_undef1( +define i32 @and_undef_scalable_vector() { +; CHECK-LABEL: @and_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32( undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.and.nxv8i32( undef) + ret i32 %x +} + +define i32 @and_undef_elt() { +; CHECK-LABEL: @and_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; @@ -221,8 +300,17 @@ define i32 @and_poison() { ret i32 %x } -define i32 @and_poison1() { -; CHECK-LABEL: @and_poison1( +define i32 @and_poison_scalable_vector() { +; CHECK-LABEL: @and_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32( poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.and.nxv8i32( poison) + ret i32 %x +} + +define i32 @and_poison_elt() { +; CHECK-LABEL: @and_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> ) @@ -237,6 +325,15 @@ define i32 @or_0() { ret i32 %x } +define i32 @or_0_scalable_vector() { +; CHECK-LABEL: @or_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32( zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.or.nxv8i32( zeroinitializer) + ret i32 %x +} + define 
i32 @or_1() { ; CHECK-LABEL: @or_1( ; CHECK-NEXT: ret i32 1 @@ -245,6 +342,15 @@ define i32 @or_1() { ret i32 %x } +define i32 @or_1_scalable_vector() { +; CHECK-LABEL: @or_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32( splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.or.nxv8i32( splat (i32 1)) + ret i32 %x +} + define i32 @or_inc() { ; CHECK-LABEL: @or_inc( ; CHECK-NEXT: ret i32 -1 @@ -270,8 +376,17 @@ define i32 @or_undef() { ret i32 %x } -define i32 @or_undef1() { -; CHECK-LABEL: @or_undef1( +define i32 @or_undef_scalable_vector() { +; CHECK-LABEL: @or_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32( undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.or.nxv8i32( undef) + ret i32 %x +} + +define i32 @or_undef_elt() { +; CHECK-LABEL: @or_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; @@ -287,8 +402,17 @@ define i32 @or_poison() { ret i32 %x } -define i32 @or_poison1() { -; CHECK-LABEL: @or_poison1( +define i32 @or_poison_scalable_vector() { +; CHECK-LABEL: @or_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32( poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.or.nxv8i32( poison) + ret i32 %x +} + +define i32 @or_poison_elt() { +; CHECK-LABEL: @or_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> ) @@ -303,6 +427,15 @@ define i32 @xor_0() { ret i32 %x } +define i32 @xor_0_scalable_vector() { +; CHECK-LABEL: @xor_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32( zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.xor.nxv8i32( zeroinitializer) + ret i32 %x +} + define i32 @xor_1() { ; CHECK-LABEL: @xor_1( ; CHECK-NEXT: ret i32 0 @@ -311,6 +444,15 @@ define i32 @xor_1() { ret i32 %x
} +define i32 @xor_1_scalable_vector() { +; CHECK-LABEL: @xor_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32( splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.xor.nxv8i32( splat(i32 1)) + ret i32 %x +} + define i32 @xor_inc() { ; CHECK-LABEL: @xor_inc( ; CHECK-NEXT: ret i32 10 @@ -336,8 +478,17 @@ define i32 @xor_undef() { ret i32 %x } -define i32 @xor_undef1() { -; CHECK-LABEL: @xor_undef1( +define i32 @xor_undef_scalable_vector() { +; CHECK-LABEL: @xor_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32( undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.xor.nxv8i32( undef) + ret i32 %x +} + +define i32 @xor_undef_elt() { +; CHECK-LABEL: @xor_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; @@ -353,8 +504,17 @@ define i32 @xor_poison() { ret i32 %x } -define i32 @xor_poison1() { -; CHECK-LABEL: @xor_poison1( +define i32 @xor_poison_scalable_vector() { +; CHECK-LABEL: @xor_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32( poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.xor.nxv8i32( poison) + ret i32 %x +} + +define i32 @xor_poison_elt() { +; CHECK-LABEL: @xor_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> ) @@ -369,6 +529,15 @@ define i32 @smin_0() { ret i32 %x } +define i32 @smin_0_scalable_vector() { +; CHECK-LABEL: @smin_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32( zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.smin.nxv8i32( zeroinitializer) + ret i32 %x +} + define i32 @smin_1() { ; CHECK-LABEL: @smin_1( ; CHECK-NEXT: ret i32 1 @@ -377,6 +546,15 @@ define i32 @smin_1() { ret i32 %x } +define i32 @smin_1_scalable_vector() { +; CHECK-LABEL: 
@smin_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32( splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.smin.nxv8i32( splat(i32 1)) + ret i32 %x +} + define i32 @smin_inc() { ; CHECK-LABEL: @smin_inc( ; CHECK-NEXT: ret i32 -6 @@ -402,8 +580,17 @@ define i32 @smin_undef() { ret i32 %x } -define i32 @smin_undef1() { -; CHECK-LABEL: @smin_undef1( +define i32 @smin_undef_scalable_vector() { +; CHECK-LABEL: @smin_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32( undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.smin.nxv8i32( undef) + ret i32 %x +} + +define i32 @smin_undef_elt() { +; CHECK-LABEL: @smin_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; @@ -419,8 +606,17 @@ define i32 @smin_poison() { ret i32 %x } -define i32 @smin_poison1() { -; CHECK-LABEL: @smin_poison1( +define i32 @smin_poison_scalable_vector() { +; CHECK-LABEL: @smin_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32( poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.smin.nxv8i32( poison) + ret i32 %x +} + +define i32 @smin_poison_elt() { +; CHECK-LABEL: @smin_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> ) @@ -435,6 +631,15 @@ define i32 @smax_0() { ret i32 %x } +define i32 @smax_0_scalable_vector() { +; CHECK-LABEL: @smax_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32( zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.smax.nxv8i32( zeroinitializer) + ret i32 %x +} + define i32 @smax_1() { ; CHECK-LABEL: @smax_1( ; CHECK-NEXT: ret i32 1 @@ -443,6 +648,15 @@ define i32 @smax_1() { ret i32 %x } +define i32 @smax_1_scalable_vector() { +; CHECK-LABEL: @smax_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = 
call i32 @llvm.vector.reduce.smax.nxv8i32( splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.smax.nxv8i32( splat(i32 1)) + ret i32 %x +} + define i32 @smax_inc() { ; CHECK-LABEL: @smax_inc( ; CHECK-NEXT: ret i32 8 @@ -468,8 +682,17 @@ define i32 @smax_undef() { ret i32 %x } -define i32 @smax_undef1() { -; CHECK-LABEL: @smax_undef1( +define i32 @smax_undef_scalable_vector() { +; CHECK-LABEL: @smax_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32( undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.smax.nxv8i32( undef) + ret i32 %x +} + +define i32 @smax_undef_elt() { +; CHECK-LABEL: @smax_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; @@ -485,8 +708,17 @@ define i32 @smax_poison() { ret i32 %x } -define i32 @smax_poison1() { -; CHECK-LABEL: @smax_poison1( +define i32 @smax_poison_scalable_vector() { +; CHECK-LABEL: @smax_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32( poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.smax.nxv8i32( poison) + ret i32 %x +} + +define i32 @smax_poison_elt() { +; CHECK-LABEL: @smax_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> ) @@ -501,6 +733,15 @@ define i32 @umin_0() { ret i32 %x } +define i32 @umin_0_scalable_vector() { +; CHECK-LABEL: @umin_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32( zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.umin.nxv8i32( zeroinitializer) + ret i32 %x +} + define i32 @umin_1() { ; CHECK-LABEL: @umin_1( ; CHECK-NEXT: ret i32 1 @@ -509,6 +750,15 @@ define i32 @umin_1() { ret i32 %x } +define i32 @umin_1_scalable_vector() { +; CHECK-LABEL: @umin_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32( splat (i32 
1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.umin.nxv8i32( splat (i32 1)) + ret i32 %x +} + define i32 @umin_inc() { ; CHECK-LABEL: @umin_inc( ; CHECK-NEXT: ret i32 1 @@ -534,8 +784,17 @@ define i32 @umin_undef() { ret i32 %x } -define i32 @umin_undef1() { -; CHECK-LABEL: @umin_undef1( +define i32 @umin_undef_scalable_vector() { +; CHECK-LABEL: @umin_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32( undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.umin.nxv8i32( undef) + ret i32 %x +} + +define i32 @umin_undef_elt() { +; CHECK-LABEL: @umin_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; @@ -551,8 +810,17 @@ define i32 @umin_poison() { ret i32 %x } -define i32 @umin_poison1() { -; CHECK-LABEL: @umin_poison1( +define i32 @umin_poison_scalable_vector() { +; CHECK-LABEL: @umin_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32( poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.umin.nxv8i32( poison) + ret i32 %x +} + +define i32 @umin_poison_elt() { +; CHECK-LABEL: @umin_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> ) @@ -567,6 +835,15 @@ define i32 @umax_0() { ret i32 %x } +define i32 @umax_0_scalable_vector() { +; CHECK-LABEL: @umax_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32( zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.umax.nxv8i32( zeroinitializer) + ret i32 %x +} + define i32 @umax_1() { ; CHECK-LABEL: @umax_1( ; CHECK-NEXT: ret i32 1 @@ -575,6 +852,15 @@ define i32 @umax_1() { ret i32 %x } +define i32 @umax_1_scalable_vector() { +; CHECK-LABEL: @umax_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32( splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 
@llvm.vector.reduce.umax.nxv8i32( splat(i32 1)) + ret i32 %x +} + define i32 @umax_inc() { ; CHECK-LABEL: @umax_inc( ; CHECK-NEXT: ret i32 -3 @@ -600,8 +886,17 @@ define i32 @umax_undef() { ret i32 %x } -define i32 @umax_undef1() { -; CHECK-LABEL: @umax_undef1( +define i32 @umax_undef_scalable_vector() { +; CHECK-LABEL: @umax_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32( undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.umax.nxv8i32( undef) + ret i32 %x +} + +define i32 @umax_undef_elt() { +; CHECK-LABEL: @umax_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; @@ -617,8 +912,17 @@ define i32 @umax_poison() { ret i32 %x } -define i32 @umax_poison1() { -; CHECK-LABEL: @umax_poison1( +define i32 @umax_poison_scalable_vector() { +; CHECK-LABEL: @umax_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32( poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.umax.nxv8i32( poison) + ret i32 %x +} + +define i32 @umax_poison_elt() { +; CHECK-LABEL: @umax_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> ) From 320ef09f5fae2571840515fa86d7293c46b7a593 Mon Sep 17 00:00:00 2001 From: sskzakaria Date: Tue, 28 Oct 2025 12:05:20 -0400 Subject: [PATCH 032/539] [X86][Clang] Add AVX512 Integer Comparison Intrinsics for constexpr Evaluation (#164026) Enables constexpr evaluation for the following AVX512 Integer Comparison Intrinsics: ``` _mm_cmp_epi8_mask _mm_cmp_epu8_mask _mm_cmp_epi16_mask _mm_cmp_epu16_mask _mm_cmp_epi32_mask _mm_cmp_epu32_mask _mm_cmp_epi64_mask _mm_cmp_epu64_mask _mm256_cmp_epi8_mask _mm256_cmp_epu8_mask _mm256_cmp_epi16_mask _mm256_cmp_epu16_mask _mm256_cmp_epi32_mask _mm256_cmp_epu32_mask _mm256_cmp_epi64_mask _mm256_cmp_epu64_mask _mm512_cmp_epi8_mask _mm512_cmp_epu8_mask _mm512_cmp_epi16_mask 
_mm512_cmp_epu16_mask _mm512_cmp_epi32_mask _mm512_cmp_epu32_mask _mm512_cmp_epi64_mask _mm512_cmp_epu64_mask ``` Part 1 of #162054 --- clang/include/clang/Basic/BuiltinsX86.td | 54 ++++++++----- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 83 ++++++++++++++++++++ clang/lib/AST/ExprConstant.cpp | 83 ++++++++++++++++++++ clang/lib/Headers/avx512vlbwintrin.h | 20 ++--- clang/test/CodeGen/X86/avx512vlbw-builtins.c | 38 +++++++++ 5 files changed, 248 insertions(+), 30 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 0c85e280e748b..500aa85fe5356 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -1282,81 +1282,99 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in { def knotdi : X86Builtin<"unsigned long long int(unsigned long long int)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl,avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def cmpb128_mask : X86Builtin<"unsigned short(_Vector<16, char>, _Vector<16, char>, _Constant int, unsigned short)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def cmpd128_mask : X86Builtin<"unsigned char(_Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">; def cmpq128_mask : X86Builtin<"unsigned char(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl,avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def cmpw128_mask : X86Builtin<"unsigned char(_Vector<8, short>, _Vector<8, short>, _Constant int, unsigned char)">; } -let Features = 
"avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl,avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def cmpb256_mask : X86Builtin<"unsigned int(_Vector<32, char>, _Vector<32, char>, _Constant int, unsigned int)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def cmpd256_mask : X86Builtin<"unsigned char(_Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">; def cmpq256_mask : X86Builtin<"unsigned char(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl,avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def cmpw256_mask : X86Builtin<"unsigned short(_Vector<16, short>, _Vector<16, short>, _Constant int, unsigned short)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def cmpb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, _Constant int, unsigned long long int)">; } -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def cmpd512_mask : X86Builtin<"unsigned short(_Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">; def cmpq512_mask : X86Builtin<"unsigned char(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw", + Attributes = [NoThrow, Const, 
Constexpr, RequiredVectorWidth<512>] in { def cmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl,avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def ucmpb128_mask : X86Builtin<"unsigned short(_Vector<16, char>, _Vector<16, char>, _Constant int, unsigned short)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def ucmpd128_mask : X86Builtin<"unsigned char(_Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">; def ucmpq128_mask : X86Builtin<"unsigned char(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl,avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def ucmpw128_mask : X86Builtin<"unsigned char(_Vector<8, short>, _Vector<8, short>, _Constant int, unsigned char)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl,avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def ucmpb256_mask : X86Builtin<"unsigned int(_Vector<32, char>, _Vector<32, char>, _Constant int, unsigned int)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def ucmpd256_mask : X86Builtin<"unsigned char(_Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">; def ucmpq256_mask : X86Builtin<"unsigned char(_Vector<4, long long int>, _Vector<4, long long int>, _Constant 
int, unsigned char)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl,avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def ucmpw256_mask : X86Builtin<"unsigned short(_Vector<16, short>, _Vector<16, short>, _Constant int, unsigned short)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def ucmpb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, _Constant int, unsigned long long int)">; } -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def ucmpd512_mask : X86Builtin<"unsigned short(_Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">; def ucmpq512_mask : X86Builtin<"unsigned char(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">; } diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 8f23001ea5a39..ab6b3ed1be0aa 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -3296,6 +3296,60 @@ static bool interp__builtin_vec_set(InterpState &S, CodePtr OpPC, return true; } +static bool evalICmpImm(uint8_t Imm, const APSInt &A, const APSInt &B, + bool IsUnsigned) { + switch (Imm & 0x7) { + case 0x00: // _MM_CMPINT_EQ + return (A == B); + case 0x01: // _MM_CMPINT_LT + return IsUnsigned ? 
A.ult(B) : A.slt(B); + case 0x02: // _MM_CMPINT_LE + return IsUnsigned ? A.ule(B) : A.sle(B); + case 0x03: // _MM_CMPINT_FALSE + return false; + case 0x04: // _MM_CMPINT_NE + return (A != B); + case 0x05: // _MM_CMPINT_NLT + return IsUnsigned ? A.uge(B) : A.sge(B); + case 0x06: // _MM_CMPINT_NLE + return IsUnsigned ? A.ugt(B) : A.sgt(B); + case 0x07: // _MM_CMPINT_TRUE + return true; + default: + llvm_unreachable("Invalid Op"); + } +} + +static bool interp__builtin_ia32_cmp_mask(InterpState &S, CodePtr OpPC, + const CallExpr *Call, unsigned ID, + bool IsUnsigned) { + assert(Call->getNumArgs() == 4); + + APSInt Mask = popToAPSInt(S, Call->getArg(3)); + APSInt Opcode = popToAPSInt(S, Call->getArg(2)); + unsigned CmpOp = static_cast(Opcode.getZExtValue()); + const Pointer &RHS = S.Stk.pop(); + const Pointer &LHS = S.Stk.pop(); + + assert(LHS.getNumElems() == RHS.getNumElems()); + + APInt RetMask = APInt::getZero(LHS.getNumElems()); + unsigned VectorLen = LHS.getNumElems(); + PrimType ElemT = LHS.getFieldDesc()->getPrimType(); + + for (unsigned ElemNum = 0; ElemNum < VectorLen; ++ElemNum) { + APSInt A, B; + INT_TYPE_SWITCH_NO_BOOL(ElemT, { + A = LHS.elem(ElemNum).toAPSInt(); + B = RHS.elem(ElemNum).toAPSInt(); + }); + RetMask.setBitVal(ElemNum, + Mask[ElemNum] && evalICmpImm(CmpOp, A, B, IsUnsigned)); + } + pushInteger(S, RetMask, Call->getType()); + return true; +} + +static bool interp__builtin_ia32_vpconflict(InterpState &S, CodePtr OpPC, + const CallExpr *Call) { + assert(Call->getNumArgs() == 1); @@ -4488,6 +4542,35 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_vec_set_v4di: return interp__builtin_vec_set(S, OpPC, Call, BuiltinID); + case X86::BI__builtin_ia32_cmpb128_mask: + case X86::BI__builtin_ia32_cmpw128_mask: + case X86::BI__builtin_ia32_cmpd128_mask: + case X86::BI__builtin_ia32_cmpq128_mask: + case X86::BI__builtin_ia32_cmpb256_mask: + case X86::BI__builtin_ia32_cmpw256_mask: + case 
X86::BI__builtin_ia32_cmpd256_mask: + case X86::BI__builtin_ia32_cmpq256_mask: + case X86::BI__builtin_ia32_cmpb512_mask: + case X86::BI__builtin_ia32_cmpw512_mask: + case X86::BI__builtin_ia32_cmpd512_mask: + case X86::BI__builtin_ia32_cmpq512_mask: + return interp__builtin_ia32_cmp_mask(S, OpPC, Call, BuiltinID, + /*IsUnsigned=*/false); + + case X86::BI__builtin_ia32_ucmpb128_mask: + case X86::BI__builtin_ia32_ucmpw128_mask: + case X86::BI__builtin_ia32_ucmpd128_mask: + case X86::BI__builtin_ia32_ucmpq128_mask: + case X86::BI__builtin_ia32_ucmpb256_mask: + case X86::BI__builtin_ia32_ucmpw256_mask: + case X86::BI__builtin_ia32_ucmpd256_mask: + case X86::BI__builtin_ia32_ucmpq256_mask: + case X86::BI__builtin_ia32_ucmpb512_mask: + case X86::BI__builtin_ia32_ucmpw512_mask: + case X86::BI__builtin_ia32_ucmpd512_mask: + case X86::BI__builtin_ia32_ucmpq512_mask: + return interp__builtin_ia32_cmp_mask(S, OpPC, Call, BuiltinID, + /*IsUnsigned=*/true); case X86::BI__builtin_ia32_pslldqi128_byteshift: case X86::BI__builtin_ia32_pslldqi256_byteshift: case X86::BI__builtin_ia32_pslldqi512_byteshift: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 29ee089505125..d0404b957ab03 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -15766,6 +15766,89 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, unsigned Idx = static_cast(IdxAPS.getZExtValue() & (N - 1)); return Success(Vec.getVectorElt(Idx).getInt(), E); } + + case clang::X86::BI__builtin_ia32_cmpb128_mask: + case clang::X86::BI__builtin_ia32_cmpw128_mask: + case clang::X86::BI__builtin_ia32_cmpd128_mask: + case clang::X86::BI__builtin_ia32_cmpq128_mask: + case clang::X86::BI__builtin_ia32_cmpb256_mask: + case clang::X86::BI__builtin_ia32_cmpw256_mask: + case clang::X86::BI__builtin_ia32_cmpd256_mask: + case clang::X86::BI__builtin_ia32_cmpq256_mask: + case clang::X86::BI__builtin_ia32_cmpb512_mask: + case 
clang::X86::BI__builtin_ia32_cmpw512_mask: + case clang::X86::BI__builtin_ia32_cmpd512_mask: + case clang::X86::BI__builtin_ia32_cmpq512_mask: + case clang::X86::BI__builtin_ia32_ucmpb128_mask: + case clang::X86::BI__builtin_ia32_ucmpw128_mask: + case clang::X86::BI__builtin_ia32_ucmpd128_mask: + case clang::X86::BI__builtin_ia32_ucmpq128_mask: + case clang::X86::BI__builtin_ia32_ucmpb256_mask: + case clang::X86::BI__builtin_ia32_ucmpw256_mask: + case clang::X86::BI__builtin_ia32_ucmpd256_mask: + case clang::X86::BI__builtin_ia32_ucmpq256_mask: + case clang::X86::BI__builtin_ia32_ucmpb512_mask: + case clang::X86::BI__builtin_ia32_ucmpw512_mask: + case clang::X86::BI__builtin_ia32_ucmpd512_mask: + case clang::X86::BI__builtin_ia32_ucmpq512_mask: { + assert(E->getNumArgs() == 4); + + bool IsUnsigned = + (BuiltinOp >= clang::X86::BI__builtin_ia32_ucmpb128_mask && + BuiltinOp <= clang::X86::BI__builtin_ia32_ucmpq512_mask); + + APValue LHS, RHS; + APSInt Mask, Opcode; + if (!EvaluateVector(E->getArg(0), LHS, Info) || + !EvaluateVector(E->getArg(1), RHS, Info) || + !EvaluateInteger(E->getArg(2), Opcode, Info) || + !EvaluateInteger(E->getArg(3), Mask, Info)) + return false; + + assert(LHS.getVectorLength() == RHS.getVectorLength()); + + unsigned VectorLen = LHS.getVectorLength(); + unsigned RetWidth = Mask.getBitWidth(); + + APSInt RetMask(llvm::APInt(RetWidth, 0), /*isUnsigned=*/true); + + for (unsigned ElemNum = 0; ElemNum < VectorLen; ++ElemNum) { + const APSInt &A = LHS.getVectorElt(ElemNum).getInt(); + const APSInt &B = RHS.getVectorElt(ElemNum).getInt(); + bool Result = false; + + switch (Opcode.getExtValue() & 0x7) { + case 0: // _MM_CMPINT_EQ + Result = (A == B); + break; + case 1: // _MM_CMPINT_LT + Result = IsUnsigned ? A.ult(B) : A.slt(B); + break; + case 2: // _MM_CMPINT_LE + Result = IsUnsigned ? 
A.ule(B) : A.sle(B); + break; + case 3: // _MM_CMPINT_FALSE + Result = false; + break; + case 4: // _MM_CMPINT_NE + Result = (A != B); + break; + case 5: // _MM_CMPINT_NLT (>=) + Result = IsUnsigned ? A.uge(B) : A.sge(B); + break; + case 6: // _MM_CMPINT_NLE (>) + Result = IsUnsigned ? A.ugt(B) : A.sgt(B); + break; + case 7: // _MM_CMPINT_TRUE + Result = true; + break; + } + + RetMask.setBitVal(ElemNum, Mask[ElemNum] && Result); + } + + return Success(APValue(RetMask), E); + } } } diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index 0fcfe3779fa19..263a1079b26d5 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -2385,22 +2385,19 @@ _mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A) (__mmask32) __U); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 -_mm_test_epi8_mask (__m128i __A, __m128i __B) -{ +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_test_epi8_mask(__m128i __A, __m128i __B) { return _mm_cmpneq_epi8_mask (_mm_and_si128(__A, __B), _mm_setzero_si128()); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 -_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) -{ +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { return _mm_mask_cmpneq_epi8_mask (__U, _mm_and_si128 (__A, __B), _mm_setzero_si128()); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 -_mm256_test_epi8_mask (__m256i __A, __m256i __B) -{ +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_test_epi8_mask(__m256i __A, __m256i __B) { return _mm256_cmpneq_epi8_mask (_mm256_and_si256(__A, __B), _mm256_setzero_si256()); } @@ -2439,9 +2436,8 @@ _mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) _mm256_setzero_si256()); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 -_mm_testn_epi8_mask (__m128i __A, __m128i __B) -{ +static __inline__ __mmask16 
__DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_testn_epi8_mask(__m128i __A, __m128i __B) { return _mm_cmpeq_epi8_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); } diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c index 116d86fcd597d..febef46458ae9 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c @@ -645,6 +645,21 @@ __mmask16 test_mm_cmp_epi8_mask(__m128i __a, __m128i __b) { return (__mmask16)_mm_cmp_epi8_mask(__a, __b, 0); } +TEST_CONSTEXPR(_mm_cmpeq_epi8_mask( + ((__m128i)(__v16qi){5, 3, 7, 2, 9, 3, 7, 1, 5, 4, 8, 2, 9, 6, 7, 5}), + ((__m128i)(__v16qi){5, 2, 7, 3, 9, 4, 6, 1, 5, 3, 8, 1, 9, 5, 7, 5}) +) == (__mmask16)0xd595); + +TEST_CONSTEXPR(_mm_cmplt_epi8_mask( + ((__m128i)(__v16qi){1, 5, 3, 7, 2, 8, 4, 6, 9, 5, 3, 11, 2, 6, 15, 8}), + ((__m128i)(__v16qi){2, 4, 6, 8, 3, 5, 7, 9, 4, 6, 8, 10, 5, 7, 9, 11}) +) == (__mmask16)0xb6dd); + +TEST_CONSTEXPR(_mm_cmple_epi8_mask( + ((__m128i)(__v16qi){1, 3, 5, 7, 2, 6, 6, 8, 1, 3, 9, 7, 2, 4, 6, 10}), + ((__m128i)(__v16qi){2, 3, 4, 7, 3, 4, 5, 8, 2, 3, 4, 7, 3, 4, 5, 8}) +) == (__mmask16)0x3b9b); + __mmask16 test_mm_mask_cmp_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { // CHECK-LABEL: test_mm_mask_cmp_epi8_mask // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}} @@ -2894,6 +2909,12 @@ __mmask16 test_mm_test_epi8_mask(__m128i __A, __m128i __B) { return _mm_test_epi8_mask(__A, __B); } +TEST_CONSTEXPR(_mm_test_epi8_mask( + (__m128i)(__v16qi){1, 2, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + (__m128i)(__v16qi){1, 2, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} +) +== (__mmask16)0xfffb); + __mmask16 test_mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_test_epi8_mask // CHECK: and <2 x i64> %{{.*}}, %{{.*}} @@ -2901,6 +2922,12 @@ __mmask16 test_mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return 
_mm_mask_test_epi8_mask(__U, __A, __B); } +TEST_CONSTEXPR(_mm_mask_test_epi8_mask( + 0xFFFF, + (__m128i)(__v16qi){1, 2, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + (__m128i)(__v16qi){1, 2, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} +) +== (__mmask16)0xfffb); __mmask32 test_mm256_test_epi8_mask(__m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_test_epi8_mask @@ -2908,6 +2935,11 @@ __mmask32 test_mm256_test_epi8_mask(__m256i __A, __m256i __B) { // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}} return _mm256_test_epi8_mask(__A, __B); } +TEST_CONSTEXPR(_mm256_test_epi8_mask( + (__m256i)(__v32qi){1, 2, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + (__m256i)(__v32qi){1, 2, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} +) +== (__mmask32)0xfffbfffb); __mmask32 test_mm256_mask_test_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_test_epi8_mask @@ -2954,6 +2986,12 @@ __mmask16 test_mm_testn_epi8_mask(__m128i __A, __m128i __B) { return _mm_testn_epi8_mask(__A, __B); } +TEST_CONSTEXPR(_mm_testn_epi8_mask( + (__m128i)(__v16qi){1, 2, 77, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 16, 16}, + (__m128i)(__v16qi){2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15} +) +== (__mmask16)0xe001); + __mmask16 test_mm_mask_testn_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_testn_epi8_mask // CHECK: and <2 x i64> %{{.*}}, %{{.*}} From 314bb6165579acb59716094c97a85e038be28e5e Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Tue, 28 Oct 2025 17:23:15 +0100 Subject: [PATCH 033/539] [CIR] Upstream Try block with only noexcept calls (#165153) Upstream try block with only noexcept calls inside, which doesn't need to be converted to TryCallOp Issue https://github.com/llvm/llvm-project/issues/154992 --- .../lib/CIR/Dialect/Transforms/FlattenCFG.cpp | 4 ++- clang/test/CIR/CodeGen/try-catch.cpp | 30 
+++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp index 21c96febf8403..ca7554e4e3754 100644 --- a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp +++ b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp @@ -606,10 +606,12 @@ class CIRTryOpFlattening : public mlir::OpRewritePattern { // `cir.try_call`. llvm::SmallVector callsToRewrite; tryOp.getTryRegion().walk([&](CallOp op) { + if (op.getNothrow()) + return; + // Only grab calls within immediate closest TryOp scope. if (op->getParentOfType() != tryOp) return; - assert(!cir::MissingFeatures::opCallExceptionAttr()); callsToRewrite.push_back(op); }); diff --git a/clang/test/CIR/CodeGen/try-catch.cpp b/clang/test/CIR/CodeGen/try-catch.cpp index 1e4d2a63ada01..27e3d8ef41115 100644 --- a/clang/test/CIR/CodeGen/try-catch.cpp +++ b/clang/test/CIR/CodeGen/try-catch.cpp @@ -164,3 +164,33 @@ void try_catch_with_alloca() { // OGCG: %[[TMP_B:.*]] = load i32, ptr %[[B_ADDR]], align 4 // OGCG: %[[RESULT:.*]] = add nsw i32 %[[TMP_A]], %[[TMP_B]] // OGCG: store i32 %[[RESULT]], ptr %[[C_ADDR]], align 4 + +void function_with_noexcept() noexcept; + +void calling_noexcept_function_inside_try_block() { + try { + function_with_noexcept(); + } catch (...) 
{ + } +} + +// CIR: cir.scope { +// CIR: cir.try { +// CIR: cir.call @_Z22function_with_noexceptv() nothrow : () -> () +// CIR: cir.yield +// CIR: } +// CIR: } + +// LLVM: br label %[[LABEL_1:.*]] +// LLVM: [[LABEL_1]]: +// LLVM: br label %[[LABEL_2:.*]] +// LLVM: [[LABEL_2]]: +// LLVM: call void @_Z22function_with_noexceptv() +// LLVM: br label %[[LABEL_3:.*]] +// LLVM: [[LABEL_3]]: +// LLVM: br label %[[LABEL_4:.*]] +// LLVM: [[LABEL_4]]: +// LLVM: ret void + +// OGCG: call void @_Z22function_with_noexceptv() +// OGCG: ret void From b48251187b660cdc8ff49d828fd29d2773bfce4a Mon Sep 17 00:00:00 2001 From: Jorn Tuyls Date: Tue, 28 Oct 2025 17:24:35 +0100 Subject: [PATCH 034/539] [MemRef] Implement value bounds interface for CollapseShapeOp (#164955) --- .../MemRef/IR/ValueBoundsOpInterfaceImpl.cpp | 23 +++++++++++++++++++ .../value-bounds-op-interface-impl.mlir | 18 +++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp index 6fa8ce4efff3b..3aa801b48a2e9 100644 --- a/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp @@ -98,6 +98,27 @@ struct RankOpInterface } }; +struct CollapseShapeOpInterface + : public ValueBoundsOpInterface::ExternalModel { + void populateBoundsForShapedValueDim(Operation *op, Value value, int64_t dim, + ValueBoundsConstraintSet &cstr) const { + auto collapseOp = cast(op); + assert(value == collapseOp.getResult() && "invalid value"); + + // Multiply the expressions for the dimensions in the reassociation group. 
+ const ReassociationIndices &reassocIndices = + collapseOp.getReassociationIndices()[dim]; + AffineExpr productExpr = + cstr.getExpr(collapseOp.getSrc(), reassocIndices[0]); + for (size_t i = 1; i < reassocIndices.size(); ++i) { + productExpr = + productExpr * cstr.getExpr(collapseOp.getSrc(), reassocIndices[i]); + } + cstr.bound(value)[dim] == productExpr; + } +}; + struct SubViewOpInterface : public ValueBoundsOpInterface::ExternalModel { @@ -134,6 +155,8 @@ void mlir::memref::registerValueBoundsOpInterfaceExternalModels( memref::AllocOpInterface>(*ctx); memref::CastOp::attachInterface(*ctx); memref::DimOp::attachInterface(*ctx); + memref::CollapseShapeOp::attachInterface( + *ctx); memref::ExpandShapeOp::attachInterface( *ctx); memref::GetGlobalOp::attachInterface(*ctx); diff --git a/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir b/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir index f9b81dfc7d468..d0aec68d54988 100644 --- a/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir +++ b/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir @@ -77,6 +77,24 @@ func.func @memref_expand(%m: memref, %sz: index) -> (index, index) { // ----- +// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 2)> +// CHECK-LABEL: func @memref_collapse( +// CHECK-SAME: %[[sz0:.*]]: index +// CHECK-DAG: %[[c2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[c12:.*]] = arith.constant 12 : index +// CHECK: %[[dim:.*]] = memref.dim %{{.*}}, %[[c2]] : memref<3x4x?x2xf32> +// CHECK: %[[mul:.*]] = affine.apply #[[$MAP]]()[%[[dim]]] +// CHECK: return %[[c12]], %[[mul]] +func.func @memref_collapse(%sz0: index) -> (index, index) { + %0 = memref.alloc(%sz0) : memref<3x4x?x2xf32> + %1 = memref.collapse_shape %0 [[0, 1], [2, 3]] : memref<3x4x?x2xf32> into memref<12x?xf32> + %2 = "test.reify_bound"(%1) {dim = 0} : (memref<12x?xf32>) -> (index) + %3 = "test.reify_bound"(%1) {dim = 1} : (memref<12x?xf32>) -> (index) + return %2, %3 : index, index +} + +// ----- 
+ // CHECK-LABEL: func @memref_get_global( // CHECK: %[[c4:.*]] = arith.constant 4 : index // CHECK: return %[[c4]] From e15e57224c97362921e418090787c58b1e0821a3 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Tue, 28 Oct 2025 12:42:39 -0400 Subject: [PATCH 035/539] [mlir][amdgpu][rocdl] Add gfx1250 wmma ops (#165064) Update `amdgpu.wmma` op definition and implement amdgpu to rocdl conversion for new variants. --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 27 ++- .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 175 +++++++++++++++--- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 10 +- .../AMDGPUToROCDL/wmma-gfx1250.mlir | 89 +++++++++ mlir/test/Dialect/AMDGPU/invalid.mlir | 60 +++++- mlir/test/Dialect/AMDGPU/ops.mlir | 35 ++++ 6 files changed, 346 insertions(+), 50 deletions(-) create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx1250.mlir diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 37db096f1ba75..45cb67f0eee4a 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -912,9 +912,10 @@ def ScaledMFMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[32], [F8E5M2, F8E4M3FN VectorOfLengthAndType<[32], [F6E2M3FN, F6E3M2FN, F4E2M1FN]>]>; def ScaledMFMAOutTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 16], [F32]>]>; // wmma -def WMMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 8, 16], [F16, BF16]>, - VectorOfLengthAndType<[4, 8, 16], [I8, SI8, UI8]>, - VectorOfLengthAndType<[4, 8], [F8E4M3FN, F8E5M2]>, +def WMMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[2], [F32]>, + VectorOfLengthAndType<[4, 8, 16], [F16, BF16]>, + VectorOfLengthAndType<[4, 8, 16, 32], [I8, SI8, UI8]>, + VectorOfLengthAndType<[4, 8, 32, 64], [F8E4M3FN, F8E5M2]>, VectorOfLengthAndType<[4, 8, 16], [I<4>, SI<4>, UI<4>]>]>; def WMMAOutTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 8], [F32, I32]>, VectorOfLengthAndType<[4, 8, 16], [F16, BF16]>]>; @@ -992,7 +993,7 @@ def AMDGPU_WMMAOp 
: Arguments<(ins ConfinedAttr]>:$m, ConfinedAttr]>:$n, - ConfinedAttr]>:$k, + ConfinedAttr]>:$k, WMMAInTypes:$sourceA, WMMAInTypes:$sourceB, WMMAOutTypes:$destC, @@ -1005,8 +1006,14 @@ def AMDGPU_WMMAOp : let description = [{ The `amdgpu.wmma` op is an MLIR wrapper around intrinsics for various `wmma` instructions in the AMDGPU architecture, which perform matrix multiplication. - Note that all wmma intrinsics have M=N=16 dimensions but vary by in allowed K - dimensions. + + On gfx11/RDNA3, wmma intrinsics have M=N=K=16 dimensions. + + On gfx12/RDNA4, wmma intrinsics have M=N=16 dimensions and support K=16 for + all element types, and K=32 for i4 sources. + + On gfx1250, wmma intrinsics have M=N=16 and K dimensions of 4, 32, 64, or 128, + depending on the element types. On gfx11/RDNA3, emitting f16->f16 (or bf16->bf16) wmma the output is a 16xf16 (or 16xbf16) vector containing only 8 valid values: @@ -1022,7 +1029,13 @@ def AMDGPU_WMMAOp : Example: ```mlir - %0 = amdgpu.wmma 16x16x16 %matA * %matB + %matC : vector<16xf16>, vector<16xf16>, vector<8xf16> + %0 = amdgpu.wmma 16x16x16 %matA * %matB + %matC : vector<8xf16>, vector<8xf16>, vector<8xf16> + + %1 = amdgpu.wmma 16x16x64 %matD * %matE + %matF : vector<32xi8>, vector<8xf32>, vector<8xf32> + + %2 = amdgpu.wmma 16x16x128 %matG * %matH + %matI : vector<64xf4E2M1FN>, vector<64xf4E2M1FN>, vector<8xf32> + + %3 = amdgpu.wmma 16x16x4 %matJ * %matK + %matL : vector<2xf32>, vector<2xf32>, vector<8xf32> ``` }]; let assemblyFormat = [{ diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 478b6aaaec83a..1eca43d96fe85 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -989,21 +989,17 @@ mfmaOpToScaledIntrinsic(ScaledMFMAOp smfma, Chipset chipset) { smfma.getN(), smfma.getK(), 1u, chipset); } -/// Return the `rocdl` intrinsic corresponding to a WMMA operation `wmma` -/// if one 
exists. This includes checking to ensure the intrinsic is supported -/// on the architecture you are compiling for. -static std::optional wmmaOpToIntrinsic(WMMAOp wmma, - Chipset chipset) { - auto sourceVectorType = cast(wmma.getSourceA().getType()); - auto sourceBVectorType = cast(wmma.getSourceB().getType()); - auto destVectorType = cast(wmma.getDestC().getType()); - Type elemSourceType = sourceVectorType.getElementType(); - Type elemBSourceType = sourceBVectorType.getElementType(); - Type elemDestType = destVectorType.getElementType(); - - const uint32_t k = wmma.getK(); - +/// Returns the `rocdl` intrinsic corresponding to a WMMA operation `wmma` +/// for RDNA3/4 architectures. +static std::optional +wmmaOpToIntrinsicRDNA(Type elemSourceType, Type elemBSourceType, + Type elemDestType, uint32_t k, bool isRDNA3) { + using fp8 = Float8E4M3FNType; + using bf8 = Float8E5M2Type; + + // Handle k == 16 for RDNA3/4. if (k == 16) { + // Common patterns for RDNA3 and RDNA4. if (elemSourceType.isF16() && elemDestType.isF32()) return ROCDL::wmma_f32_16x16x16_f16::getOperationName(); if (elemSourceType.isBF16() && elemDestType.isF32()) @@ -1014,39 +1010,160 @@ static std::optional wmmaOpToIntrinsic(WMMAOp wmma, return ROCDL::wmma_bf16_16x16x16_bf16::getOperationName(); if (elemSourceType.isInteger(8) && elemDestType.isInteger(32)) return ROCDL::wmma_i32_16x16x16_iu8::getOperationName(); - if (chipset.majorVersion == 11) { + + // RDNA3 specific patterns. + if (isRDNA3) { if (elemSourceType.isInteger(4) && elemDestType.isInteger(32)) return ROCDL::wmma_i32_16x16x16_iu4::getOperationName(); + return std::nullopt; } - } - if (chipset.majorVersion < 12) - return std::nullopt; - // gfx12+ - if (k == 16) { - if (isa(elemSourceType) && - isa(elemBSourceType) && elemDestType.isF32()) + // RDNA4 specific patterns (fp8/bf8). 
+ if (isa(elemSourceType) && isa(elemBSourceType) && + elemDestType.isF32()) return ROCDL::wmma_f32_16x16x16_fp8_fp8::getOperationName(); - if (isa(elemSourceType) && - isa(elemBSourceType) && elemDestType.isF32()) + if (isa(elemSourceType) && isa(elemBSourceType) && + elemDestType.isF32()) return ROCDL::wmma_f32_16x16x16_fp8_bf8::getOperationName(); - if (isa(elemSourceType) && - isa(elemBSourceType) && elemDestType.isF32()) + if (isa(elemSourceType) && isa(elemBSourceType) && + elemDestType.isF32()) return ROCDL::wmma_f32_16x16x16_bf8_bf8::getOperationName(); - if (isa(elemSourceType) && - isa(elemBSourceType) && elemDestType.isF32()) + if (isa(elemSourceType) && isa(elemBSourceType) && + elemDestType.isF32()) return ROCDL::wmma_f32_16x16x16_bf8_fp8::getOperationName(); if (elemSourceType.isInteger(4) && elemDestType.isInteger(32)) return ROCDL::wmma_i32_16x16x16_iu4::getOperationName(); return std::nullopt; } - if (k == 32) { + + // Handle k == 32 for RDNA4. + if (k == 32 && !isRDNA3) { if (elemSourceType.isInteger(4) && elemDestType.isInteger(32)) return ROCDL::wmma_i32_16x16x32_iu4::getOperationName(); + } + + llvm_unreachable("Unsupported k value"); +} + +/// Return the `rocdl` intrinsic corresponding to a WMMA operation `wmma` +/// for the gfx1250 architecture. 
+static std::optional wmmaOpToIntrinsicGfx1250(Type elemSourceType, + Type elemBSourceType, + Type elemDestType, + uint32_t k) { + using fp8 = Float8E4M3FNType; + using bf8 = Float8E5M2Type; + + if (k == 4) { + if (elemSourceType.isF32() && elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x4_f32::getOperationName(); + return std::nullopt; } + if (k == 32) { + if (elemSourceType.isF16() && elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x32_f16::getOperationName(); + if (elemSourceType.isBF16() && elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x32_bf16::getOperationName(); + if (elemSourceType.isF16() && elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x32_f16::getOperationName(); + if (elemSourceType.isBF16() && elemDestType.isBF16()) + return ROCDL::wmma_bf16_16x16x32_bf16::getOperationName(); + + return std::nullopt; + } + + if (k == 64) { + if (isa(elemSourceType) && isa(elemBSourceType)) { + if (elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x64_fp8_fp8::getOperationName(); + if (elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x64_fp8_fp8::getOperationName(); + } + if (isa(elemSourceType) && isa(elemBSourceType)) { + if (elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x64_fp8_bf8::getOperationName(); + if (elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x64_fp8_bf8::getOperationName(); + } + if (isa(elemSourceType) && isa(elemBSourceType)) { + if (elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x64_bf8_bf8::getOperationName(); + if (elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x64_bf8_bf8::getOperationName(); + } + if (isa(elemSourceType) && isa(elemBSourceType)) { + if (elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x64_bf8_fp8::getOperationName(); + if (elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x64_bf8_fp8::getOperationName(); + } + if (elemSourceType.isInteger(8) && elemDestType.isInteger(32)) + return ROCDL::wmma_i32_16x16x64_iu8::getOperationName(); + + return std::nullopt; + } + + if (k == 128) { + 
if (isa(elemSourceType) && isa(elemBSourceType)) { + if (elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x128_fp8_fp8::getOperationName(); + if (elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x128_fp8_fp8::getOperationName(); + } + if (isa(elemSourceType) && isa(elemBSourceType)) { + if (elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x128_fp8_bf8::getOperationName(); + if (elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x128_fp8_bf8::getOperationName(); + } + if (isa(elemSourceType) && isa(elemBSourceType)) { + if (elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x128_bf8_bf8::getOperationName(); + if (elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x128_bf8_bf8::getOperationName(); + } + if (isa(elemSourceType) && isa(elemBSourceType)) { + if (elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x128_bf8_fp8::getOperationName(); + if (elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x128_bf8_fp8::getOperationName(); + } + + return std::nullopt; + } + + llvm_unreachable("Unsupported k value"); +} + +/// Returns the `rocdl` intrinsic corresponding to a WMMA operation `wmma` +/// if one exists. This includes checking to ensure the intrinsic is supported +/// on the architecture you are compiling for. +static std::optional wmmaOpToIntrinsic(WMMAOp wmma, + Chipset chipset) { + auto sourceVectorType = cast(wmma.getSourceA().getType()); + auto sourceBVectorType = cast(wmma.getSourceB().getType()); + auto destVectorType = cast(wmma.getDestC().getType()); + Type elemSourceType = sourceVectorType.getElementType(); + Type elemBSourceType = sourceBVectorType.getElementType(); + Type elemDestType = destVectorType.getElementType(); + + const uint32_t k = wmma.getK(); + const bool isRDNA3 = chipset.majorVersion == 11; + const bool isRDNA4 = chipset.majorVersion == 12 && chipset.minorVersion == 0; + + // Handle RDNA3 and RDNA4. 
+ if (isRDNA3 || isRDNA4) + return wmmaOpToIntrinsicRDNA(elemSourceType, elemBSourceType, elemDestType, + k, isRDNA3); + + // Handle gfx1250. + if (chipset == Chipset{12, 5, 0}) + return wmmaOpToIntrinsicGfx1250(elemSourceType, elemBSourceType, + elemDestType, k); + llvm_unreachable("unhandled WMMA case"); } diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 585b6dacfa648..df955fc90b45f 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -399,13 +399,15 @@ LogicalResult WMMAOp::verify() { if (!sourceAElemType.isFloat(8) && sourceAElemType != sourceBElemType) { return emitOpError( - "source element types much match (except for fp8) but have ") + "source element types must match (except for fp8/bf8) but have ") << sourceAType << " and " << sourceBType; } - if (!sourceAElemType.isInteger(4) && getK() != 16) { - return emitOpError("K dimension must be 16 for source element type ") - << sourceAElemType; + if (isSrcFloat) { + if (getClamp()) + return emitOpError("clamp flag is not supported for float types"); + if (getUnsignedA() || getUnsignedB()) + return emitOpError("unsigned flags are not supported for float types"); } return success(); } diff --git a/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx1250.mlir new file mode 100644 index 0000000000000..bcbdef040ebe3 --- /dev/null +++ b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx1250.mlir @@ -0,0 +1,89 @@ +// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1250 --allow-unregistered-dialect | FileCheck %s + +// CHECK-LABEL: @wmma_k4 +func.func @wmma_k4(%arg0 : vector<2xf32>, %arg1 : vector<8xf32>) { + // CHECK: rocdl.wmma.f32.16x16x4.f32 %arg0, %arg0, %arg1 + amdgpu.wmma 16x16x4 %arg0 * %arg0 + %arg1 : vector<2xf32>, vector<2xf32>, vector<8xf32> + func.return +} + +// CHECK-LABEL: @wmma_k32 +func.func @wmma_k32(%arg0 : vector<16xf16>, %arg1 : 
vector<16xbf16>, %arg2 : vector<8xf32>, + %arg3 : vector<8xf16>, %arg4 : vector<8xbf16>) { + // CHECK: rocdl.wmma.f32.16x16x32.f16 %arg0, %arg0, %arg2 + amdgpu.wmma 16x16x32 %arg0 * %arg0 + %arg2 : vector<16xf16>, vector<16xf16>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x32.f16 %arg0, %arg0, {{.*}} : (vector<16xf16>, vector<16xf16>, vector<8xf16>, i1) + amdgpu.wmma 16x16x32 %arg0 * %arg0 + %arg3 : vector<16xf16>, vector<16xf16>, vector<8xf16> + + // CHECK: rocdl.wmma.f32.16x16x32.bf16 {{.*}}, {{.*}}, %arg2 + amdgpu.wmma 16x16x32 %arg1 * %arg1 + %arg2 : vector<16xbf16>, vector<16xbf16>, vector<8xf32> + + // CHECK: rocdl.wmma.bf16.16x16x32.bf16 {{.*}}, {{.*}}, {{.*}}, {{.*}} : (vector<16xi16>, vector<16xi16>, vector<8xi16>, i1) + amdgpu.wmma 16x16x32 %arg1 * %arg1 + %arg4 : vector<16xbf16>, vector<16xbf16>, vector<8xbf16> + + func.return +} + +// CHECK-LABEL: @wmma_k64 +func.func @wmma_k64(%arg0 : vector<32xi8>, %arg1 : vector<32xf8E4M3FN>, %arg2 : vector<32xf8E5M2>, + %arg3 : vector<8xi32>, %arg4 : vector<8xf32>, %arg5 : vector<8xf16>) { + // CHECK: rocdl.wmma.i32.16x16x64.iu8 {{.*}}, {{.*}}, {{.*}}, {{.*}}, %arg3, {{.*}} + amdgpu.wmma 16x16x64 %arg0 * %arg0 + %arg3 {clamp} : vector<32xi8>, vector<32xi8>, vector<8xi32> + + // CHECK: rocdl.wmma.f32.16x16x64.fp8_fp8 {{.*}}, {{.*}}, %arg4 + amdgpu.wmma 16x16x64 %arg1 * %arg1 + %arg4 : vector<32xf8E4M3FN>, vector<32xf8E4M3FN>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x64.fp8_fp8 {{.*}}, {{.*}}, %arg5, {{.*}} : (vector<8xi32>, vector<8xi32>, vector<8xf16>, i1) + amdgpu.wmma 16x16x64 %arg1 * %arg1 + %arg5 : vector<32xf8E4M3FN>, vector<32xf8E4M3FN>, vector<8xf16> + + // CHECK: rocdl.wmma.f32.16x16x64.fp8_bf8 {{.*}}, {{.*}}, %arg4 + amdgpu.wmma 16x16x64 %arg1 * %arg2 + %arg4 : vector<32xf8E4M3FN>, vector<32xf8E5M2>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x64.fp8_bf8 {{.*}}, {{.*}}, %arg5, {{.*}} : (vector<8xi32>, vector<8xi32>, vector<8xf16>, i1) + amdgpu.wmma 16x16x64 %arg1 * %arg2 + %arg5 : 
vector<32xf8E4M3FN>, vector<32xf8E5M2>, vector<8xf16> + + // CHECK: rocdl.wmma.f32.16x16x64.bf8_bf8 {{.*}}, {{.*}}, %arg4 + amdgpu.wmma 16x16x64 %arg2 * %arg2 + %arg4 : vector<32xf8E5M2>, vector<32xf8E5M2>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x64.bf8_bf8 {{.*}}, {{.*}}, %arg5, {{.*}} : (vector<8xi32>, vector<8xi32>, vector<8xf16>, i1) + amdgpu.wmma 16x16x64 %arg2 * %arg2 + %arg5 : vector<32xf8E5M2>, vector<32xf8E5M2>, vector<8xf16> + + // CHECK: rocdl.wmma.f32.16x16x64.bf8_fp8 {{.*}}, {{.*}}, %arg4 + amdgpu.wmma 16x16x64 %arg2 * %arg1 + %arg4 : vector<32xf8E5M2>, vector<32xf8E4M3FN>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x64.bf8_fp8 {{.*}}, {{.*}}, %arg5, {{.*}} : (vector<8xi32>, vector<8xi32>, vector<8xf16>, i1) + amdgpu.wmma 16x16x64 %arg2 * %arg1 + %arg5 : vector<32xf8E5M2>, vector<32xf8E4M3FN>, vector<8xf16> + + func.return +} + +// CHECK-LABEL: @wmma_k128 +func.func @wmma_k128(%arg0 : vector<64xf8E4M3FN>, %arg1 : vector<64xf8E5M2>, + %arg2 : vector<8xf32>, %arg3 : vector<8xf16>) { + // CHECK: rocdl.wmma.f32.16x16x128.fp8_fp8 {{.*}}, {{.*}}, %arg2 + amdgpu.wmma 16x16x128 %arg0 * %arg0 + %arg2 : vector<64xf8E4M3FN>, vector<64xf8E4M3FN>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x128.fp8_fp8 {{.*}}, {{.*}}, %arg3, {{.*}} : (vector<16xi32>, vector<16xi32>, vector<8xf16>, i1) + amdgpu.wmma 16x16x128 %arg0 * %arg0 + %arg3 : vector<64xf8E4M3FN>, vector<64xf8E4M3FN>, vector<8xf16> + + // CHECK: rocdl.wmma.f32.16x16x128.fp8_bf8 {{.*}}, {{.*}}, %arg2 + amdgpu.wmma 16x16x128 %arg0 * %arg1 + %arg2 : vector<64xf8E4M3FN>, vector<64xf8E5M2>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x128.fp8_bf8 {{.*}}, {{.*}}, %arg3, {{.*}} : (vector<16xi32>, vector<16xi32>, vector<8xf16>, i1) + amdgpu.wmma 16x16x128 %arg0 * %arg1 + %arg3 : vector<64xf8E4M3FN>, vector<64xf8E5M2>, vector<8xf16> + + // CHECK: rocdl.wmma.f32.16x16x128.bf8_bf8 {{.*}}, {{.*}}, %arg2 + amdgpu.wmma 16x16x128 %arg1 * %arg1 + %arg2 : vector<64xf8E5M2>, vector<64xf8E5M2>, 
vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x128.bf8_bf8 {{.*}}, {{.*}}, %arg3, {{.*}} : (vector<16xi32>, vector<16xi32>, vector<8xf16>, i1) + amdgpu.wmma 16x16x128 %arg1 * %arg1 + %arg3 : vector<64xf8E5M2>, vector<64xf8E5M2>, vector<8xf16> + + // CHECK: rocdl.wmma.f32.16x16x128.bf8_fp8 {{.*}}, {{.*}}, %arg2 + amdgpu.wmma 16x16x128 %arg1 * %arg0 + %arg2 : vector<64xf8E5M2>, vector<64xf8E4M3FN>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x128.bf8_fp8 {{.*}}, {{.*}}, %arg3, {{.*}} : (vector<16xi32>, vector<16xi32>, vector<8xf16>, i1) + amdgpu.wmma 16x16x128 %arg1 * %arg0 + %arg3 : vector<64xf8E5M2>, vector<64xf8E4M3FN>, vector<8xf16> + + func.return +} diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 57847641a2d03..4c6f62a045405 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -156,14 +156,6 @@ func.func @wmma_no_k_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vector // ----- -func.func @wmma_wrong_m_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vector<8xi32> { - // expected-error@+1 {{'amdgpu.wmma' op attribute 'm' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16}}} - %0 = amdgpu.wmma 32x16x16 %arg0 * %arg0 + %arg1 : vector<16xi8>, vector<16xi8>, vector<8xi32> - func.return %0 : vector<8xi32> -} - -// ----- - func.func @wmma_wrong_n_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vector<8xi32> { // expected-error@+1 {{'amdgpu.wmma' op attribute 'n' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16}}} %0 = amdgpu.wmma 16x32x16 %arg0 * %arg0 + %arg1 : vector<16xi8>, vector<16xi8>, vector<8xi32> @@ -173,14 +165,62 @@ func.func @wmma_wrong_n_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vec // ----- func.func @wmma_wrong_k_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vector<8xi32> { - // expected-error@+1 {{'amdgpu.wmma' op attribute 'k' failed to 
satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}} + // expected-error@+1 {{'amdgpu.wmma' op attribute 'k' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {4, 16, 32, 64, 128}}} %0 = amdgpu.wmma 16x16x24 %arg0 * %arg0 + %arg1 : vector<16xi8>, vector<16xi8>, vector<8xi32> func.return %0 : vector<8xi32> } // ----- -// Missinng `resetOffset` +func.func @wmma_source_length_mismatch(%arg0 : vector<8xf16>, %arg1 : vector<16xf16>, %arg2 : vector<8xf32>) -> vector<8xf32> { + // expected-error@+1 {{'amdgpu.wmma' op source vectors have different lengths}} + %0 = amdgpu.wmma 16x16x16 %arg0 * %arg1 + %arg2 : vector<8xf16>, vector<16xf16>, vector<8xf32> + func.return %0 : vector<8xf32> +} + +// ----- + +func.func @wmma_mismatched_float_types(%arg0 : vector<8xf16>, %arg1 : vector<8xbf16>, %arg2 : vector<8xf32>) -> vector<8xf32> { + // expected-error@+1 {{'amdgpu.wmma' op source element types must match (except for fp8/bf8)}} + %0 = amdgpu.wmma 16x16x16 %arg0 * %arg1 + %arg2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> + func.return %0 : vector<8xf32> +} + +// ----- + +func.func @wmma_mismatched_int_types(%arg0 : vector<8xi8>, %arg1 : vector<8xi4>, %arg2 : vector<8xi32>) -> vector<8xi32> { + // expected-error@+1 {{'amdgpu.wmma' op source element types must match (except for fp8/bf8)}} + %0 = amdgpu.wmma 16x16x16 %arg0 * %arg1 + %arg2 : vector<8xi8>, vector<8xi4>, vector<8xi32> + func.return %0 : vector<8xi32> +} + +// ----- + +func.func @wmma_clamp_float(%arg0 : vector<8xf16>, %arg1 : vector<8xf32>) -> vector<8xf32> { + // expected-error@+1 {{'amdgpu.wmma' op clamp flag is not supported for float types}} + %0 = amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 {clamp} : vector<8xf16>, vector<8xf16>, vector<8xf32> + func.return %0 : vector<8xf32> +} + +// ----- + +func.func @wmma_unsignedA_float(%arg0 : vector<8xf16>, %arg1 : vector<8xf32>) -> vector<8xf32> { + // expected-error@+1 {{'amdgpu.wmma' op unsigned 
flags are not supported for float types}} + %0 = amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 {unsignedA} : vector<8xf16>, vector<8xf16>, vector<8xf32> + func.return %0 : vector<8xf32> +} + +// ----- + +func.func @wmma_unsignedB_float(%arg0 : vector<8xf16>, %arg1 : vector<8xf32>) -> vector<8xf32> { + // expected-error@+1 {{'amdgpu.wmma' op unsigned flags are not supported for float types}} + %0 = amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 {unsignedB} : vector<8xf16>, vector<8xf16>, vector<8xf32> + func.return %0 : vector<8xf32> +} + +// ----- + +// Missing `resetOffset` func.func @fat_raw_buffer_cast_stripped_offset(%m: memref<8xi32, strided<[1], offset: ?>, #gpu.address_space>) -> memref<8xi32, #amdgpu.address_space> { // expected-error@+1 {{'amdgpu.fat_raw_buffer_cast' op expected result type to be 'memref<8xi32, strided<[1], offset: ?>, #amdgpu.address_space>' but got 'memref<8xi32, #amdgpu.address_space>'}} %ret = amdgpu.fat_raw_buffer_cast %m : memref<8xi32, strided<[1], offset: ?>, #gpu.address_space> to memref<8xi32, #amdgpu.address_space> diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index a33096750ee23..09134cb4704bb 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -586,6 +586,41 @@ func.func @wmma_i32_16x16x32_i4(%arg0 : vector<16xi4>, %arg1 : vector<8xi32>) -> func.return %0 : vector<8xi32> } +// CHECK-LABEL: func @wmma_f32_16x16x4_f32 +func.func @wmma_f32_16x16x4_f32(%arg0 : vector<2xf32>, %arg1 : vector<8xf32>) -> vector<8xf32> { + // CHECK: amdgpu.wmma 16x16x4 + %0 = amdgpu.wmma 16x16x4 %arg0 * %arg0 + %arg1 : vector<2xf32>, vector<2xf32>, vector<8xf32> + func.return %0 : vector<8xf32> +} + +// CHECK-LABEL: func @wmma_f32_16x16x64_f8 +func.func @wmma_f32_16x16x64_f8(%arg0 : vector<32xf8E4M3FN>, %arg1 : vector<8xf32>) -> vector<8xf32> { + // CHECK: amdgpu.wmma 16x16x64 + %0 = amdgpu.wmma 16x16x64 %arg0 * %arg0 + %arg1 : vector<32xf8E4M3FN>, vector<32xf8E4M3FN>, vector<8xf32> + 
func.return %0 : vector<8xf32> +} + +// CHECK-LABEL: func @wmma_f32_16x16x64_bf8 +func.func @wmma_f32_16x16x64_bf8(%arg0 : vector<32xf8E5M2>, %arg1 : vector<8xf32>) -> vector<8xf32> { + // CHECK: amdgpu.wmma 16x16x64 + %0 = amdgpu.wmma 16x16x64 %arg0 * %arg0 + %arg1 : vector<32xf8E5M2>, vector<32xf8E5M2>, vector<8xf32> + func.return %0 : vector<8xf32> +} + +// CHECK-LABEL: func @wmma_f16_16x16x64_bf8 +func.func @wmma_f16_16x16x64_bf8(%arg0 : vector<32xf8E5M2>, %arg1 : vector<8xf16>) -> vector<8xf16> { + // CHECK: amdgpu.wmma 16x16x64 + %0 = amdgpu.wmma 16x16x64 %arg0 * %arg0 + %arg1 : vector<32xf8E5M2>, vector<32xf8E5M2>, vector<8xf16> + func.return %0 : vector<8xf16> +} + +// CHECK-LABEL: func @wmma_f16_16x16x64_f8 +func.func @wmma_f16_16x16x64_f8(%arg0 : vector<32xf8E4M3FN>, %arg1 : vector<8xf16>) -> vector<8xf16> { + // CHECK: amdgpu.wmma 16x16x64 + %0 = amdgpu.wmma 16x16x64 %arg0 * %arg0 + %arg1 : vector<32xf8E4M3FN>, vector<32xf8E4M3FN>, vector<8xf16> + func.return %0 : vector<8xf16> +} + // CHECK-LABEL: func @swizzle_bitmode func.func @swizzle_bitmode(%arg0 : f32) -> f32 { // CHECK: amdgpu.swizzle_bitmode From 8a692a9420a77bb3fd8436412f384212494093be Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 28 Oct 2025 09:53:56 -0700 Subject: [PATCH 036/539] [MLIR] Revamp RegionBranchOpInterface (#165429) This is still somehow a WIP, we have some issues with this interface that are not trivial to solve. This patch tries to make the concepts of RegionBranchPoint and RegionSuccessor more robust and aligned with their definition: - A `RegionBranchPoint` is either the parent (`RegionBranchOpInterface`) op or a `RegionBranchTerminatorOpInterface` operation in a nested region. - A `RegionSuccessor` is either one of the nested region or the parent `RegionBranchOpInterface` Some new methods with reasonnable default implementation are added to help resolving the flow of values across the RegionBranchOpInterface. 
It is still not trivial in the current state to walk the def-use chain backward with this interface. For example when you have the 3rd block argument in the entry block of a for-loop, finding the matching operands requires to know about the hidden loop iterator block argument and where the iterargs start. The API is designed around forward-tracking of the chain unfortunately. Try to reland #161575 ; I suspect a buildbot incremental build issue. --- flang/lib/Optimizer/Dialect/FIROps.cpp | 7 +- .../mlir/Analysis/DataFlow/DenseAnalysis.h | 6 +- .../mlir/Analysis/DataFlow/SparseAnalysis.h | 2 +- mlir/include/mlir/Dialect/SCF/IR/SCFOps.td | 7 + .../mlir/Dialect/Transform/IR/TransformOps.td | 6 +- .../TuneExtension/TuneExtensionOps.td | 2 +- mlir/include/mlir/IR/Diagnostics.h | 2 + mlir/include/mlir/IR/Operation.h | 1 + mlir/include/mlir/IR/Region.h | 2 + .../mlir/Interfaces/ControlFlowInterfaces.h | 104 ++++-- .../mlir/Interfaces/ControlFlowInterfaces.td | 108 +++++- .../AliasAnalysis/LocalAliasAnalysis.cpp | 325 ++++++++++++------ .../Analysis/DataFlow/DeadCodeAnalysis.cpp | 9 +- mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp | 4 +- mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp | 6 +- mlir/lib/Analysis/SliceWalk.cpp | 2 +- mlir/lib/Dialect/Affine/IR/AffineOps.cpp | 50 +-- mlir/lib/Dialect/Async/IR/Async.cpp | 11 +- .../OwnershipBasedBufferDeallocation.cpp | 11 +- mlir/lib/Dialect/EmitC/IR/EmitC.cpp | 8 +- mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 2 +- mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 2 +- mlir/lib/Dialect/SCF/IR/SCF.cpp | 52 +-- .../lib/Dialect/SCF/Transforms/ForToWhile.cpp | 1 - .../Dialect/SCF/Transforms/ForallToFor.cpp | 1 - mlir/lib/Dialect/Shape/IR/Shape.cpp | 2 +- .../SparseTensor/IR/SparseTensorDialect.cpp | 4 +- .../lib/Dialect/Transform/IR/TransformOps.cpp | 37 +- .../TuneExtension/TuneExtensionOps.cpp | 5 +- mlir/lib/IR/Diagnostics.cpp | 4 + mlir/lib/IR/Region.cpp | 15 + mlir/lib/Interfaces/ControlFlowInterfaces.cpp | 305 +++++++++++----- 
mlir/lib/Transforms/RemoveDeadValues.cpp | 25 +- mlir/test/Dialect/SCF/invalid.mlir | 8 +- .../TestDenseBackwardDataFlowAnalysis.cpp | 4 +- mlir/test/lib/Dialect/Test/TestOpDefs.cpp | 26 +- mlir/test/lib/Dialect/Test/TestOps.td | 2 +- .../Interfaces/ControlFlowInterfacesTest.cpp | 38 +- 38 files changed, 828 insertions(+), 378 deletions(-) diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index d0164f32d9b6a..4f97acaa88b7a 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -4484,7 +4484,7 @@ void fir::IfOp::getSuccessorRegions( llvm::SmallVectorImpl ®ions) { // The `then` and the `else` region branch back to the parent operation. if (!point.isParent()) { - regions.push_back(mlir::RegionSuccessor(getResults())); + regions.push_back(mlir::RegionSuccessor(getOperation(), getResults())); return; } @@ -4494,7 +4494,8 @@ void fir::IfOp::getSuccessorRegions( // Don't consider the else region if it is empty. mlir::Region *elseRegion = &this->getElseRegion(); if (elseRegion->empty()) - regions.push_back(mlir::RegionSuccessor()); + regions.push_back( + mlir::RegionSuccessor(getOperation(), getOperation()->getResults())); else regions.push_back(mlir::RegionSuccessor(elseRegion)); } @@ -4513,7 +4514,7 @@ void fir::IfOp::getEntrySuccessorRegions( if (!getElseRegion().empty()) regions.emplace_back(&getElseRegion()); else - regions.emplace_back(getResults()); + regions.emplace_back(getOperation(), getOperation()->getResults()); } } diff --git a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h index 8bcfe51ad7cd1..3c87c453a4cf0 100644 --- a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h @@ -397,7 +397,7 @@ class AbstractDenseBackwardDataFlowAnalysis : public DataFlowAnalysis { /// itself. 
virtual void visitRegionBranchControlFlowTransfer( RegionBranchOpInterface branch, RegionBranchPoint regionFrom, - RegionBranchPoint regionTo, const AbstractDenseLattice &after, + RegionSuccessor regionTo, const AbstractDenseLattice &after, AbstractDenseLattice *before) { meet(before, after); } @@ -526,7 +526,7 @@ class DenseBackwardDataFlowAnalysis /// and "to" regions. virtual void visitRegionBranchControlFlowTransfer( RegionBranchOpInterface branch, RegionBranchPoint regionFrom, - RegionBranchPoint regionTo, const LatticeT &after, LatticeT *before) { + RegionSuccessor regionTo, const LatticeT &after, LatticeT *before) { AbstractDenseBackwardDataFlowAnalysis::visitRegionBranchControlFlowTransfer( branch, regionFrom, regionTo, after, before); } @@ -571,7 +571,7 @@ class DenseBackwardDataFlowAnalysis } void visitRegionBranchControlFlowTransfer( RegionBranchOpInterface branch, RegionBranchPoint regionForm, - RegionBranchPoint regionTo, const AbstractDenseLattice &after, + RegionSuccessor regionTo, const AbstractDenseLattice &after, AbstractDenseLattice *before) final { visitRegionBranchControlFlowTransfer(branch, regionForm, regionTo, static_cast(after), diff --git a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h index 1a33ecf8b5aa9..985573476ab78 100644 --- a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h @@ -286,7 +286,7 @@ class AbstractSparseForwardDataFlowAnalysis : public DataFlowAnalysis { /// and propagating therefrom. 
virtual void visitRegionSuccessors(ProgramPoint *point, RegionBranchOpInterface branch, - RegionBranchPoint successor, + RegionSuccessor successor, ArrayRef lattices); }; diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td index 66174ce0f7928..cd033c140a233 100644 --- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td +++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td @@ -644,6 +644,13 @@ def ForallOp : SCF_Op<"forall", [ /// Returns true if the mapping specified for this forall op is linear. bool usesLinearMapping(); + + /// RegionBranchOpInterface + + OperandRange getEntrySuccessorOperands(RegionSuccessor successor) { + return getInits(); + } + }]; } diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td index 62e66b3dabee8..ed69287410509 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td @@ -25,7 +25,7 @@ include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td" def AlternativesOp : TransformDialectOp<"alternatives", [DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, @@ -624,7 +624,7 @@ def ForeachOp : TransformDialectOp<"foreach", [DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, + "getEntrySuccessorOperands"]>, SingleBlockImplicitTerminator<"::mlir::transform::YieldOp"> ]> { let summary = "Executes the body for each element of the payload"; @@ -1237,7 +1237,7 @@ def SelectOp : TransformDialectOp<"select", def SequenceOp : TransformDialectOp<"sequence", [DeclareOpInterfaceMethods, MatchOpInterface, DeclareOpInterfaceMethods, diff --git a/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td b/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td index d095659fc4838..4079848fd203a 100644 --- a/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td +++ 
b/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td @@ -63,7 +63,7 @@ def KnobOp : Op, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, diff --git a/mlir/include/mlir/IR/Diagnostics.h b/mlir/include/mlir/IR/Diagnostics.h index 7ff718ad7f241..a0a99f4953822 100644 --- a/mlir/include/mlir/IR/Diagnostics.h +++ b/mlir/include/mlir/IR/Diagnostics.h @@ -29,6 +29,7 @@ class MLIRContext; class Operation; class OperationName; class OpPrintingFlags; +class OpWithFlags; class Type; class Value; @@ -199,6 +200,7 @@ class Diagnostic { /// Stream in an Operation. Diagnostic &operator<<(Operation &op); + Diagnostic &operator<<(OpWithFlags op); Diagnostic &operator<<(Operation *op) { return *this << *op; } /// Append an operation with the given printing flags. Diagnostic &appendOp(Operation &op, const OpPrintingFlags &flags); diff --git a/mlir/include/mlir/IR/Operation.h b/mlir/include/mlir/IR/Operation.h index 5569392cf0b41..b2019574a820d 100644 --- a/mlir/include/mlir/IR/Operation.h +++ b/mlir/include/mlir/IR/Operation.h @@ -1114,6 +1114,7 @@ class OpWithFlags { : op(op), theFlags(flags) {} OpPrintingFlags &flags() { return theFlags; } const OpPrintingFlags &flags() const { return theFlags; } + Operation *getOperation() const { return op; } private: Operation *op; diff --git a/mlir/include/mlir/IR/Region.h b/mlir/include/mlir/IR/Region.h index 1fcb316750230..53d461df98710 100644 --- a/mlir/include/mlir/IR/Region.h +++ b/mlir/include/mlir/IR/Region.h @@ -379,6 +379,8 @@ class RegionRange friend RangeBaseT; }; +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, Region ®ion); + } // namespace mlir #endif // MLIR_IR_REGION_H diff --git a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h index d63800c12d132..47afd252c6d68 100644 --- a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h +++ b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h @@ -15,10 +15,16 @@ #define 
MLIR_INTERFACES_CONTROLFLOWINTERFACES_H #include "mlir/IR/OpDefinition.h" +#include "mlir/IR/Operation.h" +#include "llvm/ADT/PointerUnion.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/DebugLog.h" +#include "llvm/Support/raw_ostream.h" namespace mlir { class BranchOpInterface; class RegionBranchOpInterface; +class RegionBranchTerminatorOpInterface; /// This class models how operands are forwarded to block arguments in control /// flow. It consists of a number, denoting how many of the successors block @@ -186,27 +192,40 @@ class RegionSuccessor { public: /// Initialize a successor that branches to another region of the parent /// operation. + /// TODO: the default value for the regionInputs is somehow broken. + /// A region successor should have its input correctly set. RegionSuccessor(Region *region, Block::BlockArgListType regionInputs = {}) - : region(region), inputs(regionInputs) {} + : successor(region), inputs(regionInputs) { + assert(region && "Region must not be null"); + } /// Initialize a successor that branches back to/out of the parent operation. - RegionSuccessor(Operation::result_range results) - : inputs(ValueRange(results)) {} - /// Constructor with no arguments. - RegionSuccessor() : inputs(ValueRange()) {} + /// The target must be one of the recursive parent operations. + RegionSuccessor(Operation *successorOp, Operation::result_range results) + : successor(successorOp), inputs(ValueRange(results)) { + assert(successorOp && "Successor op must not be null"); + } /// Return the given region successor. Returns nullptr if the successor is the /// parent operation. - Region *getSuccessor() const { return region; } + Region *getSuccessor() const { return dyn_cast(successor); } /// Return true if the successor is the parent operation. - bool isParent() const { return region == nullptr; } + bool isParent() const { return isa(successor); } /// Return the inputs to the successor that are remapped by the exit values of /// the current region. 
ValueRange getSuccessorInputs() const { return inputs; } + bool operator==(RegionSuccessor rhs) const { + return successor == rhs.successor && inputs == rhs.inputs; + } + + friend bool operator!=(RegionSuccessor lhs, RegionSuccessor rhs) { + return !(lhs == rhs); + } + private: - Region *region{nullptr}; + llvm::PointerUnion successor{nullptr}; ValueRange inputs; }; @@ -214,64 +233,67 @@ class RegionSuccessor { /// `RegionBranchOpInterface`. /// One can branch from one of two kinds of places: /// * The parent operation (aka the `RegionBranchOpInterface` implementation) -/// * A region within the parent operation. +/// * A RegionBranchTerminatorOpInterface inside a region within the parent +// operation. class RegionBranchPoint { public: /// Returns an instance of `RegionBranchPoint` representing the parent /// operation. static constexpr RegionBranchPoint parent() { return RegionBranchPoint(); } - /// Creates a `RegionBranchPoint` that branches from the given region. - /// The pointer must not be null. - RegionBranchPoint(Region *region) : maybeRegion(region) { - assert(region && "Region must not be null"); - } - - RegionBranchPoint(Region ®ion) : RegionBranchPoint(®ion) {} + /// Creates a `RegionBranchPoint` that branches from the given terminator. + inline RegionBranchPoint(RegionBranchTerminatorOpInterface predecessor); /// Explicitly stops users from constructing with `nullptr`. RegionBranchPoint(std::nullptr_t) = delete; - /// Constructs a `RegionBranchPoint` from the the target of a - /// `RegionSuccessor` instance. - RegionBranchPoint(RegionSuccessor successor) { - if (successor.isParent()) - maybeRegion = nullptr; - else - maybeRegion = successor.getSuccessor(); - } - - /// Assigns a region being branched from. - RegionBranchPoint &operator=(Region ®ion) { - maybeRegion = ®ion; - return *this; - } - /// Returns true if branching from the parent op. 
- bool isParent() const { return maybeRegion == nullptr; } + bool isParent() const { return predecessor == nullptr; } - /// Returns the region if branching from a region. + /// Returns the terminator if branching from a region. /// A null pointer otherwise. - Region *getRegionOrNull() const { return maybeRegion; } + Operation *getTerminatorPredecessorOrNull() const { return predecessor; } /// Returns true if the two branch points are equal. friend bool operator==(RegionBranchPoint lhs, RegionBranchPoint rhs) { - return lhs.maybeRegion == rhs.maybeRegion; + return lhs.predecessor == rhs.predecessor; } private: // Private constructor to encourage the use of `RegionBranchPoint::parent`. - constexpr RegionBranchPoint() : maybeRegion(nullptr) {} + constexpr RegionBranchPoint() = default; /// Internal encoding. Uses nullptr for representing branching from the parent - /// op and the region being branched from otherwise. - Region *maybeRegion; + /// op and the region terminator being branched from otherwise. + Operation *predecessor = nullptr; }; inline bool operator!=(RegionBranchPoint lhs, RegionBranchPoint rhs) { return !(lhs == rhs); } +inline llvm::raw_ostream &operator<<(llvm::raw_ostream &os, + RegionBranchPoint point) { + if (point.isParent()) + return os << ""; + return os << "getParentRegion() + ->getRegionNumber() + << ", terminator " + << OpWithFlags(point.getTerminatorPredecessorOrNull(), + OpPrintingFlags().skipRegions()) + << ">"; +} + +inline llvm::raw_ostream &operator<<(llvm::raw_ostream &os, + RegionSuccessor successor) { + if (successor.isParent()) + return os << ""; + return os << "getRegionNumber() + << " with " << successor.getSuccessorInputs().size() << " inputs>"; +} + /// This class represents upper and lower bounds on the number of times a region /// of a `RegionBranchOpInterface` can be invoked. The lower bound is at least /// zero, but the upper bound may not be known. 
@@ -348,4 +370,10 @@ struct ReturnLike : public TraitBase { /// Include the generated interface declarations. #include "mlir/Interfaces/ControlFlowInterfaces.h.inc" +namespace mlir { +inline RegionBranchPoint::RegionBranchPoint( + RegionBranchTerminatorOpInterface predecessor) + : predecessor(predecessor.getOperation()) {} +} // namespace mlir + #endif // MLIR_INTERFACES_CONTROLFLOWINTERFACES_H diff --git a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td index b8d08cc553caa..94242e3ba39ce 100644 --- a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td +++ b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td @@ -117,7 +117,7 @@ def BranchOpInterface : OpInterface<"BranchOpInterface"> { def RegionBranchOpInterface : OpInterface<"RegionBranchOpInterface"> { let description = [{ - This interface provides information for region operations that exhibit + This interface provides information for region-holding operations that exhibit branching behavior between held regions. I.e., this interface allows for expressing control flow information for region holding operations. @@ -126,12 +126,12 @@ def RegionBranchOpInterface : OpInterface<"RegionBranchOpInterface"> { be side-effect free. A "region branch point" indicates a point from which a branch originates. It - can indicate either a region of this op or `RegionBranchPoint::parent()`. In - the latter case, the branch originates from outside of the op, i.e., when - first executing this op. + can indicate either a terminator in any of the immediately nested region of + this op or `RegionBranchPoint::parent()`. In the latter case, the branch + originates from outside of the op, i.e., when first executing this op. A "region successor" indicates the target of a branch. It can indicate - either a region of this op or this op. In the former case, the region + either a region of this op or this op itself. 
In the former case, the region successor is a region pointer and a range of block arguments to which the "successor operands" are forwarded to. In the latter case, the control flow leaves this op and the region successor is a range of results of this op to @@ -151,10 +151,10 @@ def RegionBranchOpInterface : OpInterface<"RegionBranchOpInterface"> { } ``` - `scf.for` has one region. The region has two region successors: the region - itself and the `scf.for` op. %b is an entry successor operand. %c is a - successor operand. %a is a successor block argument. %r is a successor - result. + `scf.for` has one region. The `scf.yield` has two region successors: the + region body itself and the `scf.for` op. `%b` is an entry successor + operand. `%c` is a successor operand. `%a` is a successor block argument. + `%r` is a successor result. }]; let cppNamespace = "::mlir"; @@ -162,16 +162,16 @@ def RegionBranchOpInterface : OpInterface<"RegionBranchOpInterface"> { InterfaceMethod<[{ Returns the operands of this operation that are forwarded to the region successor's block arguments or this operation's results when branching - to `point`. `point` is guaranteed to be among the successors that are + to `successor`. `successor` is guaranteed to be among the successors that are returned by `getEntrySuccessorRegions`/`getSuccessorRegions(parent())`. Example: In the above example, this method returns the operand %b of the - `scf.for` op, regardless of the value of `point`. I.e., this op always + `scf.for` op, regardless of the value of `successor`. I.e., this op always forwards the same operands, regardless of whether the loop has 0 or more iterations. 
}], "::mlir::OperandRange", "getEntrySuccessorOperands", - (ins "::mlir::RegionBranchPoint":$point), [{}], + (ins "::mlir::RegionSuccessor":$successor), [{}], /*defaultImplementation=*/[{ auto operandEnd = this->getOperation()->operand_end(); return ::mlir::OperandRange(operandEnd, operandEnd); @@ -224,6 +224,80 @@ def RegionBranchOpInterface : OpInterface<"RegionBranchOpInterface"> { (ins "::mlir::RegionBranchPoint":$point, "::llvm::SmallVectorImpl<::mlir::RegionSuccessor> &":$regions) >, + InterfaceMethod<[{ + Returns the potential region successors when branching from any + terminator in `region`. + These are the regions that may be selected during the flow of control. + }], + "void", "getSuccessorRegions", + (ins "::mlir::Region&":$region, + "::llvm::SmallVectorImpl<::mlir::RegionSuccessor> &":$regions), + [{}], + /*defaultImplementation=*/[{ + for (::mlir::Block &block : region) { + if (block.empty()) + continue; + if (auto terminator = + dyn_cast(block.back())) + $_op.getSuccessorRegions(RegionBranchPoint(terminator), + regions); + } + }]>, + InterfaceMethod<[{ + Returns the potential branching point (predecessors) for a given successor. 
+ }], + "void", "getPredecessors", + (ins "::mlir::RegionSuccessor":$successor, + "::llvm::SmallVectorImpl<::mlir::RegionBranchPoint> &":$predecessors), + [{}], + /*defaultImplementation=*/[{ + ::llvm::SmallVector<::mlir::RegionSuccessor> successors; + $_op.getSuccessorRegions(RegionBranchPoint::parent(), + successors); + if (llvm::any_of(successors, [&] (const RegionSuccessor & succ) { + return succ.getSuccessor() == successor.getSuccessor() || + (succ.isParent() && successor.isParent()); + })) + predecessors.push_back(RegionBranchPoint::parent()); + for (Region ®ion : $_op->getRegions()) { + for (::mlir::Block &block : region) { + if (block.empty()) + continue; + if (auto terminator = + dyn_cast(block.back())) { + ::llvm::SmallVector<::mlir::RegionSuccessor> successors; + $_op.getSuccessorRegions(RegionBranchPoint(terminator), + successors); + if (llvm::any_of(successors, [&] (const RegionSuccessor & succ) { + return succ.getSuccessor() == successor.getSuccessor() || + (succ.isParent() && successor.isParent()); + })) + predecessors.push_back(terminator); + } + } + } + }]>, + InterfaceMethod<[{ + Returns the potential values across all (predecessors) for a given successor + input, modeled by its index (its position in the list of values). 
+ }], + "void", "getPredecessorValues", + (ins "::mlir::RegionSuccessor":$successor, + "int":$index, + "::llvm::SmallVectorImpl<::mlir::Value> &":$predecessorValues), + [{}], + /*defaultImplementation=*/[{ + ::llvm::SmallVector<::mlir::RegionBranchPoint> predecessors; + $_op.getPredecessors(successor, predecessors); + for (auto predecessor : predecessors) { + if (predecessor.isParent()) { + predecessorValues.push_back($_op.getEntrySuccessorOperands(successor)[index]); + continue; + } + auto terminator = cast(predecessor.getTerminatorPredecessorOrNull()); + predecessorValues.push_back(terminator.getSuccessorOperands(successor)[index]); + } + }]>, InterfaceMethod<[{ Populates `invocationBounds` with the minimum and maximum number of times this operation will invoke the attached regions (assuming the @@ -298,7 +372,7 @@ def RegionBranchTerminatorOpInterface : passing them to the region successor indicated by `point`. }], "::mlir::MutableOperandRange", "getMutableSuccessorOperands", - (ins "::mlir::RegionBranchPoint":$point) + (ins "::mlir::RegionSuccessor":$point) >, InterfaceMethod<[{ Returns the potential region successors that are branched to after this @@ -317,7 +391,7 @@ def RegionBranchTerminatorOpInterface : /*defaultImplementation=*/[{ ::mlir::Operation *op = $_op; ::llvm::cast<::mlir::RegionBranchOpInterface>(op->getParentOp()) - .getSuccessorRegions(op->getParentRegion(), regions); + .getSuccessorRegions(::llvm::cast<::mlir::RegionBranchTerminatorOpInterface>(op), regions); }] >, ]; @@ -337,8 +411,8 @@ def RegionBranchTerminatorOpInterface : // them to the region successor given by `index`. If `index` is None, this // function returns the operands that are passed as a result to the parent // operation. 
- ::mlir::OperandRange getSuccessorOperands(::mlir::RegionBranchPoint point) { - return getMutableSuccessorOperands(point); + ::mlir::OperandRange getSuccessorOperands(::mlir::RegionSuccessor successor) { + return getMutableSuccessorOperands(successor); } }]; } @@ -504,7 +578,7 @@ def ReturnLike : TraitList<[ /*extraOpDeclaration=*/"", /*extraOpDefinition=*/[{ ::mlir::MutableOperandRange $cppClass::getMutableSuccessorOperands( - ::mlir::RegionBranchPoint point) { + ::mlir::RegionSuccessor successor) { return ::mlir::MutableOperandRange(*this); } }] diff --git a/mlir/lib/Analysis/AliasAnalysis/LocalAliasAnalysis.cpp b/mlir/lib/Analysis/AliasAnalysis/LocalAliasAnalysis.cpp index a84d10d5d609d..24cb123e51877 100644 --- a/mlir/lib/Analysis/AliasAnalysis/LocalAliasAnalysis.cpp +++ b/mlir/lib/Analysis/AliasAnalysis/LocalAliasAnalysis.cpp @@ -16,19 +16,21 @@ #include "mlir/IR/Operation.h" #include "mlir/IR/Region.h" #include "mlir/IR/Value.h" -#include "mlir/IR/ValueRange.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/FunctionInterfaces.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Interfaces/ViewLikeInterface.h" #include "mlir/Support/LLVM.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/DebugLog.h" #include #include #include using namespace mlir; +#define DEBUG_TYPE "local-alias-analysis" + //===----------------------------------------------------------------------===// // Underlying Address Computation //===----------------------------------------------------------------------===// @@ -42,81 +44,47 @@ static void collectUnderlyingAddressValues(Value value, unsigned maxDepth, DenseSet &visited, SmallVectorImpl &output); -/// Given a successor (`region`) of a RegionBranchOpInterface, collect all of -/// the underlying values being addressed by one of the successor inputs. If the -/// provided `region` is null, as per `RegionBranchOpInterface` this represents -/// the parent operation. 
-static void collectUnderlyingAddressValues(RegionBranchOpInterface branch, - Region *region, Value inputValue, - unsigned inputIndex, - unsigned maxDepth, - DenseSet &visited, - SmallVectorImpl &output) { - // Given the index of a region of the branch (`predIndex`), or std::nullopt to - // represent the parent operation, try to return the index into the outputs of - // this region predecessor that correspond to the input values of `region`. If - // an index could not be found, std::nullopt is returned instead. - auto getOperandIndexIfPred = - [&](RegionBranchPoint pred) -> std::optional { - SmallVector successors; - branch.getSuccessorRegions(pred, successors); - for (RegionSuccessor &successor : successors) { - if (successor.getSuccessor() != region) - continue; - // Check that the successor inputs map to the given input value. - ValueRange inputs = successor.getSuccessorInputs(); - if (inputs.empty()) { - output.push_back(inputValue); - break; - } - unsigned firstInputIndex, lastInputIndex; - if (region) { - firstInputIndex = cast(inputs[0]).getArgNumber(); - lastInputIndex = cast(inputs.back()).getArgNumber(); - } else { - firstInputIndex = cast(inputs[0]).getResultNumber(); - lastInputIndex = cast(inputs.back()).getResultNumber(); - } - if (firstInputIndex > inputIndex || lastInputIndex < inputIndex) { - output.push_back(inputValue); - break; - } - return inputIndex - firstInputIndex; - } - return std::nullopt; - }; - - // Check branches from the parent operation. 
- auto branchPoint = RegionBranchPoint::parent(); - if (region) - branchPoint = region; - - if (std::optional operandIndex = - getOperandIndexIfPred(/*predIndex=*/RegionBranchPoint::parent())) { - collectUnderlyingAddressValues( - branch.getEntrySuccessorOperands(branchPoint)[*operandIndex], maxDepth, - visited, output); +/// Given a RegionBranchOpInterface operation (`branch`), a Value`inputValue` +/// which is an input for the provided successor (`initialSuccessor`), try to +/// find the possible sources for the value along the control flow edges. +static void collectUnderlyingAddressValues2( + RegionBranchOpInterface branch, RegionSuccessor initialSuccessor, + Value inputValue, unsigned inputIndex, unsigned maxDepth, + DenseSet &visited, SmallVectorImpl &output) { + LDBG() << "collectUnderlyingAddressValues2: " + << OpWithFlags(branch.getOperation(), OpPrintingFlags().skipRegions()); + LDBG() << " with initialSuccessor " << initialSuccessor; + LDBG() << " inputValue: " << inputValue; + LDBG() << " inputIndex: " << inputIndex; + LDBG() << " maxDepth: " << maxDepth; + ValueRange inputs = initialSuccessor.getSuccessorInputs(); + if (inputs.empty()) { + LDBG() << " input is empty, enqueue value"; + output.push_back(inputValue); + return; } - // Check branches from each child region. - Operation *op = branch.getOperation(); - for (Region ®ion : op->getRegions()) { - if (std::optional operandIndex = getOperandIndexIfPred(region)) { - for (Block &block : region) { - // Try to determine possible region-branch successor operands for the - // current region. - if (auto term = dyn_cast( - block.getTerminator())) { - collectUnderlyingAddressValues( - term.getSuccessorOperands(branchPoint)[*operandIndex], maxDepth, - visited, output); - } else if (block.getNumSuccessors()) { - // Otherwise, if this terminator may exit the region we can't make - // any assumptions about which values get passed. 
- output.push_back(inputValue); - return; - } - } - } + unsigned firstInputIndex, lastInputIndex; + if (isa(inputs[0])) { + firstInputIndex = cast(inputs[0]).getArgNumber(); + lastInputIndex = cast(inputs.back()).getArgNumber(); + } else { + firstInputIndex = cast(inputs[0]).getResultNumber(); + lastInputIndex = cast(inputs.back()).getResultNumber(); + } + if (firstInputIndex > inputIndex || lastInputIndex < inputIndex) { + LDBG() << " !! Input index " << inputIndex << " out of range " + << firstInputIndex << " to " << lastInputIndex + << ", adding input value to output"; + output.push_back(inputValue); + return; + } + SmallVector predecessorValues; + branch.getPredecessorValues(initialSuccessor, inputIndex - firstInputIndex, + predecessorValues); + LDBG() << " Found " << predecessorValues.size() << " predecessor values"; + for (Value predecessorValue : predecessorValues) { + LDBG() << " Processing predecessor value: " << predecessorValue; + collectUnderlyingAddressValues(predecessorValue, maxDepth, visited, output); } } @@ -124,22 +92,28 @@ static void collectUnderlyingAddressValues(RegionBranchOpInterface branch, static void collectUnderlyingAddressValues(OpResult result, unsigned maxDepth, DenseSet &visited, SmallVectorImpl &output) { + LDBG() << "collectUnderlyingAddressValues (OpResult): " << result; + LDBG() << " maxDepth: " << maxDepth; + Operation *op = result.getOwner(); // If this is a view, unwrap to the source. if (ViewLikeOpInterface view = dyn_cast(op)) { if (result == view.getViewDest()) { + LDBG() << " Unwrapping view to source: " << view.getViewSource(); return collectUnderlyingAddressValues(view.getViewSource(), maxDepth, visited, output); } } // Check to see if we can reason about the control flow of this op. 
if (auto branch = dyn_cast(op)) { - return collectUnderlyingAddressValues(branch, /*region=*/nullptr, result, - result.getResultNumber(), maxDepth, - visited, output); + LDBG() << " Processing region branch operation"; + return collectUnderlyingAddressValues2( + branch, RegionSuccessor(op, op->getResults()), result, + result.getResultNumber(), maxDepth, visited, output); } + LDBG() << " Adding result to output: " << result; output.push_back(result); } @@ -148,14 +122,23 @@ static void collectUnderlyingAddressValues(OpResult result, unsigned maxDepth, static void collectUnderlyingAddressValues(BlockArgument arg, unsigned maxDepth, DenseSet &visited, SmallVectorImpl &output) { + LDBG() << "collectUnderlyingAddressValues (BlockArgument): " << arg; + LDBG() << " maxDepth: " << maxDepth; + LDBG() << " argNumber: " << arg.getArgNumber(); + LDBG() << " isEntryBlock: " << arg.getOwner()->isEntryBlock(); + Block *block = arg.getOwner(); unsigned argNumber = arg.getArgNumber(); // Handle the case of a non-entry block. if (!block->isEntryBlock()) { + LDBG() << " Processing non-entry block with " + << std::distance(block->pred_begin(), block->pred_end()) + << " predecessors"; for (auto it = block->pred_begin(), e = block->pred_end(); it != e; ++it) { auto branch = dyn_cast((*it)->getTerminator()); if (!branch) { + LDBG() << " Cannot analyze control flow, adding argument to output"; // We can't analyze the control flow, so bail out early. output.push_back(arg); return; @@ -165,10 +148,12 @@ static void collectUnderlyingAddressValues(BlockArgument arg, unsigned maxDepth, unsigned index = it.getSuccessorIndex(); Value operand = branch.getSuccessorOperands(index)[argNumber]; if (!operand) { + LDBG() << " No operand found for argument, adding to output"; // We can't analyze the control flow, so bail out early. 
output.push_back(arg); return; } + LDBG() << " Processing operand from predecessor: " << operand; collectUnderlyingAddressValues(operand, maxDepth, visited, output); } return; @@ -178,10 +163,35 @@ static void collectUnderlyingAddressValues(BlockArgument arg, unsigned maxDepth, Region *region = block->getParent(); Operation *op = region->getParentOp(); if (auto branch = dyn_cast(op)) { - return collectUnderlyingAddressValues(branch, region, arg, argNumber, - maxDepth, visited, output); + LDBG() << " Processing region branch operation for entry block"; + // We have to find the successor matching the region, so that the input + // arguments are correctly set. + // TODO: this isn't comprehensive: the successor may not be reachable from + // the entry block. + SmallVector successors; + branch.getSuccessorRegions(RegionBranchPoint::parent(), successors); + RegionSuccessor regionSuccessor(region); + bool found = false; + for (RegionSuccessor &successor : successors) { + if (successor.getSuccessor() == region) { + LDBG() << " Found matching region successor: " << successor; + found = true; + regionSuccessor = successor; + break; + } + } + if (!found) { + LDBG() + << " No matching region successor found, adding argument to output"; + output.push_back(arg); + return; + } + return collectUnderlyingAddressValues2( + branch, regionSuccessor, arg, argNumber, maxDepth, visited, output); } + LDBG() + << " Cannot reason about underlying address, adding argument to output"; // We can't reason about the underlying address of this argument. output.push_back(arg); } @@ -190,17 +200,26 @@ static void collectUnderlyingAddressValues(BlockArgument arg, unsigned maxDepth, static void collectUnderlyingAddressValues(Value value, unsigned maxDepth, DenseSet &visited, SmallVectorImpl &output) { + LDBG() << "collectUnderlyingAddressValues: " << value; + LDBG() << " maxDepth: " << maxDepth; + // Check that we don't infinitely recurse. 
- if (!visited.insert(value).second) + if (!visited.insert(value).second) { + LDBG() << " Value already visited, skipping"; return; + } if (maxDepth == 0) { + LDBG() << " Max depth reached, adding value to output"; output.push_back(value); return; } --maxDepth; - if (BlockArgument arg = dyn_cast(value)) + if (BlockArgument arg = dyn_cast(value)) { + LDBG() << " Processing as BlockArgument"; return collectUnderlyingAddressValues(arg, maxDepth, visited, output); + } + LDBG() << " Processing as OpResult"; collectUnderlyingAddressValues(cast(value), maxDepth, visited, output); } @@ -208,9 +227,11 @@ static void collectUnderlyingAddressValues(Value value, unsigned maxDepth, /// Given a value, collect all of the underlying values being addressed. static void collectUnderlyingAddressValues(Value value, SmallVectorImpl &output) { + LDBG() << "collectUnderlyingAddressValues: " << value; DenseSet visited; collectUnderlyingAddressValues(value, maxUnderlyingValueSearchDepth, visited, output); + LDBG() << " Collected " << output.size() << " underlying values"; } //===----------------------------------------------------------------------===// @@ -227,19 +248,33 @@ static LogicalResult getAllocEffectFor(Value value, std::optional &effect, Operation *&allocScopeOp) { + LDBG() << "getAllocEffectFor: " << value; + // Try to get a memory effect interface for the parent operation. Operation *op; - if (BlockArgument arg = dyn_cast(value)) + if (BlockArgument arg = dyn_cast(value)) { op = arg.getOwner()->getParentOp(); - else + LDBG() << " BlockArgument, parent op: " + << OpWithFlags(op, OpPrintingFlags().skipRegions()); + } else { op = cast(value).getOwner(); + LDBG() << " OpResult, owner op: " + << OpWithFlags(op, OpPrintingFlags().skipRegions()); + } + MemoryEffectOpInterface interface = dyn_cast(op); - if (!interface) + if (!interface) { + LDBG() << " No memory effect interface found"; return failure(); + } // Try to find an allocation effect on the resource. 
- if (!(effect = interface.getEffectOnValue(value))) + if (!(effect = interface.getEffectOnValue(value))) { + LDBG() << " No allocation effect found on value"; return failure(); + } + + LDBG() << " Found allocation effect"; // If we found an allocation effect, try to find a scope for the allocation. // If the resource of this allocation is automatically scoped, find the parent @@ -247,6 +282,12 @@ getAllocEffectFor(Value value, if (llvm::isa( effect->getResource())) { allocScopeOp = op->getParentWithTrait(); + if (allocScopeOp) { + LDBG() << " Automatic allocation scope found: " + << OpWithFlags(allocScopeOp, OpPrintingFlags().skipRegions()); + } else { + LDBG() << " Automatic allocation scope found: null"; + } return success(); } @@ -255,6 +296,12 @@ getAllocEffectFor(Value value, // For now assume allocation scope to the function scope (we don't care if // pointer escape outside function). allocScopeOp = op->getParentOfType(); + if (allocScopeOp) { + LDBG() << " Function scope found: " + << OpWithFlags(allocScopeOp, OpPrintingFlags().skipRegions()); + } else { + LDBG() << " Function scope found: null"; + } return success(); } @@ -293,33 +340,44 @@ static std::optional checkDistinctObjects(Value lhs, Value rhs) { /// Given the two values, return their aliasing behavior. AliasResult LocalAliasAnalysis::aliasImpl(Value lhs, Value rhs) { - if (lhs == rhs) + LDBG() << "aliasImpl: " << lhs << " vs " << rhs; + + if (lhs == rhs) { + LDBG() << " Same value, must alias"; return AliasResult::MustAlias; + } + Operation *lhsAllocScope = nullptr, *rhsAllocScope = nullptr; std::optional lhsAlloc, rhsAlloc; // Handle the case where lhs is a constant. Attribute lhsAttr, rhsAttr; if (matchPattern(lhs, m_Constant(&lhsAttr))) { + LDBG() << " lhs is constant"; // TODO: This is overly conservative. Two matching constants don't // necessarily map to the same address. For example, if the two values // correspond to different symbols that both represent a definition. 
- if (matchPattern(rhs, m_Constant(&rhsAttr))) + if (matchPattern(rhs, m_Constant(&rhsAttr))) { + LDBG() << " rhs is also constant, may alias"; return AliasResult::MayAlias; + } // Try to find an alloc effect on rhs. If an effect was found we can't // alias, otherwise we might. - return succeeded(getAllocEffectFor(rhs, rhsAlloc, rhsAllocScope)) - ? AliasResult::NoAlias - : AliasResult::MayAlias; + bool rhsHasAlloc = + succeeded(getAllocEffectFor(rhs, rhsAlloc, rhsAllocScope)); + LDBG() << " rhs has alloc effect: " << rhsHasAlloc; + return rhsHasAlloc ? AliasResult::NoAlias : AliasResult::MayAlias; } // Handle the case where rhs is a constant. if (matchPattern(rhs, m_Constant(&rhsAttr))) { + LDBG() << " rhs is constant"; // Try to find an alloc effect on lhs. If an effect was found we can't // alias, otherwise we might. - return succeeded(getAllocEffectFor(lhs, lhsAlloc, lhsAllocScope)) - ? AliasResult::NoAlias - : AliasResult::MayAlias; + bool lhsHasAlloc = + succeeded(getAllocEffectFor(lhs, lhsAlloc, lhsAllocScope)); + LDBG() << " lhs has alloc effect: " << lhsHasAlloc; + return lhsHasAlloc ? AliasResult::NoAlias : AliasResult::MayAlias; } if (std::optional result = checkDistinctObjects(lhs, rhs)) @@ -329,9 +387,14 @@ AliasResult LocalAliasAnalysis::aliasImpl(Value lhs, Value rhs) { // an allocation effect. bool lhsHasAlloc = succeeded(getAllocEffectFor(lhs, lhsAlloc, lhsAllocScope)); bool rhsHasAlloc = succeeded(getAllocEffectFor(rhs, rhsAlloc, rhsAllocScope)); + LDBG() << " lhs has alloc effect: " << lhsHasAlloc; + LDBG() << " rhs has alloc effect: " << rhsHasAlloc; + if (lhsHasAlloc == rhsHasAlloc) { // If both values have an allocation effect we know they don't alias, and if // neither have an effect we can't make an assumptions. + LDBG() << " Both have same alloc status: " + << (lhsHasAlloc ? "NoAlias" : "MayAlias"); return lhsHasAlloc ? 
AliasResult::NoAlias : AliasResult::MayAlias; } @@ -339,6 +402,7 @@ AliasResult LocalAliasAnalysis::aliasImpl(Value lhs, Value rhs) { // and one without. Move the one with the effect to the lhs to make the next // checks simpler. if (rhsHasAlloc) { + LDBG() << " Swapping lhs and rhs to put alloc effect on lhs"; std::swap(lhs, rhs); lhsAlloc = rhsAlloc; lhsAllocScope = rhsAllocScope; @@ -347,49 +411,74 @@ AliasResult LocalAliasAnalysis::aliasImpl(Value lhs, Value rhs) { // If the effect has a scoped allocation region, check to see if the // non-effect value is defined above that scope. if (lhsAllocScope) { + LDBG() << " Checking allocation scope: " + << OpWithFlags(lhsAllocScope, OpPrintingFlags().skipRegions()); // If the parent operation of rhs is an ancestor of the allocation scope, or // if rhs is an entry block argument of the allocation scope we know the two // values can't alias. Operation *rhsParentOp = rhs.getParentRegion()->getParentOp(); - if (rhsParentOp->isProperAncestor(lhsAllocScope)) + if (rhsParentOp->isProperAncestor(lhsAllocScope)) { + LDBG() << " rhs parent is ancestor of alloc scope, no alias"; return AliasResult::NoAlias; + } if (rhsParentOp == lhsAllocScope) { BlockArgument rhsArg = dyn_cast(rhs); - if (rhsArg && rhs.getParentBlock()->isEntryBlock()) + if (rhsArg && rhs.getParentBlock()->isEntryBlock()) { + LDBG() << " rhs is entry block arg of alloc scope, no alias"; return AliasResult::NoAlias; + } } } // If we couldn't reason about the relationship between the two values, // conservatively assume they might alias. + LDBG() << " Cannot reason about relationship, may alias"; return AliasResult::MayAlias; } /// Given the two values, return their aliasing behavior. AliasResult LocalAliasAnalysis::alias(Value lhs, Value rhs) { - if (lhs == rhs) + LDBG() << "alias: " << lhs << " vs " << rhs; + + if (lhs == rhs) { + LDBG() << " Same value, must alias"; return AliasResult::MustAlias; + } // Get the underlying values being addressed. 
SmallVector lhsValues, rhsValues; collectUnderlyingAddressValues(lhs, lhsValues); collectUnderlyingAddressValues(rhs, rhsValues); + LDBG() << " lhs underlying values: " << lhsValues.size(); + LDBG() << " rhs underlying values: " << rhsValues.size(); + // If we failed to collect for either of the values somehow, conservatively // assume they may alias. - if (lhsValues.empty() || rhsValues.empty()) + if (lhsValues.empty() || rhsValues.empty()) { + LDBG() << " Failed to collect underlying values, may alias"; return AliasResult::MayAlias; + } // Check the alias results against each of the underlying values. std::optional result; for (Value lhsVal : lhsValues) { for (Value rhsVal : rhsValues) { + LDBG() << " Checking underlying values: " << lhsVal << " vs " << rhsVal; AliasResult nextResult = aliasImpl(lhsVal, rhsVal); + LDBG() << " Result: " + << (nextResult == AliasResult::MustAlias ? "MustAlias" + : nextResult == AliasResult::NoAlias ? "NoAlias" + : "MayAlias"); result = result ? result->merge(nextResult) : nextResult; } } // We should always have a valid result here. + LDBG() << " Final result: " + << (result->isMust() ? "MustAlias" + : result->isNo() ? "NoAlias" + : "MayAlias"); return *result; } @@ -398,8 +487,12 @@ AliasResult LocalAliasAnalysis::alias(Value lhs, Value rhs) { //===----------------------------------------------------------------------===// ModRefResult LocalAliasAnalysis::getModRef(Operation *op, Value location) { + LDBG() << "getModRef: " << OpWithFlags(op, OpPrintingFlags().skipRegions()) + << " on location " << location; + // Check to see if this operation relies on nested side effects. if (op->hasTrait()) { + LDBG() << " Operation has recursive memory effects, returning ModAndRef"; // TODO: To check recursive operations we need to check all of the nested // operations, which can result in a quadratic number of queries. 
We should // introduce some caching of some kind to help alleviate this, especially as @@ -410,38 +503,64 @@ ModRefResult LocalAliasAnalysis::getModRef(Operation *op, Value location) { // Otherwise, check to see if this operation has a memory effect interface. MemoryEffectOpInterface interface = dyn_cast(op); - if (!interface) + if (!interface) { + LDBG() << " No memory effect interface, returning ModAndRef"; return ModRefResult::getModAndRef(); + } // Build a ModRefResult by merging the behavior of the effects of this // operation. SmallVector effects; interface.getEffects(effects); + LDBG() << " Found " << effects.size() << " memory effects"; ModRefResult result = ModRefResult::getNoModRef(); for (const MemoryEffects::EffectInstance &effect : effects) { - if (isa(effect.getEffect())) + if (isa(effect.getEffect())) { + LDBG() << " Skipping alloc/free effect"; continue; + } // Check for an alias between the effect and our memory location. // TODO: Add support for checking an alias with a symbol reference. AliasResult aliasResult = AliasResult::MayAlias; - if (Value effectValue = effect.getValue()) + if (Value effectValue = effect.getValue()) { + LDBG() << " Checking alias between effect value " << effectValue + << " and location " << location; aliasResult = alias(effectValue, location); + LDBG() << " Alias result: " + << (aliasResult.isMust() ? "MustAlias" + : aliasResult.isNo() ? "NoAlias" + : "MayAlias"); + } else { + LDBG() << " No effect value, assuming MayAlias"; + } // If we don't alias, ignore this effect. - if (aliasResult.isNo()) + if (aliasResult.isNo()) { + LDBG() << " No alias, ignoring effect"; continue; + } // Merge in the corresponding mod or ref for this effect. 
if (isa(effect.getEffect())) { + LDBG() << " Adding Ref to result"; result = result.merge(ModRefResult::getRef()); } else { assert(isa(effect.getEffect())); + LDBG() << " Adding Mod to result"; result = result.merge(ModRefResult::getMod()); } - if (result.isModAndRef()) + if (result.isModAndRef()) { + LDBG() << " Result is now ModAndRef, breaking"; break; + } } + + LDBG() << " Final ModRef result: " + << (result.isModAndRef() ? "ModAndRef" + : result.isMod() ? "Mod" + : result.isRef() ? "Ref" + : "NoModRef"); return result; } diff --git a/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp b/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp index 377f7ebe06750..0fc5b4482bf3e 100644 --- a/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp @@ -501,11 +501,10 @@ void DeadCodeAnalysis::visitRegionTerminator(Operation *op, return; SmallVector successors; - if (auto terminator = dyn_cast(op)) - terminator.getSuccessorRegions(*operands, successors); - else - branch.getSuccessorRegions(op->getParentRegion(), successors); - + auto terminator = dyn_cast(op); + if (!terminator) + return; + terminator.getSuccessorRegions(*operands, successors); visitRegionBranchEdges(branch, op, successors); } diff --git a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp index daa3db55b2852..0682e5f26785a 100644 --- a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp @@ -588,7 +588,9 @@ void AbstractDenseBackwardDataFlowAnalysis::visitBlock(Block *block) { // flow, propagate the lattice back along the control flow edge. 
if (auto branch = dyn_cast(block->getParentOp())) { LDBG() << " Exit block of region branch operation"; - visitRegionBranchOperation(point, branch, block->getParent(), before); + auto terminator = + cast(block->getTerminator()); + visitRegionBranchOperation(point, branch, terminator, before); return; } diff --git a/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp index 0d2e2ed85549d..8e63ae86753b4 100644 --- a/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp @@ -130,7 +130,7 @@ AbstractSparseForwardDataFlowAnalysis::visitOperation(Operation *op) { // The results of a region branch operation are determined by control-flow. if (auto branch = dyn_cast(op)) { visitRegionSuccessors(getProgramPointAfter(branch), branch, - /*successor=*/RegionBranchPoint::parent(), + /*successor=*/{branch, branch->getResults()}, resultLattices); return success(); } @@ -279,7 +279,7 @@ void AbstractSparseForwardDataFlowAnalysis::visitCallableOperation( void AbstractSparseForwardDataFlowAnalysis::visitRegionSuccessors( ProgramPoint *point, RegionBranchOpInterface branch, - RegionBranchPoint successor, ArrayRef lattices) { + RegionSuccessor successor, ArrayRef lattices) { const auto *predecessors = getOrCreateFor(point, point); assert(predecessors->allPredecessorsKnown() && "unexpected unresolved region successors"); @@ -314,7 +314,7 @@ void AbstractSparseForwardDataFlowAnalysis::visitRegionSuccessors( visitNonControlFlowArgumentsImpl( branch, RegionSuccessor( - branch->getResults().slice(firstIndex, inputs.size())), + branch, branch->getResults().slice(firstIndex, inputs.size())), lattices, firstIndex); } else { if (!inputs.empty()) diff --git a/mlir/lib/Analysis/SliceWalk.cpp b/mlir/lib/Analysis/SliceWalk.cpp index 817d71a3452ca..863f260cd4b6a 100644 --- a/mlir/lib/Analysis/SliceWalk.cpp +++ b/mlir/lib/Analysis/SliceWalk.cpp @@ -114,7 +114,7 @@ mlir::getControlFlowPredecessors(Value value) { if 
(!regionOp) return std::nullopt; // Add the control flow predecessor operands to the work list. - RegionSuccessor region(regionOp->getResults()); + RegionSuccessor region(regionOp, regionOp->getResults()); SmallVector predecessorOperands = getRegionPredecessorOperands( regionOp, region, opResult.getResultNumber()); return predecessorOperands; diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index e0a53cd52f143..0c3592124cdec 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -2716,8 +2716,9 @@ LogicalResult AffineForOp::fold(FoldAdaptor adaptor, return success(folded); } -OperandRange AffineForOp::getEntrySuccessorOperands(RegionBranchPoint point) { - assert((point.isParent() || point == getRegion()) && "invalid region point"); +OperandRange AffineForOp::getEntrySuccessorOperands(RegionSuccessor successor) { + assert((successor.isParent() || successor.getSuccessor() == &getRegion()) && + "invalid region point"); // The initial operands map to the loop arguments after the induction // variable or are forwarded to the results when the trip count is zero. @@ -2726,34 +2727,41 @@ OperandRange AffineForOp::getEntrySuccessorOperands(RegionBranchPoint point) { void AffineForOp::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl ®ions) { - assert((point.isParent() || point == getRegion()) && "expected loop region"); + assert((point.isParent() || + point.getTerminatorPredecessorOrNull()->getParentRegion() == + &getRegion()) && + "expected loop region"); // The loop may typically branch back to its body or to the parent operation. // If the predecessor is the parent op and the trip count is known to be at // least one, branch into the body using the iterator arguments. And in cases // we know the trip count is zero, it can only branch back to its parent. 
std::optional tripCount = getTrivialConstantTripCount(*this); - if (point.isParent() && tripCount.has_value()) { - if (tripCount.value() > 0) { - regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); - return; - } - if (tripCount.value() == 0) { - regions.push_back(RegionSuccessor(getResults())); - return; + if (tripCount.has_value()) { + if (!point.isParent()) { + // From the loop body, if the trip count is one, we can only branch back + // to the parent. + if (tripCount == 1) { + regions.push_back(RegionSuccessor(getOperation(), getResults())); + return; + } + if (tripCount == 0) + return; + } else { + if (tripCount.value() > 0) { + regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); + return; + } + if (tripCount.value() == 0) { + regions.push_back(RegionSuccessor(getOperation(), getResults())); + return; + } } } - // From the loop body, if the trip count is one, we can only branch back to - // the parent. - if (!point.isParent() && tripCount == 1) { - regions.push_back(RegionSuccessor(getResults())); - return; - } - // In all other cases, the loop may branch back to itself or the parent // operation. regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); } AffineBound AffineForOp::getLowerBound() { @@ -3142,7 +3150,7 @@ void AffineIfOp::getSuccessorRegions( RegionSuccessor(&getThenRegion(), getThenRegion().getArguments())); // If the "else" region is empty, branch bach into parent. if (getElseRegion().empty()) { - regions.push_back(getResults()); + regions.push_back(RegionSuccessor(getOperation(), getResults())); } else { regions.push_back( RegionSuccessor(&getElseRegion(), getElseRegion().getArguments())); @@ -3152,7 +3160,7 @@ void AffineIfOp::getSuccessorRegions( // If the predecessor is the `else`/`then` region, then branching into parent // op is valid. 
- regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); } LogicalResult AffineIfOp::verify() { diff --git a/mlir/lib/Dialect/Async/IR/Async.cpp b/mlir/lib/Dialect/Async/IR/Async.cpp index dc7b07d911c17..8e4a49df76b52 100644 --- a/mlir/lib/Dialect/Async/IR/Async.cpp +++ b/mlir/lib/Dialect/Async/IR/Async.cpp @@ -36,8 +36,9 @@ void AsyncDialect::initialize() { constexpr char kOperandSegmentSizesAttr[] = "operandSegmentSizes"; -OperandRange ExecuteOp::getEntrySuccessorOperands(RegionBranchPoint point) { - assert(point == getBodyRegion() && "invalid region index"); +OperandRange ExecuteOp::getEntrySuccessorOperands(RegionSuccessor successor) { + assert(successor.getSuccessor() == &getBodyRegion() && + "invalid region index"); return getBodyOperands(); } @@ -53,8 +54,10 @@ bool ExecuteOp::areTypesCompatible(Type lhs, Type rhs) { void ExecuteOp::getSuccessorRegions(RegionBranchPoint point, SmallVectorImpl ®ions) { // The `body` region branch back to the parent operation. 
- if (point == getBodyRegion()) { - regions.push_back(RegionSuccessor(getBodyResults())); + if (!point.isParent() && + point.getTerminatorPredecessorOrNull()->getParentRegion() == + &getBodyRegion()) { + regions.push_back(RegionSuccessor(getOperation(), getBodyResults())); return; } diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp index b593ccab060c7..36a759c279eb7 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp @@ -562,8 +562,11 @@ LogicalResult BufferDeallocation::updateFunctionSignature(FunctionOpInterface op) { SmallVector returnOperandTypes(llvm::map_range( op.getFunctionBody().getOps(), - [](RegionBranchTerminatorOpInterface op) { - return op.getSuccessorOperands(RegionBranchPoint::parent()).getTypes(); + [&](RegionBranchTerminatorOpInterface branchOp) { + return branchOp + .getSuccessorOperands(RegionSuccessor( + op.getOperation(), op.getOperation()->getResults())) + .getTypes(); })); if (!llvm::all_equal(returnOperandTypes)) return op->emitError( @@ -942,8 +945,8 @@ BufferDeallocation::handleInterface(RegionBranchTerminatorOpInterface op) { // about, but we would need to check how many successors there are and under // which condition they are taken, etc. 
- MutableOperandRange operands = - op.getMutableSuccessorOperands(RegionBranchPoint::parent()); + MutableOperandRange operands = op.getMutableSuccessorOperands( + RegionSuccessor(op.getOperation(), op.getOperation()->getResults())); SmallVector updatedOwnerships; auto result = deallocation_impl::insertDeallocOpForReturnLike( diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp index 4754f0bfe895e..0992ce14b4afb 100644 --- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp +++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp @@ -845,7 +845,8 @@ void IfOp::getSuccessorRegions(RegionBranchPoint point, SmallVectorImpl ®ions) { // The `then` and the `else` region branch back to the parent operation. if (!point.isParent()) { - regions.push_back(RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); return; } @@ -854,7 +855,8 @@ void IfOp::getSuccessorRegions(RegionBranchPoint point, // Don't consider the else region if it is empty. 
Region *elseRegion = &this->getElseRegion(); if (elseRegion->empty()) - regions.push_back(RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); else regions.push_back(RegionSuccessor(elseRegion)); } @@ -871,7 +873,7 @@ void IfOp::getEntrySuccessorRegions(ArrayRef operands, if (!getElseRegion().empty()) regions.emplace_back(&getElseRegion()); else - regions.emplace_back(); + regions.emplace_back(getOperation(), getOperation()->getResults()); } } diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index b5f8ddaadacdf..6c6d8d2bad55d 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -2399,7 +2399,7 @@ ParseResult WarpExecuteOnLane0Op::parse(OpAsmParser &parser, void WarpExecuteOnLane0Op::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl ®ions) { if (!point.isParent()) { - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); return; } diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index c551fba93e367..1c21a2f270da6 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -405,7 +405,7 @@ ParseResult AllocaScopeOp::parse(OpAsmParser &parser, OperationState &result) { void AllocaScopeOp::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl ®ions) { if (!point.isParent()) { - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); return; } diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index 1ab01d86bcd10..2946b53c8cb36 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -397,7 +397,7 @@ void ExecuteRegionOp::getSuccessorRegions( } // Otherwise, the region branches back to the parent operation. 
- regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); } //===----------------------------------------------------------------------===// @@ -405,10 +405,11 @@ void ExecuteRegionOp::getSuccessorRegions( //===----------------------------------------------------------------------===// MutableOperandRange -ConditionOp::getMutableSuccessorOperands(RegionBranchPoint point) { - assert((point.isParent() || point == getParentOp().getAfter()) && - "condition op can only exit the loop or branch to the after" - "region"); +ConditionOp::getMutableSuccessorOperands(RegionSuccessor point) { + assert( + (point.isParent() || point.getSuccessor() == &getParentOp().getAfter()) && + "condition op can only exit the loop or branch to the after" + "region"); // Pass all operands except the condition to the successor region. return getArgsMutable(); } @@ -426,7 +427,7 @@ void ConditionOp::getSuccessorRegions( regions.emplace_back(&whileOp.getAfter(), whileOp.getAfter().getArguments()); if (!boolAttr || !boolAttr.getValue()) - regions.emplace_back(whileOp.getResults()); + regions.emplace_back(whileOp.getOperation(), whileOp.getResults()); } //===----------------------------------------------------------------------===// @@ -749,7 +750,7 @@ ForOp mlir::scf::getForInductionVarOwner(Value val) { return dyn_cast_or_null(containingOp); } -OperandRange ForOp::getEntrySuccessorOperands(RegionBranchPoint point) { +OperandRange ForOp::getEntrySuccessorOperands(RegionSuccessor successor) { return getInitArgs(); } @@ -759,7 +760,7 @@ void ForOp::getSuccessorRegions(RegionBranchPoint point, // back into the operation itself. It is possible for loop not to enter the // body. 
regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); } SmallVector ForallOp::getLoopRegions() { return {&getRegion()}; } @@ -2053,9 +2054,10 @@ void ForallOp::getSuccessorRegions(RegionBranchPoint point, // parallel by multiple threads. We should not expect to branch back into // the forall body after the region's execution is complete. if (point.isParent()) - regions.push_back(RegionSuccessor(&getRegion())); + regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); else - regions.push_back(RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); } //===----------------------------------------------------------------------===// @@ -2333,9 +2335,10 @@ void IfOp::print(OpAsmPrinter &p) { void IfOp::getSuccessorRegions(RegionBranchPoint point, SmallVectorImpl ®ions) { - // The `then` and the `else` region branch back to the parent operation. + // The `then` and the `else` region branch back to the parent operation or one + // of the recursive parent operations (early exit case). if (!point.isParent()) { - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); return; } @@ -2344,7 +2347,8 @@ void IfOp::getSuccessorRegions(RegionBranchPoint point, // Don't consider the else region if it is empty. 
Region *elseRegion = &this->getElseRegion(); if (elseRegion->empty()) - regions.push_back(RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); else regions.push_back(RegionSuccessor(elseRegion)); } @@ -2361,7 +2365,7 @@ void IfOp::getEntrySuccessorRegions(ArrayRef operands, if (!getElseRegion().empty()) regions.emplace_back(&getElseRegion()); else - regions.emplace_back(getResults()); + regions.emplace_back(getOperation(), getResults()); } } @@ -3385,7 +3389,8 @@ void ParallelOp::getSuccessorRegions( // back into the operation itself. It is possible for loop not to enter the // body. regions.push_back(RegionSuccessor(&getRegion())); - regions.push_back(RegionSuccessor()); + regions.push_back(RegionSuccessor( + getOperation(), ResultRange{getResults().end(), getResults().end()})); } //===----------------------------------------------------------------------===// @@ -3431,7 +3436,7 @@ LogicalResult ReduceOp::verifyRegions() { } MutableOperandRange -ReduceOp::getMutableSuccessorOperands(RegionBranchPoint point) { +ReduceOp::getMutableSuccessorOperands(RegionSuccessor point) { // No operands are forwarded to the next iteration. 
return MutableOperandRange(getOperation(), /*start=*/0, /*length=*/0); } @@ -3514,8 +3519,8 @@ Block::BlockArgListType WhileOp::getRegionIterArgs() { return getBeforeArguments(); } -OperandRange WhileOp::getEntrySuccessorOperands(RegionBranchPoint point) { - assert(point == getBefore() && +OperandRange WhileOp::getEntrySuccessorOperands(RegionSuccessor successor) { + assert(successor.getSuccessor() == &getBefore() && "WhileOp is expected to branch only to the first region"); return getInits(); } @@ -3528,15 +3533,18 @@ void WhileOp::getSuccessorRegions(RegionBranchPoint point, return; } - assert(llvm::is_contained({&getAfter(), &getBefore()}, point) && + assert(llvm::is_contained( + {&getAfter(), &getBefore()}, + point.getTerminatorPredecessorOrNull()->getParentRegion()) && "there are only two regions in a WhileOp"); // The body region always branches back to the condition region. - if (point == getAfter()) { + if (point.getTerminatorPredecessorOrNull()->getParentRegion() == + &getAfter()) { regions.emplace_back(&getBefore(), getBefore().getArguments()); return; } - regions.emplace_back(getResults()); + regions.emplace_back(getOperation(), getResults()); regions.emplace_back(&getAfter(), getAfter().getArguments()); } @@ -4445,7 +4453,7 @@ void IndexSwitchOp::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl &successors) { // All regions branch back to the parent op. 
if (!point.isParent()) { - successors.emplace_back(getResults()); + successors.emplace_back(getOperation(), getResults()); return; } diff --git a/mlir/lib/Dialect/SCF/Transforms/ForToWhile.cpp b/mlir/lib/Dialect/SCF/Transforms/ForToWhile.cpp index ae52af5009dc9..ddcbda86cf1f3 100644 --- a/mlir/lib/Dialect/SCF/Transforms/ForToWhile.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/ForToWhile.cpp @@ -23,7 +23,6 @@ namespace mlir { #include "mlir/Dialect/SCF/Transforms/Passes.h.inc" } // namespace mlir -using namespace llvm; using namespace mlir; using scf::ForOp; using scf::WhileOp; diff --git a/mlir/lib/Dialect/SCF/Transforms/ForallToFor.cpp b/mlir/lib/Dialect/SCF/Transforms/ForallToFor.cpp index a2f03f1e1056e..00bef707fadd3 100644 --- a/mlir/lib/Dialect/SCF/Transforms/ForallToFor.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/ForallToFor.cpp @@ -21,7 +21,6 @@ namespace mlir { #include "mlir/Dialect/SCF/Transforms/Passes.h.inc" } // namespace mlir -using namespace llvm; using namespace mlir; using scf::LoopNest; diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp index 5ba828918c22a..f0f22e5ef4a83 100644 --- a/mlir/lib/Dialect/Shape/IR/Shape.cpp +++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp @@ -346,7 +346,7 @@ void AssumingOp::getSuccessorRegions( // parent, so return the correct RegionSuccessor purely based on the index // being None or 0. 
if (!point.isParent()) { - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); return; } diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index 1a9d9e158ee75..3962e3e84dd31 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -2597,7 +2597,7 @@ std::optional> IterateOp::getYieldedValuesMutable() { std::optional IterateOp::getLoopResults() { return getResults(); } -OperandRange IterateOp::getEntrySuccessorOperands(RegionBranchPoint point) { +OperandRange IterateOp::getEntrySuccessorOperands(RegionSuccessor successor) { return getInitArgs(); } @@ -2607,7 +2607,7 @@ void IterateOp::getSuccessorRegions(RegionBranchPoint point, // or back into the operation itself. regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); // It is possible for loop not to enter the body. 
- regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); } void CoIterateOp::build(OpBuilder &builder, OperationState &odsState, diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp index 365afab3764c8..062606e7e10b6 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp @@ -96,9 +96,9 @@ ensurePayloadIsSeparateFromTransform(transform::TransformOpInterface transform, // AlternativesOp //===----------------------------------------------------------------------===// -OperandRange -transform::AlternativesOp::getEntrySuccessorOperands(RegionBranchPoint point) { - if (!point.isParent() && getOperation()->getNumOperands() == 1) +OperandRange transform::AlternativesOp::getEntrySuccessorOperands( + RegionSuccessor successor) { + if (!successor.isParent() && getOperation()->getNumOperands() == 1) return getOperation()->getOperands(); return OperandRange(getOperation()->operand_end(), getOperation()->operand_end()); @@ -107,15 +107,18 @@ transform::AlternativesOp::getEntrySuccessorOperands(RegionBranchPoint point) { void transform::AlternativesOp::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl ®ions) { for (Region &alternative : llvm::drop_begin( - getAlternatives(), - point.isParent() ? 0 - : point.getRegionOrNull()->getRegionNumber() + 1)) { + getAlternatives(), point.isParent() + ? 0 + : point.getTerminatorPredecessorOrNull() + ->getParentRegion() + ->getRegionNumber() + + 1)) { regions.emplace_back(&alternative, !getOperands().empty() ? 
alternative.getArguments() : Block::BlockArgListType()); } if (!point.isParent()) - regions.emplace_back(getOperation()->getResults()); + regions.emplace_back(getOperation(), getOperation()->getResults()); } void transform::AlternativesOp::getRegionInvocationBounds( @@ -1740,16 +1743,18 @@ void transform::ForeachOp::getSuccessorRegions( } // Branch back to the region or the parent. - assert(point == getBody() && "unexpected region index"); + assert(point.getTerminatorPredecessorOrNull()->getParentRegion() == + &getBody() && + "unexpected region index"); regions.emplace_back(bodyRegion, bodyRegion->getArguments()); - regions.emplace_back(); + regions.emplace_back(getOperation(), getOperation()->getResults()); } OperandRange -transform::ForeachOp::getEntrySuccessorOperands(RegionBranchPoint point) { +transform::ForeachOp::getEntrySuccessorOperands(RegionSuccessor successor) { // Each block argument handle is mapped to a subset (one op to be precise) // of the payload of the corresponding `targets` operand of ForeachOp. 
- assert(point == getBody() && "unexpected region index"); + assert(successor.getSuccessor() == &getBody() && "unexpected region index"); return getOperation()->getOperands(); } @@ -2948,8 +2953,8 @@ void transform::SequenceOp::getEffects( } OperandRange -transform::SequenceOp::getEntrySuccessorOperands(RegionBranchPoint point) { - assert(point == getBody() && "unexpected region index"); +transform::SequenceOp::getEntrySuccessorOperands(RegionSuccessor successor) { + assert(successor.getSuccessor() == &getBody() && "unexpected region index"); if (getOperation()->getNumOperands() > 0) return getOperation()->getOperands(); return OperandRange(getOperation()->operand_end(), @@ -2966,8 +2971,10 @@ void transform::SequenceOp::getSuccessorRegions( return; } - assert(point == getBody() && "unexpected region index"); - regions.emplace_back(getOperation()->getResults()); + assert(point.getTerminatorPredecessorOrNull()->getParentRegion() == + &getBody() && + "unexpected region index"); + regions.emplace_back(getOperation(), getOperation()->getResults()); } void transform::SequenceOp::getRegionInvocationBounds( diff --git a/mlir/lib/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp b/mlir/lib/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp index c627158e999ed..f727118f3f9a0 100644 --- a/mlir/lib/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp +++ b/mlir/lib/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp @@ -9,6 +9,7 @@ #include "mlir/Dialect/Transform/IR/TransformOps.h" #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" #include "mlir/IR/OpImplementation.h" +#include "mlir/Interfaces/ControlFlowInterfaces.h" #include "llvm/Support/Debug.h" #include "mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.h" @@ -112,7 +113,7 @@ static void printAlternativesOpSelectedRegion(OpAsmPrinter &printer, } OperandRange transform::tune::AlternativesOp::getEntrySuccessorOperands( - RegionBranchPoint point) { + RegionSuccessor successor) { // No operands 
will be forwarded to the region(s). return getOperands().slice(0, 0); } @@ -128,7 +129,7 @@ void transform::tune::AlternativesOp::getSuccessorRegions( for (Region &alternative : getAlternatives()) regions.emplace_back(&alternative, Block::BlockArgListType()); else - regions.emplace_back(getOperation()->getResults()); + regions.emplace_back(getOperation(), getOperation()->getResults()); } void transform::tune::AlternativesOp::getRegionInvocationBounds( diff --git a/mlir/lib/IR/Diagnostics.cpp b/mlir/lib/IR/Diagnostics.cpp index 776b5c6588c71..f4c9242ed3479 100644 --- a/mlir/lib/IR/Diagnostics.cpp +++ b/mlir/lib/IR/Diagnostics.cpp @@ -138,6 +138,10 @@ Diagnostic &Diagnostic::operator<<(Operation &op) { return appendOp(op, OpPrintingFlags()); } +Diagnostic &Diagnostic::operator<<(OpWithFlags op) { + return appendOp(*op.getOperation(), op.flags()); +} + Diagnostic &Diagnostic::appendOp(Operation &op, const OpPrintingFlags &flags) { std::string str; llvm::raw_string_ostream os(str); diff --git a/mlir/lib/IR/Region.cpp b/mlir/lib/IR/Region.cpp index 46b6298076d48..15a941f380225 100644 --- a/mlir/lib/IR/Region.cpp +++ b/mlir/lib/IR/Region.cpp @@ -253,6 +253,21 @@ void Region::OpIterator::skipOverBlocksWithNoOps() { operation = block->begin(); } +llvm::raw_ostream &mlir::operator<<(llvm::raw_ostream &os, Region ®ion) { + if (!region.getParentOp()) { + os << "Region has no parent op"; + } else { + os << "Region #" << region.getRegionNumber() << " in operation " + << region.getParentOp()->getName(); + } + for (auto it : llvm::enumerate(region.getBlocks())) { + os << "\n Block #" << it.index() << ":"; + for (Operation &op : it.value().getOperations()) + os << "\n " << OpWithFlags(&op, OpPrintingFlags().skipRegions()); + } + return os; +} + //===----------------------------------------------------------------------===// // RegionRange //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp 
b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp index ca3f7666dba8a..1e56810ff7aaf 100644 --- a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp +++ b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp @@ -9,7 +9,9 @@ #include #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Operation.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" +#include "llvm/Support/DebugLog.h" using namespace mlir; @@ -38,20 +40,31 @@ SuccessorOperands::SuccessorOperands(unsigned int producedOperandCount, std::optional detail::getBranchSuccessorArgument(const SuccessorOperands &operands, unsigned operandIndex, Block *successor) { + LDBG() << "Getting branch successor argument for operand index " + << operandIndex << " in successor block"; + OperandRange forwardedOperands = operands.getForwardedOperands(); // Check that the operands are valid. - if (forwardedOperands.empty()) + if (forwardedOperands.empty()) { + LDBG() << "No forwarded operands, returning nullopt"; return std::nullopt; + } // Check to ensure that this operand is within the range. unsigned operandsStart = forwardedOperands.getBeginOperandIndex(); if (operandIndex < operandsStart || - operandIndex >= (operandsStart + forwardedOperands.size())) + operandIndex >= (operandsStart + forwardedOperands.size())) { + LDBG() << "Operand index " << operandIndex << " out of range [" + << operandsStart << ", " + << (operandsStart + forwardedOperands.size()) + << "), returning nullopt"; return std::nullopt; + } // Index the successor. 
unsigned argIndex = operands.getProducedOperandCount() + operandIndex - operandsStart; + LDBG() << "Computed argument index " << argIndex << " for successor block"; return successor->getArgument(argIndex); } @@ -59,9 +72,15 @@ detail::getBranchSuccessorArgument(const SuccessorOperands &operands, LogicalResult detail::verifyBranchSuccessorOperands(Operation *op, unsigned succNo, const SuccessorOperands &operands) { + LDBG() << "Verifying branch successor operands for successor #" << succNo + << " in operation " << op->getName(); + // Check the count. unsigned operandCount = operands.size(); Block *destBB = op->getSuccessor(succNo); + LDBG() << "Branch has " << operandCount << " operands, target block has " + << destBB->getNumArguments() << " arguments"; + if (operandCount != destBB->getNumArguments()) return op->emitError() << "branch has " << operandCount << " operands for successor #" << succNo @@ -69,13 +88,22 @@ detail::verifyBranchSuccessorOperands(Operation *op, unsigned succNo, << destBB->getNumArguments(); // Check the types. 
+ LDBG() << "Checking type compatibility for " + << (operandCount - operands.getProducedOperandCount()) + << " forwarded operands"; for (unsigned i = operands.getProducedOperandCount(); i != operandCount; ++i) { - if (!cast(op).areTypesCompatible( - operands[i].getType(), destBB->getArgument(i).getType())) + Type operandType = operands[i].getType(); + Type argType = destBB->getArgument(i).getType(); + LDBG() << "Checking type compatibility: operand type " << operandType + << " vs argument type " << argType; + + if (!cast(op).areTypesCompatible(operandType, argType)) return op->emitError() << "type mismatch for bb argument #" << i << " of successor #" << succNo; } + + LDBG() << "Branch successor operand verification successful"; return success(); } @@ -126,15 +154,15 @@ LogicalResult detail::verifyRegionBranchWeights(Operation *op) { static InFlightDiagnostic &printRegionEdgeName(InFlightDiagnostic &diag, RegionBranchPoint sourceNo, - RegionBranchPoint succRegionNo) { + RegionSuccessor succRegionNo) { diag << "from "; - if (Region *region = sourceNo.getRegionOrNull()) - diag << "Region #" << region->getRegionNumber(); + if (Operation *op = sourceNo.getTerminatorPredecessorOrNull()) + diag << "Operation " << op->getName(); else diag << "parent operands"; diag << " to "; - if (Region *region = succRegionNo.getRegionOrNull()) + if (Region *region = succRegionNo.getSuccessor()) diag << "Region #" << region->getRegionNumber(); else diag << "parent results"; @@ -145,13 +173,12 @@ static InFlightDiagnostic &printRegionEdgeName(InFlightDiagnostic &diag, /// `sourcePoint`. `getInputsTypesForRegion` is a function that returns the /// types of the inputs that flow to a successor region. 
static LogicalResult -verifyTypesAlongAllEdges(Operation *op, RegionBranchPoint sourcePoint, - function_ref(RegionBranchPoint)> +verifyTypesAlongAllEdges(RegionBranchOpInterface branchOp, + RegionBranchPoint sourcePoint, + function_ref(RegionSuccessor)> getInputsTypesForRegion) { - auto regionInterface = cast(op); - SmallVector successors; - regionInterface.getSuccessorRegions(sourcePoint, successors); + branchOp.getSuccessorRegions(sourcePoint, successors); for (RegionSuccessor &succ : successors) { FailureOr sourceTypes = getInputsTypesForRegion(succ); @@ -160,10 +187,14 @@ verifyTypesAlongAllEdges(Operation *op, RegionBranchPoint sourcePoint, TypeRange succInputsTypes = succ.getSuccessorInputs().getTypes(); if (sourceTypes->size() != succInputsTypes.size()) { - InFlightDiagnostic diag = op->emitOpError("region control flow edge "); + InFlightDiagnostic diag = + branchOp->emitOpError("region control flow edge "); + std::string succStr; + llvm::raw_string_ostream os(succStr); + os << succ; return printRegionEdgeName(diag, sourcePoint, succ) << ": source has " << sourceTypes->size() - << " operands, but target successor needs " + << " operands, but target successor " << os.str() << " needs " << succInputsTypes.size(); } @@ -171,8 +202,10 @@ verifyTypesAlongAllEdges(Operation *op, RegionBranchPoint sourcePoint, llvm::enumerate(llvm::zip(*sourceTypes, succInputsTypes))) { Type sourceType = std::get<0>(typesIdx.value()); Type inputType = std::get<1>(typesIdx.value()); - if (!regionInterface.areTypesCompatible(sourceType, inputType)) { - InFlightDiagnostic diag = op->emitOpError("along control flow edge "); + + if (!branchOp.areTypesCompatible(sourceType, inputType)) { + InFlightDiagnostic diag = + branchOp->emitOpError("along control flow edge "); return printRegionEdgeName(diag, sourcePoint, succ) << ": source type #" << typesIdx.index() << " " << sourceType << " should match input type #" << typesIdx.index() << " " @@ -180,6 +213,7 @@ 
verifyTypesAlongAllEdges(Operation *op, RegionBranchPoint sourcePoint, } } } + return success(); } @@ -187,34 +221,18 @@ verifyTypesAlongAllEdges(Operation *op, RegionBranchPoint sourcePoint, LogicalResult detail::verifyTypesAlongControlFlowEdges(Operation *op) { auto regionInterface = cast(op); - auto inputTypesFromParent = [&](RegionBranchPoint point) -> TypeRange { - return regionInterface.getEntrySuccessorOperands(point).getTypes(); + auto inputTypesFromParent = [&](RegionSuccessor successor) -> TypeRange { + return regionInterface.getEntrySuccessorOperands(successor).getTypes(); }; // Verify types along control flow edges originating from the parent. - if (failed(verifyTypesAlongAllEdges(op, RegionBranchPoint::parent(), - inputTypesFromParent))) + if (failed(verifyTypesAlongAllEdges( + regionInterface, RegionBranchPoint::parent(), inputTypesFromParent))) return failure(); - auto areTypesCompatible = [&](TypeRange lhs, TypeRange rhs) { - if (lhs.size() != rhs.size()) - return false; - for (auto types : llvm::zip(lhs, rhs)) { - if (!regionInterface.areTypesCompatible(std::get<0>(types), - std::get<1>(types))) { - return false; - } - } - return true; - }; - // Verify types along control flow edges originating from each region. for (Region ®ion : op->getRegions()) { - - // Since there can be multiple terminators implementing the - // `RegionBranchTerminatorOpInterface`, all should have the same operand - // types when passing them to the same region. - + // Collect all return-like terminators in the region. 
SmallVector regionReturnOps; for (Block &block : region) if (!block.empty()) @@ -227,33 +245,20 @@ LogicalResult detail::verifyTypesAlongControlFlowEdges(Operation *op) { if (regionReturnOps.empty()) continue; - auto inputTypesForRegion = - [&](RegionBranchPoint point) -> FailureOr { - std::optional regionReturnOperands; - for (RegionBranchTerminatorOpInterface regionReturnOp : regionReturnOps) { - auto terminatorOperands = regionReturnOp.getSuccessorOperands(point); - - if (!regionReturnOperands) { - regionReturnOperands = terminatorOperands; - continue; - } - - // Found more than one ReturnLike terminator. Make sure the operand - // types match with the first one. - if (!areTypesCompatible(regionReturnOperands->getTypes(), - terminatorOperands.getTypes())) { - InFlightDiagnostic diag = op->emitOpError("along control flow edge"); - return printRegionEdgeName(diag, region, point) - << " operands mismatch between return-like terminators"; - } - } - - // All successors get the same set of operand types. - return TypeRange(regionReturnOperands->getTypes()); - }; - - if (failed(verifyTypesAlongAllEdges(op, region, inputTypesForRegion))) - return failure(); + // Verify types along control flow edges originating from each return-like + // terminator. 
+ for (RegionBranchTerminatorOpInterface regionReturnOp : regionReturnOps) { + + auto inputTypesForRegion = + [&](RegionSuccessor successor) -> FailureOr { + OperandRange terminatorOperands = + regionReturnOp.getSuccessorOperands(successor); + return TypeRange(terminatorOperands.getTypes()); + }; + if (failed(verifyTypesAlongAllEdges(regionInterface, regionReturnOp, + inputTypesForRegion))) + return failure(); + } } return success(); @@ -272,31 +277,74 @@ using StopConditionFn = function_ref visited)>; static bool traverseRegionGraph(Region *begin, StopConditionFn stopConditionFn) { auto op = cast(begin->getParentOp()); + LDBG() << "Starting region graph traversal from region #" + << begin->getRegionNumber() << " in operation " << op->getName(); + SmallVector visited(op->getNumRegions(), false); visited[begin->getRegionNumber()] = true; + LDBG() << "Initialized visited array with " << op->getNumRegions() + << " regions"; // Retrieve all successors of the region and enqueue them in the worklist. 
SmallVector worklist; auto enqueueAllSuccessors = [&](Region *region) { - SmallVector successors; - op.getSuccessorRegions(region, successors); - for (RegionSuccessor successor : successors) - if (!successor.isParent()) - worklist.push_back(successor.getSuccessor()); + LDBG() << "Enqueuing successors for region #" << region->getRegionNumber(); + SmallVector operandAttributes(op->getNumOperands()); + for (Block &block : *region) { + if (block.empty()) + continue; + auto terminator = + dyn_cast(block.back()); + if (!terminator) + continue; + SmallVector successors; + operandAttributes.resize(terminator->getNumOperands()); + terminator.getSuccessorRegions(operandAttributes, successors); + LDBG() << "Found " << successors.size() + << " successors from terminator in block"; + for (RegionSuccessor successor : successors) { + if (!successor.isParent()) { + worklist.push_back(successor.getSuccessor()); + LDBG() << "Added region #" + << successor.getSuccessor()->getRegionNumber() + << " to worklist"; + } else { + LDBG() << "Skipping parent successor"; + } + } + } }; enqueueAllSuccessors(begin); + LDBG() << "Initial worklist size: " << worklist.size(); // Process all regions in the worklist via DFS. 
while (!worklist.empty()) { Region *nextRegion = worklist.pop_back_val(); - if (stopConditionFn(nextRegion, visited)) + LDBG() << "Processing region #" << nextRegion->getRegionNumber() + << " from worklist (remaining: " << worklist.size() << ")"; + + if (stopConditionFn(nextRegion, visited)) { + LDBG() << "Stop condition met for region #" + << nextRegion->getRegionNumber() << ", returning true"; return true; - if (visited[nextRegion->getRegionNumber()]) + } + llvm::dbgs() << "Region: " << nextRegion << "\n"; + if (!nextRegion->getParentOp()) { + llvm::errs() << "Region " << *nextRegion << " has no parent op\n"; + return false; + } + if (visited[nextRegion->getRegionNumber()]) { + LDBG() << "Region #" << nextRegion->getRegionNumber() + << " already visited, skipping"; continue; + } visited[nextRegion->getRegionNumber()] = true; + LDBG() << "Marking region #" << nextRegion->getRegionNumber() + << " as visited"; enqueueAllSuccessors(nextRegion); } + LDBG() << "Traversal completed, returning false"; return false; } @@ -322,18 +370,26 @@ static bool isRegionReachable(Region *begin, Region *r) { /// mutually exclusive if they are not reachable from each other as per /// RegionBranchOpInterface::getSuccessorRegions. bool mlir::insideMutuallyExclusiveRegions(Operation *a, Operation *b) { + LDBG() << "Checking if operations are in mutually exclusive regions: " + << a->getName() << " and " << b->getName(); + assert(a && "expected non-empty operation"); assert(b && "expected non-empty operation"); auto branchOp = a->getParentOfType(); while (branchOp) { + LDBG() << "Checking branch operation " << branchOp->getName(); + // Check if b is inside branchOp. (We already know that a is.) if (!branchOp->isProperAncestor(b)) { + LDBG() << "Operation b is not inside branchOp, checking next ancestor"; // Check next enclosing RegionBranchOpInterface. 
branchOp = branchOp->getParentOfType(); continue; } + LDBG() << "Both operations are inside branchOp, finding their regions"; + // b is contained in branchOp. Retrieve the regions in which `a` and `b` // are contained. Region *regionA = nullptr, *regionB = nullptr; @@ -341,63 +397,136 @@ bool mlir::insideMutuallyExclusiveRegions(Operation *a, Operation *b) { if (r.findAncestorOpInRegion(*a)) { assert(!regionA && "already found a region for a"); regionA = &r; + LDBG() << "Found region #" << r.getRegionNumber() << " for operation a"; } if (r.findAncestorOpInRegion(*b)) { assert(!regionB && "already found a region for b"); regionB = &r; + LDBG() << "Found region #" << r.getRegionNumber() << " for operation b"; } } assert(regionA && regionB && "could not find region of op"); + LDBG() << "Region A: #" << regionA->getRegionNumber() << ", Region B: #" + << regionB->getRegionNumber(); + // `a` and `b` are in mutually exclusive regions if both regions are // distinct and neither region is reachable from the other region. - return regionA != regionB && !isRegionReachable(regionA, regionB) && - !isRegionReachable(regionB, regionA); + bool regionsAreDistinct = (regionA != regionB); + bool aNotReachableFromB = !isRegionReachable(regionA, regionB); + bool bNotReachableFromA = !isRegionReachable(regionB, regionA); + + LDBG() << "Regions distinct: " << regionsAreDistinct + << ", A not reachable from B: " << aNotReachableFromB + << ", B not reachable from A: " << bNotReachableFromA; + + bool mutuallyExclusive = + regionsAreDistinct && aNotReachableFromB && bNotReachableFromA; + LDBG() << "Operations are mutually exclusive: " << mutuallyExclusive; + + return mutuallyExclusive; } // Could not find a common RegionBranchOpInterface among a's and b's // ancestors. 
+ LDBG() << "No common RegionBranchOpInterface found, operations are not " + "mutually exclusive"; return false; } bool RegionBranchOpInterface::isRepetitiveRegion(unsigned index) { + LDBG() << "Checking if region #" << index << " is repetitive in operation " + << getOperation()->getName(); + Region *region = &getOperation()->getRegion(index); - return isRegionReachable(region, region); + bool isRepetitive = isRegionReachable(region, region); + + LDBG() << "Region #" << index << " is repetitive: " << isRepetitive; + return isRepetitive; } bool RegionBranchOpInterface::hasLoop() { + LDBG() << "Checking if operation " << getOperation()->getName() + << " has loops"; + SmallVector entryRegions; getSuccessorRegions(RegionBranchPoint::parent(), entryRegions); - for (RegionSuccessor successor : entryRegions) - if (!successor.isParent() && - traverseRegionGraph(successor.getSuccessor(), - [](Region *nextRegion, ArrayRef visited) { - // Interrupt traversal if the region was already - // visited. - return visited[nextRegion->getRegionNumber()]; - })) - return true; + LDBG() << "Found " << entryRegions.size() << " entry regions"; + + for (RegionSuccessor successor : entryRegions) { + if (!successor.isParent()) { + LDBG() << "Checking entry region #" + << successor.getSuccessor()->getRegionNumber() << " for loops"; + + bool hasLoop = + traverseRegionGraph(successor.getSuccessor(), + [](Region *nextRegion, ArrayRef visited) { + // Interrupt traversal if the region was already + // visited. 
+ return visited[nextRegion->getRegionNumber()]; + }); + + if (hasLoop) { + LDBG() << "Found loop in entry region #" + << successor.getSuccessor()->getRegionNumber(); + return true; + } + } else { + LDBG() << "Skipping parent successor"; + } + } + + LDBG() << "No loops found in operation"; return false; } Region *mlir::getEnclosingRepetitiveRegion(Operation *op) { + LDBG() << "Finding enclosing repetitive region for operation " + << op->getName(); + while (Region *region = op->getParentRegion()) { + LDBG() << "Checking region #" << region->getRegionNumber() + << " in operation " << region->getParentOp()->getName(); + op = region->getParentOp(); - if (auto branchOp = dyn_cast(op)) - if (branchOp.isRepetitiveRegion(region->getRegionNumber())) + if (auto branchOp = dyn_cast(op)) { + LDBG() + << "Found RegionBranchOpInterface, checking if region is repetitive"; + if (branchOp.isRepetitiveRegion(region->getRegionNumber())) { + LDBG() << "Found repetitive region #" << region->getRegionNumber(); return region; + } + } else { + LDBG() << "Parent operation does not implement RegionBranchOpInterface"; + } } + + LDBG() << "No enclosing repetitive region found"; return nullptr; } Region *mlir::getEnclosingRepetitiveRegion(Value value) { + LDBG() << "Finding enclosing repetitive region for value"; + Region *region = value.getParentRegion(); while (region) { + LDBG() << "Checking region #" << region->getRegionNumber() + << " in operation " << region->getParentOp()->getName(); + Operation *op = region->getParentOp(); - if (auto branchOp = dyn_cast(op)) - if (branchOp.isRepetitiveRegion(region->getRegionNumber())) + if (auto branchOp = dyn_cast(op)) { + LDBG() + << "Found RegionBranchOpInterface, checking if region is repetitive"; + if (branchOp.isRepetitiveRegion(region->getRegionNumber())) { + LDBG() << "Found repetitive region #" << region->getRegionNumber(); return region; + } + } else { + LDBG() << "Parent operation does not implement RegionBranchOpInterface"; + } region = 
op->getParentRegion(); } + + LDBG() << "No enclosing repetitive region found for value"; return nullptr; } diff --git a/mlir/lib/Transforms/RemoveDeadValues.cpp b/mlir/lib/Transforms/RemoveDeadValues.cpp index e0c65b0e09774..41f3f9d76a3b1 100644 --- a/mlir/lib/Transforms/RemoveDeadValues.cpp +++ b/mlir/lib/Transforms/RemoveDeadValues.cpp @@ -432,8 +432,7 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp, // Return the successors of `region` if the latter is not null. Else return // the successors of `regionBranchOp`. - auto getSuccessors = [&](Region *region = nullptr) { - auto point = region ? region : RegionBranchPoint::parent(); + auto getSuccessors = [&](RegionBranchPoint point) { SmallVector successors; regionBranchOp.getSuccessorRegions(point, successors); return successors; @@ -456,7 +455,8 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp, // `nonForwardedOperands`. auto markNonForwardedOperands = [&](BitVector &nonForwardedOperands) { nonForwardedOperands.resize(regionBranchOp->getNumOperands(), true); - for (const RegionSuccessor &successor : getSuccessors()) { + for (const RegionSuccessor &successor : + getSuccessors(RegionBranchPoint::parent())) { for (OpOperand *opOperand : getForwardedOpOperands(successor)) nonForwardedOperands.reset(opOperand->getOperandNumber()); } @@ -469,10 +469,13 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp, for (Region ®ion : regionBranchOp->getRegions()) { if (region.empty()) continue; + // TODO: this isn't correct in face of multiple terminators. 
Operation *terminator = region.front().getTerminator(); nonForwardedRets[terminator] = BitVector(terminator->getNumOperands(), true); - for (const RegionSuccessor &successor : getSuccessors(®ion)) { + for (const RegionSuccessor &successor : + getSuccessors(RegionBranchPoint( + cast(terminator)))) { for (OpOperand *opOperand : getForwardedOpOperands(successor, terminator)) nonForwardedRets[terminator].reset(opOperand->getOperandNumber()); @@ -489,8 +492,13 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp, DenseMap &argsToKeep, Region *region = nullptr) { Operation *terminator = region ? region->front().getTerminator() : nullptr; + RegionBranchPoint point = + terminator + ? RegionBranchPoint( + cast(terminator)) + : RegionBranchPoint::parent(); - for (const RegionSuccessor &successor : getSuccessors(region)) { + for (const RegionSuccessor &successor : getSuccessors(point)) { Region *successorRegion = successor.getSuccessor(); for (auto [opOperand, input] : llvm::zip(getForwardedOpOperands(successor, terminator), @@ -517,7 +525,8 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp, resultsOrArgsToKeepChanged = false; // Recompute `resultsToKeep` and `argsToKeep` based on `operandsToKeep`. 
- for (const RegionSuccessor &successor : getSuccessors()) { + for (const RegionSuccessor &successor : + getSuccessors(RegionBranchPoint::parent())) { Region *successorRegion = successor.getSuccessor(); for (auto [opOperand, input] : llvm::zip(getForwardedOpOperands(successor), @@ -551,7 +560,9 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp, if (region.empty()) continue; Operation *terminator = region.front().getTerminator(); - for (const RegionSuccessor &successor : getSuccessors(®ion)) { + for (const RegionSuccessor &successor : + getSuccessors(RegionBranchPoint( + cast(terminator)))) { Region *successorRegion = successor.getSuccessor(); for (auto [opOperand, input] : llvm::zip(getForwardedOpOperands(successor, terminator), diff --git a/mlir/test/Dialect/SCF/invalid.mlir b/mlir/test/Dialect/SCF/invalid.mlir index 37fc86b18e7f0..3f481ad5dbba7 100644 --- a/mlir/test/Dialect/SCF/invalid.mlir +++ b/mlir/test/Dialect/SCF/invalid.mlir @@ -373,7 +373,7 @@ func.func @reduceReturn_not_inside_reduce(%arg0 : f32) { func.func @std_if_incorrect_yield(%arg0: i1, %arg1: f32) { - // expected-error@+1 {{region control flow edge from Region #0 to parent results: source has 1 operands, but target successor needs 2}} + // expected-error@+1 {{region control flow edge from Operation scf.yield to parent results: source has 1 operands, but target successor needs 2}} %x, %y = scf.if %arg0 -> (f32, f32) { %0 = arith.addf %arg1, %arg1 : f32 scf.yield %0 : f32 @@ -544,7 +544,7 @@ func.func @while_invalid_terminator() { func.func @while_cross_region_type_mismatch() { %true = arith.constant true - // expected-error@+1 {{'scf.while' op region control flow edge from Region #0 to Region #1: source has 0 operands, but target successor needs 1}} + // expected-error@+1 {{region control flow edge from Operation scf.condition to Region #1: source has 0 operands, but target successor needs 1}} scf.while : () -> () { scf.condition(%true) } do { @@ -557,7 +557,7 @@ func.func 
@while_cross_region_type_mismatch() { func.func @while_cross_region_type_mismatch() { %true = arith.constant true - // expected-error@+1 {{'scf.while' op along control flow edge from Region #0 to Region #1: source type #0 'i1' should match input type #0 'i32'}} + // expected-error@+1 {{along control flow edge from Operation scf.condition to Region #1: source type #0 'i1' should match input type #0 'i32'}} %0 = scf.while : () -> (i1) { scf.condition(%true) %true : i1 } do { @@ -570,7 +570,7 @@ func.func @while_cross_region_type_mismatch() { func.func @while_result_type_mismatch() { %true = arith.constant true - // expected-error@+1 {{'scf.while' op region control flow edge from Region #0 to parent results: source has 1 operands, but target successor needs 0}} + // expected-error@+1 {{region control flow edge from Operation scf.condition to parent results: source has 1 operands, but target successor needs 0}} scf.while : () -> () { scf.condition(%true) %true : i1 } do { diff --git a/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp b/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp index eb0d9801e7d3f..7a7a58384fbb8 100644 --- a/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp +++ b/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp @@ -66,7 +66,7 @@ class NextAccessAnalysis : public DenseBackwardDataFlowAnalysis { void visitRegionBranchControlFlowTransfer(RegionBranchOpInterface branch, RegionBranchPoint regionFrom, - RegionBranchPoint regionTo, + RegionSuccessor regionTo, const NextAccess &after, NextAccess *before) override; @@ -240,7 +240,7 @@ void NextAccessAnalysis::visitCallControlFlowTransfer( void NextAccessAnalysis::visitRegionBranchControlFlowTransfer( RegionBranchOpInterface branch, RegionBranchPoint regionFrom, - RegionBranchPoint regionTo, const NextAccess &after, NextAccess *before) { + RegionSuccessor regionTo, const NextAccess &after, NextAccess *before) { LDBG() << 
"visitRegionBranchControlFlowTransfer: " << OpWithFlags(branch.getOperation(), OpPrintingFlags().skipRegions()); LDBG() << " regionFrom: " << (regionFrom.isParent() ? "parent" : "region"); diff --git a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp index b211e243f234c..4d4ec02546bc7 100644 --- a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp +++ b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp @@ -633,8 +633,9 @@ ParseResult RegionIfOp::parse(OpAsmParser &parser, OperationState &result) { parser.getCurrentLocation(), result.operands); } -OperandRange RegionIfOp::getEntrySuccessorOperands(RegionBranchPoint point) { - assert(llvm::is_contained({&getThenRegion(), &getElseRegion()}, point) && +OperandRange RegionIfOp::getEntrySuccessorOperands(RegionSuccessor successor) { + assert(llvm::is_contained({&getThenRegion(), &getElseRegion()}, + successor.getSuccessor()) && "invalid region index"); return getOperands(); } @@ -643,10 +644,11 @@ void RegionIfOp::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl ®ions) { // We always branch to the join region. 
if (!point.isParent()) { - if (point != getJoinRegion()) + if (point.getTerminatorPredecessorOrNull()->getParentRegion() != + &getJoinRegion()) regions.push_back(RegionSuccessor(&getJoinRegion(), getJoinArgs())); else - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); return; } @@ -673,7 +675,7 @@ void AnyCondOp::getSuccessorRegions(RegionBranchPoint point, if (point.isParent()) regions.emplace_back(&getRegion()); else - regions.emplace_back(getResults()); + regions.emplace_back(getOperation(), getResults()); } void AnyCondOp::getRegionInvocationBounds( @@ -1107,11 +1109,11 @@ void LoopBlockOp::getSuccessorRegions( if (point.isParent()) return; - regions.emplace_back((*this)->getResults()); + regions.emplace_back(getOperation(), getOperation()->getResults()); } -OperandRange LoopBlockOp::getEntrySuccessorOperands(RegionBranchPoint point) { - assert(point == getBody()); +OperandRange LoopBlockOp::getEntrySuccessorOperands(RegionSuccessor successor) { + assert(successor.getSuccessor() == &getBody()); return MutableOperandRange(getInitMutable()); } @@ -1120,8 +1122,8 @@ OperandRange LoopBlockOp::getEntrySuccessorOperands(RegionBranchPoint point) { //===----------------------------------------------------------------------===// MutableOperandRange -LoopBlockTerminatorOp::getMutableSuccessorOperands(RegionBranchPoint point) { - if (point.isParent()) +LoopBlockTerminatorOp::getMutableSuccessorOperands(RegionSuccessor successor) { + if (successor.isParent()) return getExitArgMutable(); return getNextIterArgMutable(); } @@ -1213,7 +1215,7 @@ void TestStoreWithARegion::getSuccessorRegions( if (point.isParent()) regions.emplace_back(&getBody(), getBody().front().getArguments()); else - regions.emplace_back(); + regions.emplace_back(getOperation(), getOperation()->getResults()); } //===----------------------------------------------------------------------===// @@ -1227,7 +1229,7 @@ void 
TestStoreWithALoopRegion::getSuccessorRegions( // enter the body. regions.emplace_back( RegionSuccessor(&getBody(), getBody().front().getArguments())); - regions.emplace_back(); + regions.emplace_back(getOperation(), getOperation()->getResults()); } //===----------------------------------------------------------------------===// diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 05a33cf1afd94..a3430ba49a291 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -2581,7 +2581,7 @@ def LoopBlockTerminatorOp : TEST_Op<"loop_block_term", def TestNoTerminatorOp : TEST_Op<"switch_with_no_break", [ NoTerminator, - DeclareOpInterfaceMethods + DeclareOpInterfaceMethods ]> { let arguments = (ins Index:$arg, DenseI64ArrayAttr:$cases); let regions = (region VariadicRegion>:$caseRegions); diff --git a/mlir/unittests/Interfaces/ControlFlowInterfacesTest.cpp b/mlir/unittests/Interfaces/ControlFlowInterfacesTest.cpp index f1aae15393fd3..2e6950fca6be2 100644 --- a/mlir/unittests/Interfaces/ControlFlowInterfacesTest.cpp +++ b/mlir/unittests/Interfaces/ControlFlowInterfacesTest.cpp @@ -13,17 +13,24 @@ #include "mlir/IR/OpDefinition.h" #include "mlir/IR/OpImplementation.h" #include "mlir/Parser/Parser.h" +#include "llvm/Support/DebugLog.h" #include using namespace mlir; /// A dummy op that is also a terminator. -struct DummyOp : public Op { +struct DummyOp : public Op { using Op::Op; static ArrayRef getAttributeNames() { return {}; } static StringRef getOperationName() { return "cftest.dummy_op"; } + + MutableOperandRange getMutableSuccessorOperands(RegionSuccessor point) { + return MutableOperandRange(getOperation(), 0, 0); + } }; /// All regions of this op are mutually exclusive. @@ -39,6 +46,8 @@ struct MutuallyExclusiveRegionsOp // Regions have no successors. 
void getSuccessorRegions(RegionBranchPoint point, SmallVectorImpl ®ions) {} + using RegionBranchOpInterface::Trait< + MutuallyExclusiveRegionsOp>::getSuccessorRegions; }; /// All regions of this op call each other in a large circle. @@ -53,13 +62,18 @@ struct LoopRegionsOp void getSuccessorRegions(RegionBranchPoint point, SmallVectorImpl ®ions) { - if (Region *region = point.getRegionOrNull()) { - if (point == (*this)->getRegion(1)) + if (point.getTerminatorPredecessorOrNull()) { + Region *region = + point.getTerminatorPredecessorOrNull()->getParentRegion(); + if (region == &(*this)->getRegion(1)) // This region also branches back to the parent. - regions.push_back(RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation()->getParentOp(), + getOperation()->getParentOp()->getResults())); regions.push_back(RegionSuccessor(region)); } } + using RegionBranchOpInterface::Trait::getSuccessorRegions; }; /// Each region branches back it itself or the parent. @@ -75,11 +89,17 @@ struct DoubleLoopRegionsOp void getSuccessorRegions(RegionBranchPoint point, SmallVectorImpl ®ions) { - if (Region *region = point.getRegionOrNull()) { - regions.push_back(RegionSuccessor()); + if (point.getTerminatorPredecessorOrNull()) { + Region *region = + point.getTerminatorPredecessorOrNull()->getParentRegion(); + regions.push_back( + RegionSuccessor(getOperation()->getParentOp(), + getOperation()->getParentOp()->getResults())); regions.push_back(RegionSuccessor(region)); } } + using RegionBranchOpInterface::Trait< + DoubleLoopRegionsOp>::getSuccessorRegions; }; /// Regions are executed sequentially. @@ -93,11 +113,15 @@ struct SequentialRegionsOp // Region 0 has Region 1 as a successor. 
void getSuccessorRegions(RegionBranchPoint point, SmallVectorImpl ®ions) { - if (point == (*this)->getRegion(0)) { + if (point.getTerminatorPredecessorOrNull() && + point.getTerminatorPredecessorOrNull()->getParentRegion() == + &(*this)->getRegion(0)) { Operation *thisOp = this->getOperation(); regions.push_back(RegionSuccessor(&thisOp->getRegion(1))); } } + using RegionBranchOpInterface::Trait< + SequentialRegionsOp>::getSuccessorRegions; }; /// A dialect putting all the above together. From 7b9d1fca919f28a469ce720a17de9875c2e1287a Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Tue, 28 Oct 2025 12:54:20 -0400 Subject: [PATCH 037/539] [PowerPC] Implement Context Switch Instr mtlpl (#160593) Add new instruction `mtlpl`. --- llvm/lib/Target/PowerPC/PPCInstrFuture.td | 20 +++++++++++++++++++ .../PowerPC/ppc-encoding-ISAFuture.txt | 3 +++ .../PowerPC/ppc64le-encoding-ISAFuture.txt | 3 +++ llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s | 4 ++++ 4 files changed, 30 insertions(+) diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index b0bed71c6755f..da3efdc15f1e1 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -194,6 +194,22 @@ class XX3Form_XTAB6 opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = XT{5}; } +class XForm_RBS5 opCode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : I { + + bits<5> RB; + bits<5> RS; + + let Pattern = pattern; + + let Inst{6...10} = RS; + let Inst{11...15} = 0; + let Inst{16...20} = RB; + let Inst{21...30} = xo; + let Inst{31} = 0; +} + class XX3Form_XTAB6_S xo, dag OOL, dag IOL, string asmstr, list pattern> : I<59, OOL, IOL, asmstr, NoItinerary> { @@ -317,12 +333,16 @@ let Predicates = [IsISAFuture] in { def TLBIEIO : XForm_RSB5_UIMM2<31, 18, (outs), (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC), "tlbieio $RB, $RS, $RIC", []>; + def MTLPL : XForm_RBS5<31, 275, (outs), (ins gprc:$RB, 
gprc:$RS), + "mtlpl $RB, $RS", IIC_SprMTSPR, []>; let Interpretation64Bit = 1, isCodeGenOnly = 1 in { def TLBIEP8 : XForm_RSB5_UIMM2_2UIMM1<31, 50, (outs), (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC, u1imm:$PRS, u1imm:$R), "tlbiep $RB, $RS, $RIC, $PRS, $R", []>; + def MTLPL8 : XForm_RBS5<31, 275, (outs), (ins g8rc:$RB, g8rc:$RS), + "mtlpl $RB, $RS", IIC_SprMTSPR, []>, isPPC64; } } diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt index 054489ce51a60..f5cb4b72959f9 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt @@ -286,6 +286,9 @@ #CHECK: xvmulhuh 4, 5, 7 0xf0,0x85,0x3b,0xd0 +#CHECK: mtlpl 3, 4 +0x7c,0x80,0x1a,0x26 + #CHECK: xxmulmul 8, 3, 4, 2 0xed,0x03,0x22,0x08 diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt index 17d1413bacc3a..f0df8ce39021b 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt @@ -280,6 +280,9 @@ #CHECK: xvmulhuh 4, 5, 7 0xd0,0x3b,0x85,0xf0 +#CHECK: mtlpl 3, 4 +0x26,0x1a,0x80,0x7c + #CHECK: xxmulmul 8, 3, 4, 2 0x08,0x22,0x03,0xed diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s index e5bc1f47bf666..bc0683e38887c 100644 --- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s +++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s @@ -403,6 +403,10 @@ #CHECK-BE: xvmulhuh 4, 5, 7 # encoding: [0xf0,0x85,0x3b,0xd0] #CHECK-LE: xvmulhuh 4, 5, 7 # encoding: [0xd0,0x3b,0x85,0xf0] + mtlpl 3, 4 +#CHECK-BE: mtlpl 3, 4 # encoding: [0x7c,0x80,0x1a,0x26] +#CHECK-LE: mtlpl 3, 4 # encoding: [0x26,0x1a,0x80,0x7c] + xxmulmul 8, 3, 4, 2 #CHECK-BE: xxmulmul 8, 3, 4, 2 # encoding: [0xed,0x03,0x22,0x08] #CHECK-LE: xxmulmul 8, 3, 4, 2 # encoding: 
[0x08,0x22,0x03,0xed] From 3ddeb486835906c5f94008dfeb5e17e5a26b8cea Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 28 Oct 2025 09:56:32 -0700 Subject: [PATCH 038/539] [MLIR] Fix some typos in AffineOps.td (NFC) --- mlir/include/mlir/Dialect/Affine/IR/AffineOps.td | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td index 12a79358d42f1..409bd05292e0d 100644 --- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td @@ -714,7 +714,7 @@ def AffineParallelOp : Affine_Op<"parallel", operand_range getUpperBoundsOperands(); AffineValueMap getUpperBoundsValueMap(); - /// Sets elements fo the loop upper bound. + /// Sets elements of the loop upper bound. void setUpperBounds(ValueRange operands, AffineMap map); void setSteps(ArrayRef newSteps); @@ -999,7 +999,7 @@ def AffineVectorStoreOp : AffineStoreOpBase<"vector_store"> { elemental type, supplied as its second operand. The index for each memref dimension is an affine expression of loop induction variables and symbols. These indices determine the start position - of the write within the memref. The shape of th input vector determines the + of the write within the memref. The shape of the input vector determines the shape of the slice written to the memref. This slice is contiguous along the respective dimensions of the shape. Strided vector stores will be supported in the future. @@ -1188,7 +1188,7 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index", If all `N` basis elements are provided, the linearize_index operation is said to "have an outer bound". - As a convenience, and for symmetry with `getPaddedBasis()`, ifg the first + As a convenience, and for symmetry with `getPaddedBasis()`, if the first element of a set of `OpFoldResult`s passed to the builders of this operation is `nullptr`, that element is ignored. 
From 1e7486dabce03a79d2c367153dfba84bd933434e Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Tue, 28 Oct 2025 16:59:53 +0000 Subject: [PATCH 039/539] [LV] Bundle (partial) reductions with a mul of a constant (#162503) A reduction (including partial reductions) with a multiply of a constant value can be bundled by first converting it from `reduce.add(mul(ext, const))` to `reduce.add(mul(ext, ext(const)))` as long as it is safe to extend the constant. This PR adds such bundling by first truncating the constant to the source type of the other extend, then extending it to the destination type of the extend. The first truncate is necessary so that the types of each extend's operand are then the same, and the call to canConstantBeExtended proves that the extend following a truncate is safe to do. The truncate is removed by optimisations. This is a stacked PR, 1a and 1b can be merged in any order: 1a. https://github.com/llvm/llvm-project/pull/147302 1b. https://github.com/llvm/llvm-project/pull/163175 2. -> https://github.com/llvm/llvm-project/pull/162503 --- .../Transforms/Vectorize/VPlanTransforms.cpp | 63 +- .../LoopVectorize/reduction-inloop.ll | 82 +++ .../vplan-printing-reductions.ll | 542 ++++++++++++++++++ 3 files changed, 676 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index acad795e327ba..d9ac26bba7507 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -3648,6 +3648,37 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, Sub = VecOp->getDefiningRecipe(); VecOp = Tmp; } + + // If ValB is a constant and can be safely extended, truncate it to the same + // type as ExtA's operand, then extend it to the same type as ExtA. This + // creates two uniform extends that can more easily be matched by the rest of + // the bundling code. 
The ExtB reference, ValB and operand 1 of Mul are all + // replaced with the new extend of the constant. + auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA, + VPWidenCastRecipe *&ExtB, + VPValue *&ValB, VPWidenRecipe *Mul) { + if (!ExtA || ExtB || !ValB->isLiveIn()) + return; + Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0)); + Instruction::CastOps ExtOpc = ExtA->getOpcode(); + const APInt *Const; + if (!match(ValB, m_APInt(Const)) || + !llvm::canConstantBeExtended( + Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc))) + return; + // The truncate ensures that the type of each extended operand is the + // same, and it's been proven that the constant can be extended from + // NarrowTy safely. Necessary since ExtA's extended operand would be + // e.g. an i8, while the const will likely be an i32. This will be + // elided by later optimisations. + VPBuilder Builder(Mul); + auto *Trunc = + Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy); + Type *WideTy = Ctx.Types.inferScalarType(ExtA); + ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy); + Mul->setOperand(1, ExtB); + }; + // Try to match reduce.add(mul(...)). if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) { auto *RecipeA = @@ -3656,6 +3687,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, dyn_cast_if_present(B->getDefiningRecipe()); auto *Mul = cast(VecOp->getDefiningRecipe()); + // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const))) + ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul); + // Match reduce.add/sub(mul(ext, ext)). if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) && match(RecipeB, m_ZExtOrSExt(m_VPValue())) && @@ -3665,7 +3699,6 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, cast(Sub), Red); return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red); } - // Match reduce.add(mul). 
// TODO: Add an expression type for this variant with a negated mul if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr)) return new VPExpressionRecipe(Mul, Red); @@ -3674,18 +3707,26 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, // variants. if (Sub) return nullptr; - // Match reduce.add(ext(mul(ext(A), ext(B)))). - // All extend recipes must have same opcode or A == B - // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))). - if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()), - m_ZExtOrSExt(m_VPValue()))))) { + + // Match reduce.add(ext(mul(A, B))). + if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) { auto *Ext = cast(VecOp->getDefiningRecipe()); auto *Mul = cast(Ext->getOperand(0)->getDefiningRecipe()); - auto *Ext0 = - cast(Mul->getOperand(0)->getDefiningRecipe()); - auto *Ext1 = - cast(Mul->getOperand(1)->getDefiningRecipe()); - if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) && + auto *Ext0 = dyn_cast_if_present(A->getDefiningRecipe()); + auto *Ext1 = dyn_cast_if_present(B->getDefiningRecipe()); + + // reduce.add(ext(mul(ext, const))) + // -> reduce.add(ext(mul(ext, ext(const)))) + ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul); + + // reduce.add(ext(mul(ext(A), ext(B)))) + // -> reduce.add(mul(wider_ext(A), wider_ext(B))) + // The inner extends must either have the same opcode as the outer extend or + // be the same, in which case the multiply can never result in a negative + // value and the outer extend can be folded away by doing wider + // extends for the operands of the mul. 
+ if (Ext0 && Ext1 && + (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) && Ext0->getOpcode() == Ext1->getOpcode() && IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) { auto *NewExt0 = new VPWidenCastRecipe( diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll index 964a257ef352f..fafa82c211dc6 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -2800,6 +2800,88 @@ exit: ret i64 %r.0.lcssa } +define i32 @reduction_expression_ext_mulacc_livein(ptr %a, i16 %c) { +; CHECK-LABEL: define i32 @reduction_expression_ext_mulacc_livein( +; CHECK-SAME: ptr [[A:%.*]], i16 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> [[BROADCAST_SPLAT]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP5]] = add i32 [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: 
br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK: [[FOR_EXIT]]: +; CHECK-NEXT: ret i32 [[TMP5]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @reduction_expression_ext_mulacc_livein( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[C:%.*]]) { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i16> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[WIDE_LOAD2]] to <4 x i16> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[BROADCAST_SPLAT]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[BROADCAST_SPLAT]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = call i32 
@llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) +; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add i32 [[VEC_PHI]], [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add i32 [[VEC_PHI1]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP11]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: ret i32 [[BIN_RDX]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i16 + %mul = mul i16 %c, %ext.a + %mul.ext = zext i16 %mul to i32 + %add = add i32 %mul.ext, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + declare float @llvm.fmuladd.f32(float, float, float) !6 = distinct !{!6, !7, !8} diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll index 06b044872c217..291ada86cf797 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -800,3 +800,545 @@ exit: %r.0.lcssa = phi i64 [ %rdx.next, %loop ] ret i64 %r.0.lcssa } + +define i32 
@print_mulacc_extended_const(ptr %start, ptr %end) { +; CHECK-LABEL: 'print_mulacc_extended_const' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: vp<%3> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> +; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> +; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> +; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> +; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul (ir<%l> zext to i32), (ir<63> zext to i32)) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR 
vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 +; CHECK-NEXT: IR %red.next = add i32 %red, %mul +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in ir<%1> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 +; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 +; CHECK-NEXT: IR %0 = add i64 %end1, 1 +; CHECK-NEXT: IR %1 = sub i64 %0, %start2 +; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> +; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> +; CHECK-NEXT: Successor(s): ir-bb, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> +; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> +; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> +; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = zext ir<%l> to i32 +; 
CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<63> +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb, ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb) +; CHECK-NEXT: IR %red = phi i32 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 +; CHECK-NEXT: IR %red.next = add i32 %red, %mul +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = zext i8 %l to i32 + %mul = mul i32 %l.ext, 63 + %red.next = add i32 %red, %mul + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + 
%ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} + +; Constants >= 128 cannot be treated as sign-extended, so the expression shouldn't extend 128 +define i32 @print_mulacc_not_extended_const(ptr %start, ptr %end) { +; CHECK-LABEL: 'print_mulacc_not_extended_const' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: vp<%3> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> +; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> +; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> +; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> +; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 +; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul ir<%l.ext>, ir<128>) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph 
+; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 +; CHECK-NEXT: IR %red.next = add i32 %red, %mul +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in ir<%1> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 +; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 +; CHECK-NEXT: IR %0 = add i64 %end1, 1 +; CHECK-NEXT: IR %1 = sub i64 %0, %start2 +; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> +; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> +; CHECK-NEXT: Successor(s): ir-bb, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> +; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> +; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; 
CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> +; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128> +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb, ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb) +; CHECK-NEXT: IR %red = phi i32 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 +; CHECK-NEXT: IR %red.next = add i32 %red, %mul +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, 
%loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = sext i8 %l to i32 + %mul = mul i32 %l.ext, 128 + %red.next = add i32 %red, %mul + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + %red.next.lcssa = phi i32 [ %red.next, %loop ] + ret i32 %red.next.lcssa +} + +define i64 @print_ext_mulacc_extended_const(ptr %start, ptr %end) { +; CHECK-LABEL: 'print_ext_mulacc_extended_const' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: vp<%3> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> +; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> +; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> +; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> +; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul (ir<%l> zext to i64), (ir<63> zext to i64)) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%11> = 
compute-reduction-result ir<%red>, vp<%9> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 +; CHECK-NEXT: IR %mul.ext = zext i32 %mul to i64 +; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in ir<%1> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 +; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 +; CHECK-NEXT: IR %0 = add i64 %end1, 1 +; CHECK-NEXT: IR %1 = sub i64 %0, %start2 +; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> +; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> +; CHECK-NEXT: Successor(s): ir-bb, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> +; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> +; 
CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> +; CHECK-NEXT: WIDEN-CAST vp<%4> = zext ir<%l> to i64 +; CHECK-NEXT: WIDEN ir<%mul> = mul vp<%4>, ir<63> +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%6> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb, ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%6> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%6>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb) +; CHECK-NEXT: IR %red = phi i64 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 +; CHECK-NEXT: IR %mul.ext = zext i32 %mul to i64 +; CHECK-NEXT: 
IR %red.next = add i64 %red, %mul.ext +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = zext i8 %l to i32 + %mul = mul i32 %l.ext, 63 + %mul.ext = zext i32 %mul to i64 + %red.next = add i64 %red, %mul.ext + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i64 %red.next +} + +; Constants >= 128 cannot be treated as sign-extended, so the expression shouldn't extend 128 +define i64 @print_ext_mulacc_not_extended_const(ptr %start, ptr %end) { +; CHECK-LABEL: 'print_ext_mulacc_not_extended_const' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: vp<%3> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> +; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> +; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> +; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> +; CHECK-NEXT: WIDEN-CAST 
ir<%l.ext> = sext ir<%l> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128> +; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (ir<%mul> sext to i64) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 +; CHECK-NEXT: IR %mul.ext = sext i32 %mul to i64 +; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in ir<%1> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 +; CHECK-NEXT: 
IR %end1 = ptrtoint ptr %end to i64 +; CHECK-NEXT: IR %0 = add i64 %end1, 1 +; CHECK-NEXT: IR %1 = sub i64 %0, %start2 +; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> +; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> +; CHECK-NEXT: Successor(s): ir-bb, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> +; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> +; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> +; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128> +; CHECK-NEXT: WIDEN-CAST ir<%mul.ext> = sext ir<%mul> to i64 +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul.ext>) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb, ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb ] +; 
CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb) +; CHECK-NEXT: IR %red = phi i64 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 +; CHECK-NEXT: IR %mul.ext = sext i32 %mul to i64 +; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = sext i8 %l to i32 + %mul = mul i32 %l.ext, 128 + %mul.ext = sext i32 %mul to i64 + %red.next = add i64 %red, %mul.ext + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + %red.next.lcssa = phi i64 [ %red.next, %loop ] + ret i64 %red.next.lcssa +} From a46480fb2a8351966dbbfbf1d0b309c614f7267c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 28 Oct 2025 17:28:39 +0000 Subject: [PATCH 040/539] [X86] combineTruncate - drop load alignment after (trunc (srl (load p), amt)) -> (load p + amt/8) fold (#165436) The pointer adjustment no longer guarantees any alignment Missed in #165266 and only noticed in some follow up work --- llvm/lib/Target/X86/X86ISelLowering.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b86020aa512ea..5785440a20e43 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -54679,7 +54679,8 @@ 
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, SDValue NewPtr = DAG.getMemBasePlusOffset( Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap); SDValue NewLoad = - DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getMemOperand()); + DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(), + Align(), Ld->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1), NewLoad.getValue(1)); return NewLoad; From 24235ca37feaf3316339a5ba0d26b213a57b7127 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Tue, 28 Oct 2025 17:35:36 +0000 Subject: [PATCH 041/539] [RISCV] fixup_riscv_rvc_imm may be linker relaxable (#161797) With Xqcili, `c.li` may be relaxed to `qc.e.li` (this is because `qc.e.li` is compressed into `c.li`, which needs to be undone). `qc.e.li` is relaxable, so we need to mark `c.li` as linker relaxable when it is emitted. This fixup cannot be emitted as a relocation, but we still mark it as requiring no R_RISCV_RELAX in case this changes in the future. 
--- .../RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 1 + .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 1 + llvm/test/MC/RISCV/xqcili-linker-relaxation.s | 37 +++++++++++++++++++ 3 files changed, 39 insertions(+) create mode 100644 llvm/test/MC/RISCV/xqcili-linker-relaxation.s diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 41a9c92cf99c3..96e8afca0680e 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -823,6 +823,7 @@ static bool relaxableFixupNeedsRelocation(const MCFixupKind Kind) { break; case RISCV::fixup_riscv_rvc_jump: case RISCV::fixup_riscv_rvc_branch: + case RISCV::fixup_riscv_rvc_imm: case RISCV::fixup_riscv_jal: return false; } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 6d587e6f167fc..5934c91cb4b9a 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -688,6 +688,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, // the `jal` again in the assembler. 
} else if (MIFrm == RISCVII::InstFormatCI) { FixupKind = RISCV::fixup_riscv_rvc_imm; + AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcili); } else if (MIFrm == RISCVII::InstFormatI) { FixupKind = RISCV::fixup_riscv_12_i; } else if (MIFrm == RISCVII::InstFormatQC_EB) { diff --git a/llvm/test/MC/RISCV/xqcili-linker-relaxation.s b/llvm/test/MC/RISCV/xqcili-linker-relaxation.s new file mode 100644 index 0000000000000..ace677979ee13 --- /dev/null +++ b/llvm/test/MC/RISCV/xqcili-linker-relaxation.s @@ -0,0 +1,37 @@ +# RUN: llvm-mc --triple=riscv32 -mattr=+relax,+experimental-xqcili \ +# RUN: %s -filetype=obj -o - -riscv-add-build-attributes \ +# RUN: | llvm-objdump -dr -M no-aliases - \ +# RUN: | FileCheck %s + +## This tests that we correctly emit relocations for linker relaxation when +## emitting `QC.E.LI` and `QC.LI`. + + .section .text.ex1, "ax", @progbits +# CHECK-LABEL: <.text.ex1>: + blez a1, .L1 +# CHECK-NEXT: bge zero, a1, 0x0 <.text.ex1> +# CHECK-NEXT: R_RISCV_BRANCH .L1{{$}} + qc.e.li a0, sym +# CHECK-NEXT: qc.e.li a0, 0x0 +# CHECK-NEXT: R_RISCV_VENDOR QUALCOMM{{$}} +# CHECK-NEXT: R_RISCV_CUSTOM194 sym{{$}} +# CHECK-NEXT: R_RISCV_RELAX *ABS*{{$}} +.L1: +# CHECK: <.L1>: + ret +# CHECK-NEXT: c.jr ra + + .section .text.ex2, "ax", @progbits +# CHECK-LABEL: <.text.ex2>: + blez a1, .L2 +# CHECK-NEXT: bge zero, a1, 0x0 <.text.ex2> +# CHECK-NEXT: R_RISCV_BRANCH .L2{{$}} + qc.li a0, %qc.abs20(sym) +# CHECK-NEXT: qc.li a0, 0x0 +# CHECK-NEXT: R_RISCV_VENDOR QUALCOMM{{$}} +# CHECK-NEXT: R_RISCV_CUSTOM192 sym{{$}} +# CHECK-NEXT: R_RISCV_RELAX *ABS*{{$}} +.L2: +# CHECK: <.L2>: + ret +# CHECK-NEXT: c.jr ra From 9ce6628ae0dc1e93106d7c7dc2caefe289bfa76b Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Tue, 28 Oct 2025 12:36:07 -0500 Subject: [PATCH 042/539] [bazel][mlir] Port #165429: RegionBranchOpInterface (#165447) --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git 
a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index d528daeb160cf..e8561cc39e007 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -12086,6 +12086,7 @@ cc_library( srcs = glob(["lib/Dialect/Transform/TuneExtension/*.cpp"]), hdrs = glob(["include/mlir/Dialect/Transform/TuneExtension/*.h"]), deps = [ + ":ControlFlowInterfaces", ":IR", ":TransformDialect", ":TransformDialectInterfaces", From 1763752c426e60adeab90e73a4528038a533f6a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 28 Oct 2025 07:37:59 -1000 Subject: [PATCH 043/539] [flang][cuda] Add interfaces and lowering for barrier_try_wait(_sleep) (#165316) As described in the programming guide: https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/#load-and-store-functions-using-bulk-tma-operations --- .../flang/Optimizer/Builder/IntrinsicCall.h | 2 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 60 +++++++++++++++++++ flang/module/cudadevice.f90 | 27 +++++++-- flang/test/Lower/CUDA/cuda-device-proc.cuf | 22 +++++++ 4 files changed, 105 insertions(+), 6 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index c3cd119b96174..ed0cbd3bdf16b 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -211,6 +211,8 @@ struct IntrinsicLibrary { mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef); mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef); void genBarrierInit(llvm::ArrayRef); + mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef); + mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genBesselJn(mlir::Type, 
llvm::ArrayRef); fir::ExtendedValue genBesselYn(mlir::Type, diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 39bac818fe5d0..0d225532f2460 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -50,6 +50,7 @@ #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -358,6 +359,14 @@ static constexpr IntrinsicHandler handlers[]{ &I::genBarrierInit, {{{"barrier", asAddr}, {"count", asValue}}}, /*isElemental=*/false}, + {"barrier_try_wait", + &I::genBarrierTryWait, + {{{"barrier", asAddr}, {"token", asValue}}}, + /*isElemental=*/false}, + {"barrier_try_wait_sleep", + &I::genBarrierTryWaitSleep, + {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}}, + /*isElemental=*/false}, {"bessel_jn", &I::genBesselJn, {{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}}, @@ -3282,6 +3291,57 @@ void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef args) { mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); } +// BARRIER_TRY_WAIT (CUDA) +mlir::Value +IntrinsicLibrary::genBarrierTryWait(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); + fir::StoreOp::create(builder, loc, zero, res); + mlir::Value ns = + builder.createIntegerConstant(loc, builder.getI32Type(), 1000000); + mlir::Value load = fir::LoadOp::create(builder, loc, res); + auto whileOp = mlir::scf::WhileOp::create( + builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load}); + mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore()); + mlir::Value beforeArg = 
beforeBlock->addArgument(resultType, loc); + builder.setInsertionPointToStart(beforeBlock); + mlir::Value condition = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, beforeArg, zero); + mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg); + mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter()); + afterBlock->addArgument(resultType, loc); + builder.setInsertionPointToStart(afterBlock); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); + mlir::Value ret = + mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], ns}, {}, + ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " + "selp.b32 %0, 1, 0, p;", + {}) + .getResult(0); + mlir::scf::YieldOp::create(builder, loc, ret); + builder.setInsertionPointAfter(whileOp); + return whileOp.getResult(0); +} + +// BARRIER_TRY_WAIT_SLEEP (CUDA) +mlir::Value +IntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 3); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); + return mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, + ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " + "selp.b32 %0, 1, 0, p;", + {}) + .getResult(0); +} + // BESSEL_JN fir::ExtendedValue IntrinsicLibrary::genBesselJn(mlir::Type resultType, diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index 5182950cbffea..ea54c974c9e7c 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -1998,6 +1998,18 @@ attributes(device,host) logical function on_device() bind(c) ! 
TMA Operations + interface barrier_arrive + attributes(device) function barrier_arrive(barrier) result(token) + integer(8), shared :: barrier + integer(8) :: token + end function + attributes(device) function barrier_arrive_cnt(barrier, count) result(token) + integer(8), shared :: barrier + integer(4), value :: count + integer(8) :: token + end function + end interface + interface attributes(device) subroutine barrier_init(barrier, count) integer(8), shared :: barrier @@ -2005,15 +2017,18 @@ attributes(device) subroutine barrier_init(barrier, count) end subroutine end interface - interface barrier_arrive - attributes(device) function barrier_arrive(barrier) result(token) + interface + attributes(device) integer function barrier_try_wait(barrier, token) integer(8), shared :: barrier - integer(8) :: token + integer(8), value :: token end function - attributes(device) function barrier_arrive_cnt(barrier, count) result(token) + end interface + + interface + attributes(device) integer function barrier_try_wait_sleep(barrier, token, ns) integer(8), shared :: barrier - integer(4), value :: count - integer(8) :: token + integer(8), value :: token + integer(4), value :: ns end function end interface diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 5c4c3c6d39820..99b1a2fc0cbf7 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -492,3 +492,25 @@ end subroutine ! CHECK: %[[CASTED_CMP_XCHG_EV:.*]] = fir.convert %[[CMP_XCHG_EV]] : (i1) -> i32 ! CHECK: %{{.*}} = arith.constant 1 : i32 ! CHECK: %19 = arith.cmpi eq, %[[CASTED_CMP_XCHG_EV]], %{{.*}} : i32 + +attributes(global) subroutine test_barrier_try_wait() + integer :: istat + integer(8), shared :: barrier1 + integer(8) :: token + istat = barrier_try_wait(barrier1, token) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_barrier_try_wait() +! CHECK: scf.while +! 
CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %{{.*}}, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %c1000000{{.*}} : !llvm.ptr, i64, i32) -> i32 + +attributes(global) subroutine test_barrier_try_wait_sleep() + integer :: istat + integer(8), shared :: barrier1 + integer(8) :: token + integer(4) :: sleep_time + istat = barrier_try_wait_sleep(barrier1, token, sleep_time) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_barrier_try_wait_sleep() +! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %0, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 From 62216ef2c3e8f574c489bca658a96aa9f8e5b4fc Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Tue, 28 Oct 2025 12:45:04 -0500 Subject: [PATCH 044/539] [flang][OpenMP] Implement OpenMP stylized expressions (#165049) Consider OpenMP stylized expression to be a template to be instantiated with a series of types listed on the containing directive (currently DECLARE_REDUCTION). Create a series of instantiations in the parser, allowing OpenMP special variables to be declared separately for each type. 
--------- Co-authored-by: Tom Eccles --- flang/include/flang/Parser/dump-parse-tree.h | 6 +- flang/include/flang/Parser/openmp-utils.h | 22 ++ flang/include/flang/Parser/parse-tree.h | 65 ++++- flang/include/flang/Semantics/symbol.h | 2 + flang/lib/Parser/openmp-parsers.cpp | 244 ++++++++++++++++-- flang/lib/Parser/openmp-utils.cpp | 12 + flang/lib/Parser/parse-tree.cpp | 27 ++ flang/lib/Parser/unparse.cpp | 37 +-- flang/lib/Semantics/resolve-directives.cpp | 17 ++ flang/lib/Semantics/resolve-names.cpp | 99 +++---- .../Parser/OpenMP/declare-reduction-multi.f90 | 136 +++++++++- .../OpenMP/declare-reduction-operator.f90 | 110 +++++++- ...declare-reduction-unparse-with-symbols.f90 | 2 +- .../OpenMP/declare-reduction-unparse.f90 | 57 +++- .../Parser/OpenMP/metadirective-dirspec.f90 | 55 ++-- .../OpenMP/openmp6-directive-spellings.f90 | 35 +-- .../OpenMP/declare-reduction-error.f90 | 11 - .../OpenMP/declare-reduction-functions.f90 | 52 ++-- .../OpenMP/declare-reduction-logical.f90 | 7 +- .../OpenMP/declare-reduction-modfile.f90 | 12 +- .../OpenMP/declare-reduction-operator.f90 | 6 +- .../OpenMP/declare-reduction-operators.f90 | 7 +- .../OpenMP/declare-reduction-renamedop.f90 | 9 +- .../Semantics/OpenMP/declare-reduction.f90 | 16 +- 24 files changed, 812 insertions(+), 234 deletions(-) delete mode 100644 flang/test/Semantics/OpenMP/declare-reduction-error.f90 diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index 553cbd52cb3fd..bb970691c85c9 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -599,7 +599,7 @@ class ParseTreeDumper { NODE(parser, OmpInitClause) NODE(OmpInitClause, Modifier) NODE(parser, OmpInitializerClause) - NODE(parser, OmpInitializerProc) + NODE(parser, OmpInitializerExpression) NODE(parser, OmpInReductionClause) NODE(OmpInReductionClause, Modifier) NODE(parser, OmpInteropPreference) @@ -677,6 +677,10 @@ class ParseTreeDumper { 
NODE_ENUM(OmpSeverityClause, Severity) NODE(parser, OmpStepComplexModifier) NODE(parser, OmpStepSimpleModifier) + NODE(parser, OmpStylizedDeclaration) + NODE(parser, OmpStylizedExpression) + NODE(parser, OmpStylizedInstance) + NODE(OmpStylizedInstance, Instance) NODE(parser, OmpTaskDependenceType) NODE_ENUM(OmpTaskDependenceType, Value) NODE(parser, OmpTaskReductionClause) diff --git a/flang/include/flang/Parser/openmp-utils.h b/flang/include/flang/Parser/openmp-utils.h index f761332c9cfd7..49db091af93a7 100644 --- a/flang/include/flang/Parser/openmp-utils.h +++ b/flang/include/flang/Parser/openmp-utils.h @@ -25,6 +25,13 @@ namespace Fortran::parser::omp { +template constexpr auto addr_if(std::optional &x) { + return x ? &*x : nullptr; +} +template constexpr auto addr_if(const std::optional &x) { + return x ? &*x : nullptr; +} + namespace detail { using D = llvm::omp::Directive; @@ -133,9 +140,24 @@ template OmpDirectiveName GetOmpDirectiveName(const T &x) { } const OmpObjectList *GetOmpObjectList(const OmpClause &clause); + +template +const T *GetFirstArgument(const OmpDirectiveSpecification &spec) { + for (const OmpArgument &arg : spec.Arguments().v) { + if (auto *t{std::get_if(&arg.u)}) { + return t; + } + } + return nullptr; +} + const BlockConstruct *GetFortranBlockConstruct( const ExecutionPartConstruct &epc); +const OmpCombinerExpression *GetCombinerExpr( + const OmpReductionSpecifier &rspec); +const OmpInitializerExpression *GetInitializerExpr(const OmpClause &init); + } // namespace Fortran::parser::omp #endif // FORTRAN_PARSER_OPENMP_UTILS_H diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 2cf6faead479d..c3a8c2eab15f2 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -24,7 +24,9 @@ #include "provenance.h" #include "flang/Common/idioms.h" #include "flang/Common/indirection.h" +#include "flang/Common/reference.h" #include "flang/Support/Fortran.h" 
+#include "llvm/ADT/ArrayRef.h" #include "llvm/Frontend/OpenACC/ACC.h.inc" #include "llvm/Frontend/OpenMP/OMP.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" @@ -3510,6 +3512,8 @@ struct OmpDirectiveName { // type-name list item struct OmpTypeName { + CharBlock source; + mutable const semantics::DeclTypeSpec *declTypeSpec{nullptr}; UNION_CLASS_BOILERPLATE(OmpTypeName); std::variant u; }; @@ -3538,6 +3542,39 @@ struct OmpObjectList { WRAPPER_CLASS_BOILERPLATE(OmpObjectList, std::list); }; +struct OmpStylizedDeclaration { + COPY_AND_ASSIGN_BOILERPLATE(OmpStylizedDeclaration); + // Since "Reference" isn't handled by parse-tree-visitor, add EmptyTrait, + // and visit the members by hand when needed. + using EmptyTrait = std::true_type; + common::Reference type; + EntityDecl var; +}; + +struct OmpStylizedInstance { + struct Instance { + UNION_CLASS_BOILERPLATE(Instance); + std::variant> u; + }; + TUPLE_CLASS_BOILERPLATE(OmpStylizedInstance); + std::tuple, Instance> t; +}; + +class ParseState; + +// Ref: [5.2:76], [6.0:185] +// +struct OmpStylizedExpression { + CharBlock source; + // Pointer to a temporary copy of the ParseState that is used to create + // additional parse subtrees for the stylized expression. This is only + // used internally during parsing and conveys no information to the + // consumers of the AST. 
+ const ParseState *state{nullptr}; + WRAPPER_CLASS_BOILERPLATE( + OmpStylizedExpression, std::list); +}; + // Ref: [4.5:201-207], [5.0:293-299], [5.1:325-331], [5.2:124] // // reduction-identifier -> @@ -3555,9 +3592,22 @@ struct OmpReductionIdentifier { // combiner-expression -> // since 4.5 // assignment-statement | // function-reference -struct OmpCombinerExpression { - UNION_CLASS_BOILERPLATE(OmpCombinerExpression); - std::variant u; +struct OmpCombinerExpression : public OmpStylizedExpression { + INHERITED_WRAPPER_CLASS_BOILERPLATE( + OmpCombinerExpression, OmpStylizedExpression); + static llvm::ArrayRef Variables(); +}; + +// Ref: [4.5:222:7-8], [5.0:305:28-29], [5.1:337:20-21], [5.2:127:6-8], +// [6.0:242:3-5] +// +// initializer-expression -> // since 4.5 +// OMP_PRIV = expression | +// subroutine-name(argument-list) +struct OmpInitializerExpression : public OmpStylizedExpression { + INHERITED_WRAPPER_CLASS_BOILERPLATE( + OmpInitializerExpression, OmpStylizedExpression); + static llvm::ArrayRef Variables(); }; inline namespace arguments { @@ -4558,16 +4608,9 @@ struct OmpInReductionClause { std::tuple t; }; -// declare-reduction -> DECLARE REDUCTION (reduction-identifier : type-list -// : combiner) [initializer-clause] -struct OmpInitializerProc { - TUPLE_CLASS_BOILERPLATE(OmpInitializerProc); - std::tuple> t; -}; // Initialization for declare reduction construct struct OmpInitializerClause { - UNION_CLASS_BOILERPLATE(OmpInitializerClause); - std::variant u; + WRAPPER_CLASS_BOILERPLATE(OmpInitializerClause, OmpInitializerExpression); }; // Ref: [4.5:199-201], [5.0:288-290], [5.1:321-322], [5.2:115-117] diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h index 04a063957082a..cb27d544ed9f5 100644 --- a/flang/include/flang/Semantics/symbol.h +++ b/flang/include/flang/Semantics/symbol.h @@ -830,6 +830,8 @@ class Symbol { OmpUseDevicePtr, OmpUseDeviceAddr, OmpIsDevicePtr, OmpHasDeviceAddr, // OpenMP data-copying 
attribute OmpCopyIn, OmpCopyPrivate, + // OpenMP special variables + OmpInVar, OmpOrigVar, OmpOutVar, OmpPrivVar, // OpenMP miscellaneous flags OmpCommonBlock, OmpReduction, OmpInReduction, OmpAligned, OmpNontemporal, OmpAllocate, OmpDeclarativeAllocateDirective, diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index d1e081cfd1b41..4159d2e41b78c 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -275,6 +275,13 @@ struct SpecificModifierParser { // --- Iterator helpers ----------------------------------------------- +static EntityDecl MakeEntityDecl(ObjectName &&name) { + return EntityDecl( + /*ObjectName=*/std::move(name), std::optional{}, + std::optional{}, std::optional{}, + std::optional{}); +} + // [5.0:47:17-18] In an iterator-specifier, if the iterator-type is not // specified then the type of that iterator is default integer. // [5.0:49:14] The iterator-type must be an integer type. @@ -282,11 +289,7 @@ static std::list makeEntityList(std::list &&names) { std::list entities; for (auto iter = names.begin(), end = names.end(); iter != end; ++iter) { - EntityDecl entityDecl( - /*ObjectName=*/std::move(*iter), std::optional{}, - std::optional{}, std::optional{}, - std::optional{}); - entities.push_back(std::move(entityDecl)); + entities.push_back(MakeEntityDecl(std::move(*iter))); } return entities; } @@ -306,6 +309,217 @@ static TypeDeclarationStmt makeIterSpecDecl(std::list &&names) { makeEntityList(std::move(names))); } +// --- Stylized expression handling ----------------------------------- + +// OpenMP has a concept of am "OpenMP stylized expression". 
Syntactially +// it looks like a typical Fortran expression (or statement), except: +// - the only variables allowed in it are OpenMP special variables, the +// exact set of these variables depends on the specific case of the +// stylized expression +// - the special OpenMP variables present may assume one or more types, +// and the expression should be semantically valid for each type. +// +// The stylized expression can be thought of as a template, which will be +// instantiated for each type provided somewhere in the context in which +// the stylized expression appears. +// +// AST nodes: +// - OmpStylizedExpression: contains the source string for the expression, +// plus the list of instances (OmpStylizedInstance). +// - OmpStylizedInstance: corresponds to the instantiation of the stylized +// expression for a specific type. The way that the type is specified is +// by creating declarations (OmpStylizedDeclaration) for the special +// variables. Together with the AST tree corresponding to the stylized +// expression the instantiation has enough information for semantic +// analysis. Each instance has its own scope, and the special variables +// have their own Symbol's (local to the scope). +// - OmpStylizedDeclaration: encapsulates the information that the visitors +// in resolve-names can use to "emulate" a declaration for a special +// variable and allow name resolution in the instantiation AST to work. +// +// Implementation specifics: +// The semantic analysis stores "evaluate::Expr" in each AST node rooted +// in parser::Expr (in the typedExpr member). The evaluate::Expr is specific +// to a given type, and so to allow different types for a given expression, +// for each type a separate copy of the parser::Expr subtree is created. +// Normally, AST nodes are non-copyable (copy-ctor is deleted), so to create +// several copies of a subtree, the same source string is parsed several +// times. 
The ParseState member in OmpStylizedExpression is the parser state +// immediately before the stylized expression. +// +// Initially, when OmpStylizedExpression is first created, the expression is +// parsed as if it was an actual code, but this parsing is only done to +// establish where the stylized expression ends (in the source). The source +// and the initial parser state are stored in the object, and the instance +// list is empty. +// Once the parsing of the containing OmpDirectiveSpecification completes, +// a post-processing "parser" (OmpStylizedInstanceCreator) executes. This +// post-processor examines the directive specification to see if it expects +// any stylized expressions to be contained in it, and then instantiates +// them for each such directive. + +template struct NeverParser { + using resultType = A; + std::optional Parse(ParseState &state) const { + // Always fail, but without any messages. + return std::nullopt; + } +}; + +template constexpr auto never() { return NeverParser{}; } + +// Parser for optional which always succeeds and returns std::nullptr. +// It's only needed to produce "std::optional" in +// CallStmt. +template struct NullParser; +template struct NullParser> { + using resultType = std::optional; + std::optional Parse(ParseState &) const { + return resultType{std::nullopt}; + } +}; + +template constexpr auto null() { return NullParser{}; } + +// OmpStylizedDeclaration and OmpStylizedInstance are helper classes, and +// don't correspond to anything in the source. Their parsers should still +// exist, but they should never be executed. 
+TYPE_PARSER(construct(never())) +TYPE_PARSER(construct(never())) + +TYPE_PARSER( // + construct(Parser{}) || + construct( + sourced(construct(Parser{}, + null>(), + parenthesized(optionalList(actualArgSpec))))) || + construct(indirect(expr))) + +struct OmpStylizedExpressionParser { + using resultType = OmpStylizedExpression; + + std::optional Parse(ParseState &state) const { + auto *saved{new ParseState(state)}; + auto getSource{verbatim(Parser{} >> ok)}; + if (auto &&ok{getSource.Parse(state)}) { + OmpStylizedExpression result{std::list{}}; + result.source = ok->source; + result.state = saved; + // result.v remains empty + return std::move(result); + } + delete saved; + return std::nullopt; + } +}; + +static void Instantiate(OmpStylizedExpression &ose, + llvm::ArrayRef types, llvm::ArrayRef vars) { + // 1. For each var in the vars list, declare it with the corresponding + // type from types. + // 2. Run the parser to get the AST for the stylized expression. + // 3. Create OmpStylizedInstance and append it to the list in ose. + assert(types.size() == vars.size() && "List size mismatch"); + // A ParseState object is irreversibly modified during parsing (in + // particular, it cannot be rewound to an earlier position in the source). + // Because of that we need to create a local copy for each instantiation. + // If rewinding was possible, we could just use the current one, and we + // wouldn't need to save it in the AST node. 
+ ParseState state{DEREF(ose.state)}; + + std::list decls; + for (auto [type, var] : llvm::zip_equal(types, vars)) { + decls.emplace_back(OmpStylizedDeclaration{ + common::Reference(*type), MakeEntityDecl(Name{var})}); + } + + if (auto &&instance{Parser{}.Parse(state)}) { + ose.v.emplace_back( + OmpStylizedInstance{std::move(decls), std::move(*instance)}); + } +} + +static void InstantiateForTypes(OmpStylizedExpression &ose, + const OmpTypeNameList &typeNames, llvm::ArrayRef vars) { + // For each type in the type list, declare all variables in vars with + // that type, and complete the instantiation. + for (const OmpTypeName &t : typeNames.v) { + std::vector types(vars.size(), &t); + Instantiate(ose, types, vars); + } +} + +static void InstantiateDeclareReduction(OmpDirectiveSpecification &spec) { + // There can be arguments/clauses that don't make sense, that analysis + // is left until semantic checks. Tolerate any unexpected stuff. + auto *rspec{GetFirstArgument(spec)}; + if (!rspec) { + return; + } + + const OmpTypeNameList *typeNames{nullptr}; + + if (auto *cexpr{ + const_cast(GetCombinerExpr(*rspec))}) { + typeNames = &std::get(rspec->t); + + InstantiateForTypes(*cexpr, *typeNames, OmpCombinerExpression::Variables()); + delete cexpr->state; + cexpr->state = nullptr; + } else { + // If there are no types, there is nothing else to do. 
+ return; + } + + for (const OmpClause &clause : spec.Clauses().v) { + llvm::omp::Clause id{clause.Id()}; + if (id == llvm::omp::Clause::OMPC_initializer) { + if (auto *iexpr{const_cast( + GetInitializerExpr(clause))}) { + InstantiateForTypes( + *iexpr, *typeNames, OmpInitializerExpression::Variables()); + delete iexpr->state; + iexpr->state = nullptr; + } + } + } +} + +static void InstantiateStylizedDirective(OmpDirectiveSpecification &spec) { + const OmpDirectiveName &dirName{spec.DirName()}; + if (dirName.v == llvm::omp::Directive::OMPD_declare_reduction) { + InstantiateDeclareReduction(spec); + } +} + +template >> +struct OmpStylizedInstanceCreator { + using resultType = OmpDirectiveSpecification; + constexpr OmpStylizedInstanceCreator(P p) : parser_(p) {} + + std::optional Parse(ParseState &state) const { + if (auto &&spec{parser_.Parse(state)}) { + InstantiateStylizedDirective(*spec); + return std::move(spec); + } + return std::nullopt; + } + +private: + const P parser_; +}; + +template +OmpStylizedInstanceCreator(P) -> OmpStylizedInstanceCreator

; + +// --- Parsers for types ---------------------------------------------- + +TYPE_PARSER( // + sourced(construct(Parser{})) || + sourced(construct(Parser{}))) + // --- Parsers for arguments ------------------------------------------ // At the moment these are only directive arguments. This is needed for @@ -366,10 +580,6 @@ struct OmpArgumentListParser { } }; -TYPE_PARSER( // - construct(Parser{}) || - construct(Parser{})) - // 2.15.3.6 REDUCTION (reduction-identifier: variable-name-list) TYPE_PARSER(construct(Parser{}) || construct(Parser{})) @@ -1065,7 +1275,8 @@ TYPE_PARSER(construct( TYPE_PARSER(construct( maybe(nonemptyList(Parser{}) / ":"), - maybe(indirect(Parser{})))) + maybe(indirect( + OmpStylizedInstanceCreator(Parser{}))))) // OMP 5.2 12.6.1 grainsize([ prescriptiveness :] scalar-integer-expression) TYPE_PARSER(construct( @@ -1777,12 +1988,7 @@ TYPE_PARSER( Parser{})) / endOfLine) -TYPE_PARSER(construct(Parser{}, - parenthesized(many(maybe(","_tok) >> Parser{})))) - -TYPE_PARSER(construct( - construct(assignmentStmt) || - construct(Parser{}))) +TYPE_PARSER(construct(Parser{})) // OpenMP 5.2: 7.5.4 Declare Variant directive TYPE_PARSER(sourced(construct( @@ -1794,7 +2000,7 @@ TYPE_PARSER(sourced(construct( TYPE_PARSER(sourced(construct( predicated(Parser{}, IsDirective(llvm::omp::Directive::OMPD_declare_reduction)) >= - Parser{}))) + OmpStylizedInstanceCreator(Parser{})))) // 2.10.6 Declare Target Construct TYPE_PARSER(sourced(construct( @@ -1832,8 +2038,8 @@ TYPE_PARSER(sourced(construct( IsDirective(llvm::omp::Directive::OMPD_declare_mapper)) >= Parser{}))) -TYPE_PARSER(construct(Parser{}) || - construct(Parser{})) +TYPE_PARSER(construct(OmpStylizedExpressionParser{})) +TYPE_PARSER(construct(OmpStylizedExpressionParser{})) TYPE_PARSER(sourced(construct( OmpBlockConstructParser{llvm::omp::Directive::OMPD_critical}))) diff --git a/flang/lib/Parser/openmp-utils.cpp b/flang/lib/Parser/openmp-utils.cpp index 937a17f29f221..95ad3f60770f5 100644 --- 
a/flang/lib/Parser/openmp-utils.cpp +++ b/flang/lib/Parser/openmp-utils.cpp @@ -74,4 +74,16 @@ const BlockConstruct *GetFortranBlockConstruct( return nullptr; } +const OmpCombinerExpression *GetCombinerExpr( + const OmpReductionSpecifier &rspec) { + return addr_if(std::get>(rspec.t)); +} + +const OmpInitializerExpression *GetInitializerExpr(const OmpClause &init) { + if (auto *wrapped{std::get_if(&init.u)}) { + return &wrapped->v.v; + } + return nullptr; +} + } // namespace Fortran::parser::omp diff --git a/flang/lib/Parser/parse-tree.cpp b/flang/lib/Parser/parse-tree.cpp index 8cbaa399c4763..ad0016e1404f9 100644 --- a/flang/lib/Parser/parse-tree.cpp +++ b/flang/lib/Parser/parse-tree.cpp @@ -11,6 +11,7 @@ #include "flang/Common/indirection.h" #include "flang/Parser/tools.h" #include "flang/Parser/user-state.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/Frontend/OpenMP/OMP.h" #include "llvm/Support/raw_ostream.h" #include @@ -430,4 +431,30 @@ const OmpClauseList &OmpDirectiveSpecification::Clauses() const { } return empty; } + +static bool InitCharBlocksFromStrings(llvm::MutableArrayRef blocks, + llvm::ArrayRef strings) { + for (auto [i, n] : llvm::enumerate(strings)) { + blocks[i] = CharBlock(n); + } + return true; +} + +// The names should have static storage duration. Keep these names +// in a sigle place. 
+llvm::ArrayRef OmpCombinerExpression::Variables() { + static std::string names[]{"omp_in", "omp_out"}; + static CharBlock vars[std::size(names)]; + + [[maybe_unused]] static bool init = InitCharBlocksFromStrings(vars, names); + return vars; +} + +llvm::ArrayRef OmpInitializerExpression::Variables() { + static std::string names[]{"omp_orig", "omp_priv"}; + static CharBlock vars[std::size(names)]; + + [[maybe_unused]] static bool init = InitCharBlocksFromStrings(vars, names); + return vars; +} } // namespace Fortran::parser diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 20a8d2abd8ca0..9b38cfc40c5b2 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2095,15 +2095,13 @@ class UnparseVisitor { // OpenMP Clauses & Directives void Unparse(const OmpArgumentList &x) { Walk(x.v, ", "); } + void Unparse(const OmpTypeNameList &x) { Walk(x.v, ", "); } void Unparse(const OmpBaseVariantNames &x) { Walk(std::get<0>(x.t)); // OmpObject Put(":"); Walk(std::get<1>(x.t)); // OmpObject } - void Unparse(const OmpTypeNameList &x) { // - Walk(x.v, ","); - } void Unparse(const OmpMapperSpecifier &x) { const auto &mapperName{std::get(x.t)}; if (mapperName.find(llvm::omp::OmpDefaultMapperName) == std::string::npos) { @@ -2202,6 +2200,15 @@ class UnparseVisitor { unsigned ompVersion{langOpts_.OpenMPVersion}; Word(llvm::omp::getOpenMPDirectiveName(x.v, ompVersion)); } + void Unparse(const OmpStylizedDeclaration &x) { + // empty + } + void Unparse(const OmpStylizedExpression &x) { // + Put(x.source.ToString()); + } + void Unparse(const OmpStylizedInstance &x) { + // empty + } void Unparse(const OmpIteratorSpecifier &x) { Walk(std::get(x.t)); Put(" = "); @@ -2511,29 +2518,11 @@ class UnparseVisitor { void Unparse(const OpenMPCriticalConstruct &x) { Unparse(static_cast(x)); } - void Unparse(const OmpInitializerProc &x) { - Walk(std::get(x.t)); - Put("("); - Walk(std::get>(x.t)); - Put(")"); - } - void Unparse(const 
OmpInitializerClause &x) { - // Don't let the visitor go to the normal AssignmentStmt Unparse function, - // it adds an extra newline that we don't want. - if (const auto *assignment{std::get_if(&x.u)}) { - Walk(assignment->t, " = "); - } else { - Walk(x.u); - } + void Unparse(const OmpInitializerExpression &x) { + Unparse(static_cast(x)); } void Unparse(const OmpCombinerExpression &x) { - // Don't let the visitor go to the normal AssignmentStmt Unparse function, - // it adds an extra newline that we don't want. - if (const auto *assignment{std::get_if(&x.u)}) { - Walk(assignment->t, " = "); - } else { - Walk(x.u); - } + Unparse(static_cast(x)); } void Unparse(const OpenMPDeclareReductionConstruct &x) { BeginOpenMP(); diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 196755e2912a8..628068f9a9f68 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -26,6 +26,8 @@ #include "flang/Semantics/symbol.h" #include "flang/Semantics/tools.h" #include "flang/Support/Flags.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Frontend/OpenMP/OMP.h.inc" #include "llvm/Support/Debug.h" #include @@ -453,6 +455,21 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { return true; } + bool Pre(const parser::OmpStylizedDeclaration &x) { + static llvm::StringMap map{ + {"omp_in", Symbol::Flag::OmpInVar}, + {"omp_orig", Symbol::Flag::OmpOrigVar}, + {"omp_out", Symbol::Flag::OmpOutVar}, + {"omp_priv", Symbol::Flag::OmpPrivVar}, + }; + if (auto &name{std::get(x.var.t)}; name.symbol) { + if (auto found{map.find(name.ToString())}; found != map.end()) { + ResolveOmp(name, found->second, + const_cast(DEREF(name.symbol).owner())); + } + } + return false; + } bool Pre(const parser::OmpMetadirectiveDirective &x) { PushContext(x.v.source, llvm::omp::Directive::OMPD_metadirective); return true; diff --git a/flang/lib/Semantics/resolve-names.cpp 
b/flang/lib/Semantics/resolve-names.cpp index 93faba7873916..0e6d4c71b30de 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1605,6 +1605,12 @@ class OmpVisitor : public virtual DeclarationVisitor { Post(static_cast(x)); } + void Post(const parser::OmpTypeName &); + bool Pre(const parser::OmpStylizedDeclaration &); + void Post(const parser::OmpStylizedDeclaration &); + bool Pre(const parser::OmpStylizedInstance &); + void Post(const parser::OmpStylizedInstance &); + bool Pre(const parser::OpenMPDeclareMapperConstruct &x) { AddOmpSourceRange(x.source); return true; @@ -1615,18 +1621,6 @@ class OmpVisitor : public virtual DeclarationVisitor { return true; } - bool Pre(const parser::OmpInitializerProc &x) { - auto &procDes = std::get(x.t); - auto &name = std::get(procDes.u); - auto *symbol{FindSymbol(NonDerivedTypeScope(), name)}; - if (!symbol) { - context().Say(name.source, - "Implicit subroutine declaration '%s' in DECLARE REDUCTION"_err_en_US, - name.source); - } - return true; - } - bool Pre(const parser::OmpDeclareVariantDirective &x) { AddOmpSourceRange(x.source); return true; @@ -1772,14 +1766,6 @@ class OmpVisitor : public virtual DeclarationVisitor { messageHandler().set_currStmtSource(std::nullopt); } - bool Pre(const parser::OmpTypeName &x) { - BeginDeclTypeSpec(); - return true; - } - void Post(const parser::OmpTypeName &x) { // - EndDeclTypeSpec(); - } - bool Pre(const parser::OpenMPConstruct &x) { // Indicate that the current directive is not a declarative one. 
declaratives_.push_back(nullptr); @@ -1835,6 +1821,30 @@ void OmpVisitor::Post(const parser::OmpBlockConstruct &x) { } } +void OmpVisitor::Post(const parser::OmpTypeName &x) { + x.declTypeSpec = GetDeclTypeSpec(); +} + +bool OmpVisitor::Pre(const parser::OmpStylizedDeclaration &x) { + BeginDecl(); + Walk(x.type.get()); + Walk(x.var); + return true; +} + +void OmpVisitor::Post(const parser::OmpStylizedDeclaration &x) { // + EndDecl(); +} + +bool OmpVisitor::Pre(const parser::OmpStylizedInstance &x) { + PushScope(Scope::Kind::OtherConstruct, nullptr); + return true; +} + +void OmpVisitor::Post(const parser::OmpStylizedInstance &x) { // + PopScope(); +} + bool OmpVisitor::Pre(const parser::OmpMapClause &x) { auto &mods{OmpGetModifiers(x)}; if (auto *mapper{OmpGetUniqueModifier(mods)}) { @@ -1969,51 +1979,20 @@ void OmpVisitor::ProcessReductionSpecifier( } } - auto &typeList{std::get(spec.t)}; - - // Create a temporary variable declaration for the four variables - // used in the reduction specifier and initializer (omp_out, omp_in, - // omp_priv and omp_orig), with the type in the typeList. - // - // In theory it would be possible to create only variables that are - // actually used, but that requires walking the entire parse-tree of the - // expressions, and finding the relevant variables [there may well be other - // variables involved too]. - // - // This allows doing semantic analysis where the type is a derived type - // e.g omp_out%x = omp_out%x + omp_in%x. - // - // These need to be temporary (in their own scope). If they are created - // as variables in the outer scope, if there's more than one type in the - // typelist, duplicate symbols will be reported. - const parser::CharBlock ompVarNames[]{ - {"omp_in", 6}, {"omp_out", 7}, {"omp_priv", 8}, {"omp_orig", 8}}; - - for (auto &t : typeList.v) { - PushScope(Scope::Kind::OtherConstruct, nullptr); - BeginDeclTypeSpec(); - // We need to walk t.u because Walk(t) does it's own BeginDeclTypeSpec. 
- Walk(t.u); + reductionDetails->AddDecl(declaratives_.back()); - // Only process types we can find. There will be an error later on when - // a type isn't found. - if (const DeclTypeSpec *typeSpec{GetDeclTypeSpec()}) { - reductionDetails->AddType(*typeSpec); + // Do not walk OmpTypeNameList. The types on the list will be visited + // during procesing of OmpCombinerExpression. + Walk(std::get>(spec.t)); + Walk(clauses); - for (auto &nm : ompVarNames) { - ObjectEntityDetails details{}; - details.set_type(*typeSpec); - MakeSymbol(nm, Attrs{}, std::move(details)); - } + for (auto &type : std::get(spec.t).v) { + // The declTypeSpec can be null if there is some semantic error. + if (type.declTypeSpec) { + reductionDetails->AddType(*type.declTypeSpec); } - EndDeclTypeSpec(); - Walk(std::get>(spec.t)); - Walk(clauses); - PopScope(); } - reductionDetails->AddDecl(declaratives_.back()); - if (!symbol) { symbol = &MakeSymbol(mangledName, Attrs{}, std::move(*reductionDetails)); } diff --git a/flang/test/Parser/OpenMP/declare-reduction-multi.f90 b/flang/test/Parser/OpenMP/declare-reduction-multi.f90 index a682958eb9128..88566613bd412 100644 --- a/flang/test/Parser/OpenMP/declare-reduction-multi.f90 +++ b/flang/test/Parser/OpenMP/declare-reduction-multi.f90 @@ -26,7 +26,8 @@ program omp_examples type(tt) :: values(n), sum, prod, big, small !$omp declare reduction(+:tt:omp_out%r = omp_out%r + omp_in%r) initializer(omp_priv%r = 0) -!CHECK: !$OMP DECLARE REDUCTION(+:tt: omp_out%r = omp_out%r+omp_in%r) INITIALIZER(omp_priv%r = 0_4) +!CHECK: !$OMP DECLARE REDUCTION(+:tt: omp_out%r = omp_out%r + omp_in%r) INITIALIZER(om& +!CHECK-NEXT: !$OMP&p_priv%r = 0) !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction @@ -34,11 +35,39 @@ program omp_examples !PARSE-TREE: | | OmpReductionIdentifier -> DefinedOperator -> 
IntrinsicOperator = Add !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 'tt' -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out%r=omp_out%r+omp_in%r' -!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv%r=0._4' +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out%r=omp_out%r+omp_in%r' +!PARSE-TREE: | | | | Variable = 'omp_out%r' +!PARSE-TREE: | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | Name = 'r' +!PARSE-TREE: | | | | Expr = 'omp_out%r+omp_in%r' +!PARSE-TREE: | | | | | Add +!PARSE-TREE: | | | | | | Expr = 'omp_out%r' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | Name = 'r' +!PARSE-TREE: | | | | | | Expr = 'omp_in%r' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | Name = 'r' +!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv%r=0._4' +!PARSE-TREE: | | | Variable = 'omp_priv%r' +!PARSE-TREE: | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | DataRef -> Name = 'omp_priv' +!PARSE-TREE: | | | | | Name = 'r' +!PARSE-TREE: | | | Expr = '0_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | Flags = None !$omp declare reduction(*:tt:omp_out%r = omp_out%r * omp_in%r) initializer(omp_priv%r = 1) -!CHECK-NEXT: 
!$OMP DECLARE REDUCTION(*:tt: omp_out%r = omp_out%r*omp_in%r) INITIALIZER(omp_priv%r = 1_4) +!CHECK-NEXT: !$OMP DECLARE REDUCTION(*:tt: omp_out%r = omp_out%r * omp_in%r) INITIALIZER(om& +!CHECK-NEXT: !$OMP&p_priv%r = 1) !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction @@ -46,11 +75,39 @@ program omp_examples !PARSE-TREE: | | OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Multiply !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 'tt' -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out%r=omp_out%r*omp_in%r' -!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv%r=1._4' +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out%r=omp_out%r*omp_in%r' +!PARSE-TREE: | | | | Variable = 'omp_out%r' +!PARSE-TREE: | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | Name = 'r' +!PARSE-TREE: | | | | Expr = 'omp_out%r*omp_in%r' +!PARSE-TREE: | | | | | Multiply +!PARSE-TREE: | | | | | | Expr = 'omp_out%r' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | Name = 'r' +!PARSE-TREE: | | | | | | Expr = 'omp_in%r' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | Name = 'r' +!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance +!PARSE-TREE: | | 
OmpStylizedDeclaration +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv%r=1._4' +!PARSE-TREE: | | | Variable = 'omp_priv%r' +!PARSE-TREE: | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | DataRef -> Name = 'omp_priv' +!PARSE-TREE: | | | | | Name = 'r' +!PARSE-TREE: | | | Expr = '1_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '1' +!PARSE-TREE: | Flags = None !$omp declare reduction(max:tt:omp_out = mymax(omp_out, omp_in)) initializer(omp_priv%r = 0) -!CHECK-NEXT: !$OMP DECLARE REDUCTION(max:tt: omp_out = mymax(omp_out,omp_in)) INITIALIZER(omp_priv%r = 0_4) +!CHECK-NEXT: !$OMP DECLARE REDUCTION(max:tt: omp_out = mymax(omp_out, omp_in)) INITIALIZER(& +!CHECK-NEXT: !$OMP&omp_priv%r = 0) !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction @@ -58,11 +115,36 @@ program omp_examples !PARSE-TREE: | | OmpReductionIdentifier -> ProcedureDesignator -> Name = 'max' !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 'tt' -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out=mymax(omp_out,omp_in)' -!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv%r=0._4' +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out=mymax(omp_out,omp_in)' +!PARSE-TREE: | | | | Variable = 'omp_out' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | Expr = 'mymax(omp_out,omp_in)' +!PARSE-TREE: | | | | | FunctionReference -> Call +!PARSE-TREE: | | | | | | ProcedureDesignator -> Name = 'mymax' +!PARSE-TREE: | | | | | | 
ActualArgSpec +!PARSE-TREE: | | | | | | | ActualArg -> Expr = 'omp_out' +!PARSE-TREE: | | | | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | ActualArgSpec +!PARSE-TREE: | | | | | | | ActualArg -> Expr = 'omp_in' +!PARSE-TREE: | | | | | | | | Designator -> DataRef -> Name = 'omp_in' +!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv%r=0._4' +!PARSE-TREE: | | | Variable = 'omp_priv%r' +!PARSE-TREE: | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | DataRef -> Name = 'omp_priv' +!PARSE-TREE: | | | | | Name = 'r' +!PARSE-TREE: | | | Expr = '0_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | Flags = None !$omp declare reduction(min:tt:omp_out%r = min(omp_out%r, omp_in%r)) initializer(omp_priv%r = 1) -!CHECK-NEXT: !$OMP DECLARE REDUCTION(min:tt: omp_out%r = min(omp_out%r,omp_in%r)) INITIALIZER(omp_priv%r = 1_4) +!CHECK-NEXT: !$OMP DECLARE REDUCTION(min:tt: omp_out%r = min(omp_out%r, omp_in%r)) INITIALI& +!CHECK-NEXT: !$OMP&ZER(omp_priv%r = 1) !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction @@ -70,8 +152,38 @@ program omp_examples !PARSE-TREE: | | OmpReductionIdentifier -> ProcedureDesignator -> Name = 'min' !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 'tt' -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out%r=min(omp_out%r,omp_in%r)' -!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv%r=1._4' +!PARSE-TREE: | | OmpCombinerExpression -> 
OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out%r=min(omp_out%r,omp_in%r)' +!PARSE-TREE: | | | | Variable = 'omp_out%r' +!PARSE-TREE: | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | Name = 'r' +!PARSE-TREE: | | | | Expr = 'min(omp_out%r,omp_in%r)' +!PARSE-TREE: | | | | | FunctionReference -> Call +!PARSE-TREE: | | | | | | ProcedureDesignator -> Name = 'min' +!PARSE-TREE: | | | | | | ActualArgSpec +!PARSE-TREE: | | | | | | | ActualArg -> Expr = 'omp_out%r' +!PARSE-TREE: | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | | Name = 'r' +!PARSE-TREE: | | | | | | ActualArgSpec +!PARSE-TREE: | | | | | | | ActualArg -> Expr = 'omp_in%r' +!PARSE-TREE: | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | | Name = 'r' +!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv%r=1._4' +!PARSE-TREE: | | | Variable = 'omp_priv%r' +!PARSE-TREE: | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | DataRef -> Name = 'omp_priv' +!PARSE-TREE: | | | | | Name = 'r' +!PARSE-TREE: | | | Expr = '1_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '1' +!PARSE-TREE: | Flags = None call random_number(values%r) diff --git a/flang/test/Parser/OpenMP/declare-reduction-operator.f90 b/flang/test/Parser/OpenMP/declare-reduction-operator.f90 index e4d07c8265b1e..0d337c1ef42f3 100644 --- a/flang/test/Parser/OpenMP/declare-reduction-operator.f90 +++ 
b/flang/test/Parser/OpenMP/declare-reduction-operator.f90 @@ -16,7 +16,8 @@ subroutine reduce_1 ( n, tts ) type(tt) :: tts(n) type(tt2) :: tts2(n) -!CHECK: !$OMP DECLARE REDUCTION(+:tt: omp_out = tt(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)) INITIALIZER(omp_priv = tt(x=0_4,y=0_4)) +!CHECK: !$OMP DECLARE REDUCTION(+:tt: omp_out = tt(omp_out%x - omp_in%x , omp_out%y - & +!CHECK: !$OMP&omp_in%y)) INITIALIZER(omp_priv = tt(0,0)) !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction @@ -24,13 +25,60 @@ subroutine reduce_1 ( n, tts ) !PARSE-TREE: | | OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 'tt' -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out=tt(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)' -!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv=tt(x=0_4,y=0_4)' - +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out=tt(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)' +!PARSE-TREE: | | | | Variable = 'omp_out' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | Expr = 'tt(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)' +!PARSE-TREE: | | | | | StructureConstructor +!PARSE-TREE: | | | | | | DerivedTypeSpec +!PARSE-TREE: | | | | | | | Name = 'tt' +!PARSE-TREE: | | | | | | ComponentSpec +!PARSE-TREE: | | | | | | | ComponentDataSource -> Expr = 'omp_out%x-omp_in%x' +!PARSE-TREE: | | | | | | | | Subtract +!PARSE-TREE: | | | | | | | | | Expr = 'omp_out%x' +!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> 
StructureComponent +!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | | | | Expr = 'omp_in%x' +!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | ComponentSpec +!PARSE-TREE: | | | | | | | ComponentDataSource -> Expr = 'omp_out%y-omp_in%y' +!PARSE-TREE: | | | | | | | | Subtract +!PARSE-TREE: | | | | | | | | | Expr = 'omp_out%y' +!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | | | | Name = 'y' +!PARSE-TREE: | | | | | | | | | Expr = 'omp_in%y' +!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | | | | Name = 'y' +!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv=tt(x=0_4,y=0_4)' +!PARSE-TREE: | | | Variable = 'omp_priv' +!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'omp_priv' +!PARSE-TREE: | | | Expr = 'tt(x=0_4,y=0_4)' +!PARSE-TREE: | | | | StructureConstructor +!PARSE-TREE: | | | | | DerivedTypeSpec +!PARSE-TREE: | | | | | | Name = 'tt' +!PARSE-TREE: | | | | | ComponentSpec +!PARSE-TREE: | | | | | | ComponentDataSource -> Expr = '0_4' +!PARSE-TREE: | | | | | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | | | | | ComponentSpec +!PARSE-TREE: | | | | | | ComponentDataSource -> Expr = '0_4' +!PARSE-TREE: | | | | | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | Flags = None !$omp declare reduction(+ : tt : omp_out = tt(omp_out%x - 
omp_in%x , omp_out%y - omp_in%y)) initializer(omp_priv = tt(0,0)) -!CHECK: !$OMP DECLARE REDUCTION(+:tt2: omp_out = tt2(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)) INITIALIZER(omp_priv = tt2(x=0._8,y=0._8) +!CHECK: !$OMP DECLARE REDUCTION(+:tt2: omp_out = tt2(omp_out%x - omp_in%x , omp_out%y & +!CHECK: !$OMP&- omp_in%y)) INITIALIZER(omp_priv = tt2(0,0)) !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction @@ -38,9 +86,55 @@ subroutine reduce_1 ( n, tts ) !PARSE-TREE: | | OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 'tt2' -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out=tt2(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)' -!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv=tt2(x=0._8,y=0._8)' - +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out=tt2(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)' +!PARSE-TREE: | | | | Variable = 'omp_out' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | Expr = 'tt2(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)' +!PARSE-TREE: | | | | | StructureConstructor +!PARSE-TREE: | | | | | | DerivedTypeSpec +!PARSE-TREE: | | | | | | | Name = 'tt2' +!PARSE-TREE: | | | | | | ComponentSpec +!PARSE-TREE: | | | | | | | ComponentDataSource -> Expr = 'omp_out%x-omp_in%x' +!PARSE-TREE: | | | | | | | | Subtract +!PARSE-TREE: | | | | | | | | | Expr = 'omp_out%x' +!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 
'omp_out' +!PARSE-TREE: | | | | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | | | | Expr = 'omp_in%x' +!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | ComponentSpec +!PARSE-TREE: | | | | | | | ComponentDataSource -> Expr = 'omp_out%y-omp_in%y' +!PARSE-TREE: | | | | | | | | Subtract +!PARSE-TREE: | | | | | | | | | Expr = 'omp_out%y' +!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | | | | Name = 'y' +!PARSE-TREE: | | | | | | | | | Expr = 'omp_in%y' +!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | | | | Name = 'y' +!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv=tt2(x=0._8,y=0._8)' +!PARSE-TREE: | | | Variable = 'omp_priv' +!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'omp_priv' +!PARSE-TREE: | | | Expr = 'tt2(x=0._8,y=0._8)' +!PARSE-TREE: | | | | StructureConstructor +!PARSE-TREE: | | | | | DerivedTypeSpec +!PARSE-TREE: | | | | | | Name = 'tt2' +!PARSE-TREE: | | | | | ComponentSpec +!PARSE-TREE: | | | | | | ComponentDataSource -> Expr = '0_4' +!PARSE-TREE: | | | | | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | | | | | ComponentSpec +!PARSE-TREE: | | | | | | ComponentDataSource -> Expr = '0_4' +!PARSE-TREE: | | | | | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | Flags = None !$omp declare reduction(+ :tt2 : omp_out = tt2(omp_out%x - omp_in%x , omp_out%y - omp_in%y)) initializer(omp_priv = tt2(0,0)) 
type(tt) :: diffp = tt( 0, 0 ) diff --git a/flang/test/Parser/OpenMP/declare-reduction-unparse-with-symbols.f90 b/flang/test/Parser/OpenMP/declare-reduction-unparse-with-symbols.f90 index 455fc17871ad3..f026f15ddd88c 100644 --- a/flang/test/Parser/OpenMP/declare-reduction-unparse-with-symbols.f90 +++ b/flang/test/Parser/OpenMP/declare-reduction-unparse-with-symbols.f90 @@ -8,6 +8,6 @@ subroutine f00 !CHECK: !DEF: /f00 (Subroutine) Subprogram !CHECK: subroutine f00 -!CHECK: !$omp declare reduction(fred:integer,real: omp_out = omp_in+omp_out) +!CHECK: !$omp declare reduction(fred:integer, real: omp_out = omp_in + omp_out) !CHECK: end subroutine diff --git a/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 b/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 index 73d7ccf489f01..7897eb0fb46f0 100644 --- a/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 +++ b/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 @@ -19,7 +19,8 @@ subroutine initme(x,n) end subroutine initme end interface !$omp declare reduction(red_add:integer(4):omp_out=omp_out+omp_in) initializer(initme(omp_priv,0)) -!CHECK: !$OMP DECLARE REDUCTION(red_add:INTEGER(KIND=4_4): omp_out = omp_out+omp_in) INITIALIZER(initme(omp_priv, 0_4)) +!CHECK: !$OMP DECLARE REDUCTION(red_add:INTEGER(KIND=4_4): omp_out=omp_out+omp_in) INITIA& +!CHECKL !$OMP&LIZER(initme(omp_priv,0)) !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction @@ -27,9 +28,31 @@ end subroutine initme !PARSE-TREE: | | OmpReductionIdentifier -> ProcedureDesignator -> Name = 'red_add' !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> KindSelector -> Scalar -> Integer -> Constant -> Expr = '4_4' !PARSE-TREE: | | | LiteralConstant -> IntLiteralConstant = '4' -!PARSE-TREE: | | 
OmpCombinerExpression -> AssignmentStmt = 'omp_out=omp_out+omp_in' -!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerProc -!PARSE-TREE: | | ProcedureDesignator -> Name = 'initme' +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out=omp_out+omp_in' +!PARSE-TREE: | | | | Variable = 'omp_out' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | Expr = 'omp_out+omp_in' +!PARSE-TREE: | | | | | Add +!PARSE-TREE: | | | | | | Expr = 'omp_out' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | Expr = 'omp_in' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'omp_in' +!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | Instance -> CallStmt = 'CALL initme(omp_priv,0_4)' +!PARSE-TREE: | | | Call +!PARSE-TREE: | | | | ProcedureDesignator -> Name = 'initme' +!PARSE-TREE: | | | | ActualArgSpec +!PARSE-TREE: | | | | | ActualArg -> Expr = 'omp_priv' +!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'omp_priv' +!PARSE-TREE: | | | | ActualArgSpec +!PARSE-TREE: | | | | | ActualArg -> Expr = '0_4' +!PARSE-TREE: | | | | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | Flags = None res=init !$omp simd reduction(red_add:res) @@ -59,7 +82,8 @@ end function func !CHECK-LABEL: program main program main integer :: my_var -!CHECK: !$OMP DECLARE REDUCTION(my_add_red:INTEGER: omp_out = omp_out+omp_in) INITIALIZER(omp_priv = 0_4) +!CHECK: !$OMP DECLARE REDUCTION(my_add_red:INTEGER: omp_out = omp_out + omp_in) INITIA& +!CHECK: !$OMP&LIZER(omp_priv=0) !$omp declare reduction (my_add_red : integer : omp_out = 
omp_out + omp_in) initializer (omp_priv=0) my_var = 0 @@ -74,5 +98,24 @@ end program main !PARSE-TREE: | OmpArgumentList -> OmpArgument -> OmpReductionSpecifier !PARSE-TREE: | | OmpReductionIdentifier -> ProcedureDesignator -> Name = 'my_add_red' !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out=omp_out+omp_in' -!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv=0_4' +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out=omp_out+omp_in' +!PARSE-TREE: | | | | Variable = 'omp_out' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | Expr = 'omp_out+omp_in' +!PARSE-TREE: | | | | | Add +!PARSE-TREE: | | | | | | Expr = 'omp_out' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | Expr = 'omp_in' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'omp_in' +!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv=0_4' +!PARSE-TREE: | | | Variable = 'omp_priv' +!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'omp_priv' +!PARSE-TREE: | | | Expr = '0_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | Flags = None diff --git a/flang/test/Parser/OpenMP/metadirective-dirspec.f90 b/flang/test/Parser/OpenMP/metadirective-dirspec.f90 index c373001be8963..b64ceb1a98164 100644 --- a/flang/test/Parser/OpenMP/metadirective-dirspec.f90 +++ b/flang/test/Parser/OpenMP/metadirective-dirspec.f90 @@ -105,8 +105,8 @@ 
subroutine f03 !UNPARSE: TYPE :: tt2 !UNPARSE: REAL :: x !UNPARSE: END TYPE -!UNPARSE: !$OMP METADIRECTIVE WHEN(USER={CONDITION(.true._4)}: DECLARE REDUCTION(+:tt1,tt2: omp_out%x = omp_in%x+omp_out%x)& -!UNPARSE: !$OMP&) +!UNPARSE: !$OMP METADIRECTIVE WHEN(USER={CONDITION(.true._4)}: DECLARE REDUCTION(+:tt1, tt2: omp& +!UNPARSE: !$OMP&_out%x = omp_in%x + omp_out%x)) !UNPARSE: END SUBROUTINE !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpMetadirectiveDirective @@ -127,21 +127,44 @@ subroutine f03 !PARSE-TREE: | | | | | Name = 'tt1' !PARSE-TREE: | | | | OmpTypeName -> TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | | | Name = 'tt2' -!PARSE-TREE: | | | | OmpCombinerExpression -> AssignmentStmt = 'omp_out%x=omp_in%x+omp_out%x' -!PARSE-TREE: | | | | | | Designator -> DataRef -> StructureComponent -!PARSE-TREE: | | | | | | | DataRef -> Name = 'omp_out' -!PARSE-TREE: | | | | | | | Name = 'x' -!PARSE-TREE: | | | | | Expr = 'omp_in%x+omp_out%x' -!PARSE-TREE: | | | | | | Add -!PARSE-TREE: | | | | | | | Expr = 'omp_in%x' -!PARSE-TREE: | | | | | | | | Designator -> DataRef -> StructureComponent -!PARSE-TREE: | | | | | | | | | DataRef -> Name = 'omp_in' -!PARSE-TREE: | | | | | | | | | Name = 'x' -!PARSE-TREE: | | | | | | | Expr = 'omp_out%x' -!PARSE-TREE: | | | | | | | | Designator -> DataRef -> StructureComponent -!PARSE-TREE: | | | | | | | | | DataRef -> Name = 'omp_out' -!PARSE-TREE: | | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | | | OmpStylizedDeclaration +!PARSE-TREE: | | | | | OmpStylizedDeclaration +!PARSE-TREE: | | | | | Instance -> AssignmentStmt = 'omp_out%x=omp_in%x+omp_out%x' +!PARSE-TREE: | | | | | | Variable = 'omp_out%x' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | Expr = 'omp_in%x+omp_out%x' 
+!PARSE-TREE: | | | | | | | Add +!PARSE-TREE: | | | | | | | | Expr = 'omp_in%x' +!PARSE-TREE: | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | | | Expr = 'omp_out%x' +!PARSE-TREE: | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | OmpStylizedInstance +!PARSE-TREE: | | | | | OmpStylizedDeclaration +!PARSE-TREE: | | | | | OmpStylizedDeclaration +!PARSE-TREE: | | | | | Instance -> AssignmentStmt = 'omp_out%x=omp_in%x+omp_out%x' +!PARSE-TREE: | | | | | | Variable = 'omp_out%x' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | Expr = 'omp_in%x+omp_out%x' +!PARSE-TREE: | | | | | | | Add +!PARSE-TREE: | | | | | | | | Expr = 'omp_in%x' +!PARSE-TREE: | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | | | Expr = 'omp_out%x' +!PARSE-TREE: | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | | | Name = 'x' !PARSE-TREE: | | | OmpClauseList -> +!PARSE-TREE: | | | Flags = None subroutine f04 !$omp metadirective when(user={condition(.true.)}: & diff --git a/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 b/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 index 39e8f059bbb24..50a38c6494aa6 100644 --- a/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 +++ b/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 @@ -79,7 +79,7 @@ subroutine f02 !UNPARSE: TYPE :: t !UNPARSE: INTEGER :: x 
!UNPARSE: END TYPE -!UNPARSE: !$OMP DECLARE_REDUCTION(+:t: omp_out%x = omp_out%x+omp_in%x) +!UNPARSE: !$OMP DECLARE_REDUCTION(+:t: omp_out%x = omp_out%x + omp_in%x) !UNPARSE: END SUBROUTINE !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification @@ -88,21 +88,24 @@ subroutine f02 !PARSE-TREE: | | OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 't' -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out%x=omp_out%x+omp_in%x' -!PARSE-TREE: | | | Variable = 'omp_out%x' -!PARSE-TREE: | | | | Designator -> DataRef -> StructureComponent -!PARSE-TREE: | | | | | DataRef -> Name = 'omp_out' -!PARSE-TREE: | | | | | Name = 'x' -!PARSE-TREE: | | | Expr = 'omp_out%x+omp_in%x' -!PARSE-TREE: | | | | Add -!PARSE-TREE: | | | | | Expr = 'omp_out%x' -!PARSE-TREE: | | | | | | Designator -> DataRef -> StructureComponent -!PARSE-TREE: | | | | | | | DataRef -> Name = 'omp_out' -!PARSE-TREE: | | | | | | | Name = 'x' -!PARSE-TREE: | | | | | Expr = 'omp_in%x' -!PARSE-TREE: | | | | | | Designator -> DataRef -> StructureComponent -!PARSE-TREE: | | | | | | | DataRef -> Name = 'omp_in' -!PARSE-TREE: | | | | | | | Name = 'x' +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out%x=omp_out%x+omp_in%x' +!PARSE-TREE: | | | | Variable = 'omp_out%x' +!PARSE-TREE: | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | Name = 'x' +!PARSE-TREE: | | | | Expr = 'omp_out%x+omp_in%x' +!PARSE-TREE: | | | | | Add +!PARSE-TREE: | | | | | | Expr = 'omp_out%x' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | 
| | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | Expr = 'omp_in%x' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | Name = 'x' !PARSE-TREE: | OmpClauseList -> !PARSE-TREE: | Flags = None diff --git a/flang/test/Semantics/OpenMP/declare-reduction-error.f90 b/flang/test/Semantics/OpenMP/declare-reduction-error.f90 deleted file mode 100644 index 21f5cc186e037..0000000000000 --- a/flang/test/Semantics/OpenMP/declare-reduction-error.f90 +++ /dev/null @@ -1,11 +0,0 @@ -! RUN: not %flang_fc1 -emit-obj -fopenmp -fopenmp-version=50 %s 2>&1 | FileCheck %s - -subroutine initme(x,n) - integer x,n - x=n -end subroutine initme - -subroutine subr - !$omp declare reduction(red_add:integer(4):omp_out=omp_out+omp_in) initializer(initme(omp_priv,0)) - !CHECK: error: Implicit subroutine declaration 'initme' in DECLARE REDUCTION -end subroutine subr diff --git a/flang/test/Semantics/OpenMP/declare-reduction-functions.f90 b/flang/test/Semantics/OpenMP/declare-reduction-functions.f90 index 000d323f522cf..89e0771e8abff 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-functions.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-functions.f90 @@ -57,9 +57,10 @@ function functwo(x, n) !CHECK: adder: UserReductionDetails TYPE(two) !CHECK OtherConstruct scope !CHECK: omp_in size=8 offset=0: ObjectEntity type: TYPE(two) -!CHECK: omp_orig size=8 offset=8: ObjectEntity type: TYPE(two) -!CHECK: omp_out size=8 offset=16: ObjectEntity type: TYPE(two) -!CHECK: omp_priv size=8 offset=24: ObjectEntity type: TYPE(two) +!CHECK: omp_out size=8 offset=8: ObjectEntity type: TYPE(two) +!CHECK OtherConstruct scope +!CHECK: omp_orig size=8 offset=0: ObjectEntity type: TYPE(two) +!CHECK: omp_priv size=8 offset=8: ObjectEntity type: TYPE(two) !$omp simd reduction(adder:res) @@ -101,14 +102,16 @@ function functwothree(x, n) !CHECK: 
adder: UserReductionDetails TYPE(two) TYPE(three) !CHECK OtherConstruct scope !CHECK: omp_in size=8 offset=0: ObjectEntity type: TYPE(two) -!CHECK: omp_orig size=8 offset=8: ObjectEntity type: TYPE(two) -!CHECK: omp_out size=8 offset=16: ObjectEntity type: TYPE(two) -!CHECK: omp_priv size=8 offset=24: ObjectEntity type: TYPE(two) +!CHECK: omp_out size=8 offset=8: ObjectEntity type: TYPE(two) +!CHECK OtherConstruct scope +!CHECK: omp_orig size=8 offset=0: ObjectEntity type: TYPE(two) +!CHECK: omp_priv size=8 offset=8: ObjectEntity type: TYPE(two) !CHECK OtherConstruct scope !CHECK: omp_in size=24 offset=0: ObjectEntity type: TYPE(three) -!CHECK: omp_orig size=24 offset=24: ObjectEntity type: TYPE(three) -!CHECK: omp_out size=24 offset=48: ObjectEntity type: TYPE(three) -!CHECK: omp_priv size=24 offset=72: ObjectEntity type: TYPE(three) +!CHECK: omp_out size=24 offset=24: ObjectEntity type: TYPE(three) +!CHECK OtherConstruct scope +!CHECK: omp_orig size=24 offset=0: ObjectEntity type: TYPE(three) +!CHECK: omp_priv size=24 offset=24: ObjectEntity type: TYPE(three) !$omp simd reduction(adder:res3) do i=1,n @@ -135,9 +138,10 @@ function funcBtwo(x, n) !CHECK: op.+: UserReductionDetails TYPE(two) !CHECK OtherConstruct scope !CHECK: omp_in size=8 offset=0: ObjectEntity type: TYPE(two) -!CHECK: omp_orig size=8 offset=8: ObjectEntity type: TYPE(two) -!CHECK: omp_out size=8 offset=16: ObjectEntity type: TYPE(two) -!CHECK: omp_priv size=8 offset=24: ObjectEntity type: TYPE(two) +!CHECK: omp_out size=8 offset=8: ObjectEntity type: TYPE(two) +!CHECK OtherConstruct scope +!CHECK: omp_orig size=8 offset=0: ObjectEntity type: TYPE(two) +!CHECK: omp_priv size=8 offset=8: ObjectEntity type: TYPE(two) !$omp simd reduction(+:res) @@ -163,14 +167,16 @@ function funcBtwothree(x, n) !CHECK: op.+: UserReductionDetails TYPE(two) TYPE(three) !CHECK OtherConstruct scope !CHECK: omp_in size=8 offset=0: ObjectEntity type: TYPE(two) -!CHECK: omp_orig size=8 offset=8: ObjectEntity type: 
TYPE(two) -!CHECK: omp_out size=8 offset=16: ObjectEntity type: TYPE(two) -!CHECK: omp_priv size=8 offset=24: ObjectEntity type: TYPE(two) +!CHECK: omp_out size=8 offset=8: ObjectEntity type: TYPE(two) +!CHECK OtherConstruct scope +!CHECK: omp_orig size=8 offset=0: ObjectEntity type: TYPE(two) +!CHECK: omp_priv size=8 offset=8: ObjectEntity type: TYPE(two) !CHECK: OtherConstruct scope !CHECK: omp_in size=24 offset=0: ObjectEntity type: TYPE(three) -!CHECK: omp_orig size=24 offset=24: ObjectEntity type: TYPE(three) -!CHECK: omp_out size=24 offset=48: ObjectEntity type: TYPE(three) -!CHECK: omp_priv size=24 offset=72: ObjectEntity type: TYPE(three) +!CHECK: omp_out size=24 offset=24: ObjectEntity type: TYPE(three) +!CHECK OtherConstruct scope +!CHECK: omp_orig size=24 offset=0: ObjectEntity type: TYPE(three) +!CHECK: omp_priv size=24 offset=24: ObjectEntity type: TYPE(three) !$omp simd reduction(+:res3) do i=1,n @@ -183,6 +189,7 @@ function funcBtwothree(x, n) enddo res%t2 = res2 res%t3 = res3 + funcBtwothree = res end function funcBtwothree !! 
This is checking a special case, where a reduction is declared inside a @@ -191,11 +198,12 @@ end function funcBtwothree pure logical function reduction() !CHECK: reduction size=4 offset=0: ObjectEntity funcResult type: LOGICAL(4) !CHECK: rr: UserReductionDetails INTEGER(4) -!CHECK: OtherConstruct scope: size=16 alignment=4 sourceRange=0 bytes +!CHECK: OtherConstruct scope: size=8 alignment=4 sourceRange=0 bytes !CHECK: omp_in size=4 offset=0: ObjectEntity type: INTEGER(4) -!CHECK: omp_orig size=4 offset=4: ObjectEntity type: INTEGER(4) -!CHECK: omp_out size=4 offset=8: ObjectEntity type: INTEGER(4) -!CHECK: omp_priv size=4 offset=12: ObjectEntity type: INTEGER(4) +!CHECK: omp_out size=4 offset=4: ObjectEntity type: INTEGER(4) +!CHECK: OtherConstruct scope: size=8 alignment=4 sourceRange=0 bytes +!CHECK: omp_orig size=4 offset=0: ObjectEntity type: INTEGER(4) +!CHECK: omp_priv size=4 offset=4: ObjectEntity type: INTEGER(4) !$omp declare reduction (rr : integer : omp_out = omp_out + omp_in) initializer (omp_priv = 0) reduction = .false. 
end function reduction diff --git a/flang/test/Semantics/OpenMP/declare-reduction-logical.f90 b/flang/test/Semantics/OpenMP/declare-reduction-logical.f90 index 7ab7cad473ac8..87fcecdbae2a5 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-logical.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-logical.f90 @@ -18,9 +18,10 @@ function func(x, n) !CHECK: op.AND: UserReductionDetails TYPE(logicalwrapper) !CHECK OtherConstruct scope !CHECK: omp_in size=4 offset=0: ObjectEntity type: TYPE(logicalwrapper) -!CHECK: omp_orig size=4 offset=4: ObjectEntity type: TYPE(logicalwrapper) -!CHECK: omp_out size=4 offset=8: ObjectEntity type: TYPE(logicalwrapper) -!CHECK: omp_priv size=4 offset=12: ObjectEntity type: TYPE(logicalwrapper) +!CHECK: omp_out size=4 offset=4: ObjectEntity type: TYPE(logicalwrapper) +!CHECK OtherConstruct scope +!CHECK: omp_orig size=4 offset=0: ObjectEntity type: TYPE(logicalwrapper) +!CHECK: omp_priv size=4 offset=4: ObjectEntity type: TYPE(logicalwrapper) !$omp simd reduction(.AND.:res) do i=1,n diff --git a/flang/test/Semantics/OpenMP/declare-reduction-modfile.f90 b/flang/test/Semantics/OpenMP/declare-reduction-modfile.f90 index 0882de80fdcc6..763179cb52a13 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-modfile.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-modfile.f90 @@ -6,13 +6,13 @@ !type::t1 !integer(4)::val !endtype -!!$OMP DECLARE REDUCTION(*:t1:omp_out=omp_out*omp_in)INITIALIZER(omp_priv=& -!!$OMP&t1(1)) +!!$OMP DECLARE REDUCTION(*:t1: omp_out=omp_out*omp_in) INITIALIZER(omp_priv=t1(& +!!$OMP&1)) !!$OMP METADIRECTIVE OTHERWISE(DECLARE REDUCTION(+:INTEGER)) -!!$OMP DECLARE REDUCTION(.fluffy.:t1:omp_out=omp_out.fluffy.omp_in)INITIALI& -!!$OMP&ZER(omp_priv=t1(0)) -!!$OMP DECLARE REDUCTION(.mul.:t1:omp_out=omp_out.mul.omp_in)INITIALIZER(om& -!!$OMP&p_priv=t1(1)) +!!$OMP DECLARE REDUCTION(.fluffy.:t1: omp_out=omp_out.fluffy.omp_in) INITIALIZE& +!!$OMP&R(omp_priv=t1(0)) +!!$OMP DECLARE REDUCTION(.mul.:t1: 
omp_out=omp_out.mul.omp_in) INITIALIZER(omp_& +!!$OMP&priv=t1(1)) !interface operator(.mul.) !procedure::mul !end interface diff --git a/flang/test/Semantics/OpenMP/declare-reduction-operator.f90 b/flang/test/Semantics/OpenMP/declare-reduction-operator.f90 index dc12332b80baf..5fc42054882f0 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-operator.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-operator.f90 @@ -11,11 +11,9 @@ module m1 !$omp declare reduction(.fluffy.:t1:omp_out=omp_out.fluffy.omp_in) !CHECK: op.fluffy., PUBLIC: UserReductionDetails TYPE(t1) !CHECK: t1, PUBLIC: DerivedType components: val -!CHECK: OtherConstruct scope: size=16 alignment=4 sourceRange=0 bytes +!CHECK: OtherConstruct scope: size=8 alignment=4 sourceRange=0 bytes !CHECK: omp_in size=4 offset=0: ObjectEntity type: TYPE(t1) -!CHECK: omp_orig size=4 offset=4: ObjectEntity type: TYPE(t1) -!CHECK: omp_out size=4 offset=8: ObjectEntity type: TYPE(t1) -!CHECK: omp_priv size=4 offset=12: ObjectEntity type: TYPE(t1) +!CHECK: omp_out size=4 offset=4: ObjectEntity type: TYPE(t1) contains function my_mul(x, y) type (t1), intent (in) :: x, y diff --git a/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 b/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 index 84dbe1af01877..e0006bfb1fb6a 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 @@ -64,9 +64,10 @@ program test_vector !CHECK: OtherConstruct scope: !CHECK: omp_in size=12 offset=0: ObjectEntity type: TYPE(vector) -!CHECK: omp_orig size=12 offset=12: ObjectEntity type: TYPE(vector) -!CHECK: omp_out size=12 offset=24: ObjectEntity type: TYPE(vector) -!CHECK: omp_priv size=12 offset=36: ObjectEntity type: TYPE(vector) +!CHECK: omp_out size=12 offset=12: ObjectEntity type: TYPE(vector) +!CHECK: OtherConstruct scope: +!CHECK: omp_orig size=12 offset=0: ObjectEntity type: TYPE(vector) +!CHECK: omp_priv size=12 offset=12: 
ObjectEntity type: TYPE(vector) v2 = Vector(0.0, 0.0, 0.0) v1 = Vector(1.0, 2.0, 3.0) diff --git a/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 b/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 index 9cd638d796091..115fe517be181 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 @@ -33,11 +33,12 @@ program test_omp_reduction !$omp declare reduction (.modmul. : t1 : omp_out = omp_out .modmul. omp_in) initializer(omp_priv = t1(1.0)) !CHECK: op.modmul.: UserReductionDetails TYPE(t1) !CHECK: t1: Use from t1 in module1 -!CHECK: OtherConstruct scope: size=16 alignment=4 sourceRange=0 bytes +!CHECK: OtherConstruct scope: size=8 alignment=4 sourceRange=0 bytes !CHECK: omp_in size=4 offset=0: ObjectEntity type: TYPE(t1) -!CHECK: omp_orig size=4 offset=4: ObjectEntity type: TYPE(t1) -!CHECK: omp_out size=4 offset=8: ObjectEntity type: TYPE(t1) -!CHECK: omp_priv size=4 offset=12: ObjectEntity type: TYPE(t1) +!CHECK: omp_out size=4 offset=4: ObjectEntity type: TYPE(t1) +!CHECK: OtherConstruct scope: size=8 alignment=4 sourceRange=0 bytes +!CHECK: omp_orig size=4 offset=0: ObjectEntity type: TYPE(t1) +!CHECK: omp_priv size=4 offset=4: ObjectEntity type: TYPE(t1) result = t1(1.0) !$omp parallel do reduction(.modmul.:result) do i = 1, 10 diff --git a/flang/test/Semantics/OpenMP/declare-reduction.f90 b/flang/test/Semantics/OpenMP/declare-reduction.f90 index 1f39c57c54ad1..c8dee5e240918 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction.f90 @@ -19,10 +19,12 @@ end subroutine initme !$omp declare reduction(red_add:integer(4):omp_out=omp_out+omp_in) initializer(initme(omp_priv,0)) !CHECK: red_add: UserReductionDetails !CHECK: Subprogram scope: initme +!CHECK: OtherConstruct scope: !CHECK: omp_in size=4 offset=0: ObjectEntity type: INTEGER(4) -!CHECK: omp_orig size=4 offset=4: ObjectEntity type: INTEGER(4) 
-!CHECK: omp_out size=4 offset=8: ObjectEntity type: INTEGER(4) -!CHECK: omp_priv size=4 offset=12: ObjectEntity type: INTEGER(4) +!CHECK: omp_out size=4 offset=4: ObjectEntity type: INTEGER(4) +!CHECK: OtherConstruct scope: +!CHECK: omp_orig size=4 offset=0: ObjectEntity type: INTEGER(4) +!CHECK: omp_priv size=4 offset=4: ObjectEntity type: INTEGER(4) !$omp simd reduction(red_add:res) do i=1,n res=res+x(i) @@ -36,9 +38,11 @@ program main !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) !CHECK: my_add_red: UserReductionDetails +!CHECK: OtherConstruct scope: !CHECK: omp_in size=4 offset=0: ObjectEntity type: INTEGER(4) -!CHECK: omp_orig size=4 offset=4: ObjectEntity type: INTEGER(4) -!CHECK: omp_out size=4 offset=8: ObjectEntity type: INTEGER(4) -!CHECK: omp_priv size=4 offset=12: ObjectEntity type: INTEGER(4) +!CHECK: omp_out size=4 offset=4: ObjectEntity type: INTEGER(4) +!CHECK: OtherConstruct scope: +!CHECK: omp_orig size=4 offset=0: ObjectEntity type: INTEGER(4) +!CHECK: omp_priv size=4 offset=4: ObjectEntity type: INTEGER(4) end program main From dfc1adaaebd6a248c2ae525577650953ea8f6b72 Mon Sep 17 00:00:00 2001 From: Sietze Riemersma <43845930+KungFuDonkey@users.noreply.github.com> Date: Tue, 28 Oct 2025 19:01:13 +0100 Subject: [PATCH 045/539] [HLSL][DXIL][SPRIV] Added WaveActiveMin intrinsic (#164385) Adds the WaveActiveMin intrinsic from #99169. 
I think I did all of the required things on the checklist: - [x] Implement `WaveActiveMin` clang builtin, - [x] Link `WaveActiveMin` clang builtin with `hlsl_intrinsics.h` - [x] Add sema checks for `WaveActiveMin` to `CheckHLSLBuiltinFunctionCall` in `SemaChecking.cpp` - [x] Add codegen for `WaveActiveMin` to `EmitHLSLBuiltinExpr` in `CGBuiltin.cpp` - [x] Add codegen tests to `clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl` - [x] Add sema tests to `clang/test/SemaHLSL/BuiltIns/WaveActiveMin-errors.hlsl` - [x] Create the `int_dx_WaveActiveMin` intrinsic in `IntrinsicsDirectX.td` - [x] Create the `DXILOpMapping` of `int_dx_WaveActiveMin` to `119` in `DXIL.td` - [x] Create the `WaveActiveMin.ll` and `WaveActiveMin_errors.ll` tests in `llvm/test/CodeGen/DirectX/` - [x] Create the `int_spv_WaveActiveMin` intrinsic in `IntrinsicsSPIRV.td` - [x] In SPIRVInstructionSelector.cpp create the `WaveActiveMin` lowering and map it to `int_spv_WaveActiveMin` in `SPIRVInstructionSelector::selectIntrinsic`. - [x] Create SPIR-V backend test case in `llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll But as some of the code has changed and was moved around (E.G. `CGBuiltin.cpp` -> `CGHLSLBuiltins.cpp`) I mostly followed how `WaveActiveMax()` is implemented. I have not been able to run the tests myself as I am unsure which project runs the correct test. Any guidance on how I can test myself would be helpful. 
Also added some tests to the offload-test-suite https://github.com/llvm/offload-test-suite/pull/478 --- clang/include/clang/Basic/Builtins.td | 6 + clang/lib/CodeGen/CGHLSLBuiltins.cpp | 32 +++- .../lib/Headers/hlsl/hlsl_alias_intrinsics.h | 123 +++++++++++++++ clang/lib/Sema/SemaHLSL.cpp | 1 + .../CodeGenHLSL/builtins/WaveActiveMin.hlsl | 46 ++++++ .../test/SemaHLSL/BuiltIns/WaveActiveMin.hlsl | 29 ++++ llvm/include/llvm/IR/IntrinsicsDirectX.td | 2 + llvm/include/llvm/IR/IntrinsicsSPIRV.td | 4 +- llvm/lib/Target/DirectX/DXIL.td | 10 ++ llvm/lib/Target/DirectX/DXILShaderFlags.cpp | 2 + .../DirectX/DirectXTargetTransformInfo.cpp | 2 + .../Target/SPIRV/SPIRVInstructionSelector.cpp | 36 +++++ .../CodeGen/DirectX/ShaderFlags/wave-ops.ll | 14 ++ llvm/test/CodeGen/DirectX/WaveActiveMin.ll | 143 ++++++++++++++++++ .../SPIRV/hlsl-intrinsics/WaveActiveMin.ll | 57 +++++++ 15 files changed, 505 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl create mode 100644 clang/test/SemaHLSL/BuiltIns/WaveActiveMin.hlsl create mode 100644 llvm/test/CodeGen/DirectX/WaveActiveMin.ll create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index a2c202158522f..2b400b012d6ed 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -5030,6 +5030,12 @@ def HLSLWaveActiveMax : LangBuiltin<"HLSL_LANG"> { let Prototype = "void (...)"; } +def HLSLWaveActiveMin : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_wave_active_min"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void (...)"; +} + def HLSLWaveActiveSum : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_wave_active_sum"]; let Attributes = [NoThrow, Const]; diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index 384bd59e7533a..fbf4a5722caed 100644 --- 
a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -206,7 +206,7 @@ static Intrinsic::ID getWaveActiveSumIntrinsic(llvm::Triple::ArchType Arch, } } -// Return wave active sum that corresponds to the QT scalar type +// Return wave active max that corresponds to the QT scalar type static Intrinsic::ID getWaveActiveMaxIntrinsic(llvm::Triple::ArchType Arch, CGHLSLRuntime &RT, QualType QT) { switch (Arch) { @@ -225,6 +225,25 @@ static Intrinsic::ID getWaveActiveMaxIntrinsic(llvm::Triple::ArchType Arch, } } +// Return wave active min that corresponds to the QT scalar type +static Intrinsic::ID getWaveActiveMinIntrinsic(llvm::Triple::ArchType Arch, + CGHLSLRuntime &RT, QualType QT) { + switch (Arch) { + case llvm::Triple::spirv: + if (QT->isUnsignedIntegerType()) + return Intrinsic::spv_wave_reduce_umin; + return Intrinsic::spv_wave_reduce_min; + case llvm::Triple::dxil: { + if (QT->isUnsignedIntegerType()) + return Intrinsic::dx_wave_reduce_umin; + return Intrinsic::dx_wave_reduce_min; + } + default: + llvm_unreachable("Intrinsic WaveActiveMin" + " not supported by target architecture"); + } +} + // Returns the mangled name for a builtin function that the SPIR-V backend // will expand into a spec Constant. 
static std::string getSpecConstantFunctionName(clang::QualType SpecConstantType, @@ -742,6 +761,17 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, &CGM.getModule(), IID, {OpExpr->getType()}), ArrayRef{OpExpr}, "hlsl.wave.active.max"); } + case Builtin::BI__builtin_hlsl_wave_active_min: { + // Due to the use of variadic arguments, explicitly retreive argument + Value *OpExpr = EmitScalarExpr(E->getArg(0)); + Intrinsic::ID IID = getWaveActiveMinIntrinsic( + getTarget().getTriple().getArch(), CGM.getHLSLRuntime(), + E->getArg(0)->getType()); + + return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( + &CGM.getModule(), IID, {OpExpr->getType()}), + ArrayRef{OpExpr}, "hlsl.wave.active.min"); + } case Builtin::BI__builtin_hlsl_wave_get_lane_index: { // We don't define a SPIR-V intrinsic, instead it is a SPIR-V built-in // defined in SPIRVBuiltins.td. So instead we manually get the matching name diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h index d973371312701..a918af39e4074 100644 --- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h @@ -2597,6 +2597,129 @@ __attribute__((convergent)) double3 WaveActiveMax(double3); _HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_max) __attribute__((convergent)) double4 WaveActiveMax(double4); +//===----------------------------------------------------------------------===// +// WaveActiveMin builtins +//===----------------------------------------------------------------------===// + +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) half WaveActiveMin(half); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) half2 WaveActiveMin(half2); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) 
+__attribute__((convergent)) half3 WaveActiveMin(half3); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) half4 WaveActiveMin(half4); + +#ifdef __HLSL_ENABLE_16_BIT +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int16_t WaveActiveMin(int16_t); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int16_t2 WaveActiveMin(int16_t2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int16_t3 WaveActiveMin(int16_t3); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int16_t4 WaveActiveMin(int16_t4); + +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint16_t WaveActiveMin(uint16_t); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint16_t2 WaveActiveMin(uint16_t2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint16_t3 WaveActiveMin(uint16_t3); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint16_t4 WaveActiveMin(uint16_t4); +#endif + +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int WaveActiveMin(int); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int2 WaveActiveMin(int2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int3 WaveActiveMin(int3); +_HLSL_AVAILABILITY(shadermodel, 6.0) 
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int4 WaveActiveMin(int4); + +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint WaveActiveMin(uint); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint2 WaveActiveMin(uint2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint3 WaveActiveMin(uint3); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint4 WaveActiveMin(uint4); + +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int64_t WaveActiveMin(int64_t); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int64_t2 WaveActiveMin(int64_t2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int64_t3 WaveActiveMin(int64_t3); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int64_t4 WaveActiveMin(int64_t4); + +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint64_t WaveActiveMin(uint64_t); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint64_t2 WaveActiveMin(uint64_t2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint64_t3 WaveActiveMin(uint64_t3); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint64_t4 WaveActiveMin(uint64_t4); + +_HLSL_AVAILABILITY(shadermodel, 6.0) 
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) float WaveActiveMin(float); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) float2 WaveActiveMin(float2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) float3 WaveActiveMin(float3); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) float4 WaveActiveMin(float4); + +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) double WaveActiveMin(double); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) double2 WaveActiveMin(double2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) double3 WaveActiveMin(double3); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) double4 WaveActiveMin(double4); + //===----------------------------------------------------------------------===// // WaveActiveSum builtins //===----------------------------------------------------------------------===// diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 96d51426e0b5c..94a490a8f68dc 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -3279,6 +3279,7 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { break; } case Builtin::BI__builtin_hlsl_wave_active_max: + case Builtin::BI__builtin_hlsl_wave_active_min: case Builtin::BI__builtin_hlsl_wave_active_sum: { if (SemaRef.checkArgCount(TheCall, 1)) return true; diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl new file mode 100644 index 
0000000000000..1194f842deed6 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl @@ -0,0 +1,46 @@ +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-DXIL +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ +// RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV + +// Test basic lowering to runtime function call. + +// CHECK-LABEL: test_int +int test_int(int expr) { + // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.min.i32([[TY]] %[[#]]) + // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.min.i32([[TY]] %[[#]]) + // CHECK: ret [[TY]] %[[RET]] + return WaveActiveMin(expr); +} + +// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.reduce.min.i32([[TY]]) #[[#attr:]] +// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.reduce.min.i32([[TY]]) #[[#attr:]] + +// CHECK-LABEL: test_uint64_t +uint64_t test_uint64_t(uint64_t expr) { + // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.umin.i64([[TY]] %[[#]]) + // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.umin.i64([[TY]] %[[#]]) + // CHECK: ret [[TY]] %[[RET]] + return WaveActiveMin(expr); +} + +// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.reduce.umin.i64([[TY]]) #[[#attr:]] +// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.reduce.umin.i64([[TY]]) #[[#attr:]] + +// Test basic lowering to runtime function call with array and float value. 
+ +// CHECK-LABEL: test_floatv4 +float4 test_floatv4(float4 expr) { + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] @llvm.spv.wave.reduce.min.v4f32([[TY1]] %[[#]] + // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.reduce.min.v4f32([[TY1]] %[[#]]) + // CHECK: ret [[TY1]] %[[RET1]] + return WaveActiveMin(expr); +} + +// CHECK-DXIL: declare [[TY1]] @llvm.dx.wave.reduce.min.v4f32([[TY1]]) #[[#attr]] +// CHECK-SPIRV: declare [[TY1]] @llvm.spv.wave.reduce.min.v4f32([[TY1]]) #[[#attr]] + +// CHECK: attributes #[[#attr]] = {{{.*}} convergent {{.*}}} + diff --git a/clang/test/SemaHLSL/BuiltIns/WaveActiveMin.hlsl b/clang/test/SemaHLSL/BuiltIns/WaveActiveMin.hlsl new file mode 100644 index 0000000000000..3b12faf8d9978 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/WaveActiveMin.hlsl @@ -0,0 +1,29 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify + +int test_too_few_arg() { + return __builtin_hlsl_wave_active_min(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} +} + +float2 test_too_many_arg(float2 p0) { + return __builtin_hlsl_wave_active_min(p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} +} + +bool test_expr_bool_type_check(bool p0) { + return __builtin_hlsl_wave_active_min(p0); + // expected-error@-1 {{invalid operand of type 'bool'}} +} + +bool2 test_expr_bool_vec_type_check(bool2 p0) { + return __builtin_hlsl_wave_active_min(p0); + // expected-error@-1 {{invalid operand of type 'bool2' (aka 'vector')}} +} + +struct S { float f; }; + +S test_expr_struct_type_check(S p0) { + return __builtin_hlsl_wave_active_min(p0); + // expected-error@-1 {{invalid operand of type 'S' where a scalar or vector is required}} +} + diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 
3b7077c52db21..d6b85630eb979 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -153,6 +153,8 @@ def int_dx_wave_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i1_ty], [IntrCon def int_dx_wave_getlaneindex : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrConvergent, IntrNoMem]>; def int_dx_wave_reduce_max : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; def int_dx_wave_reduce_umax : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; +def int_dx_wave_reduce_min : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; +def int_dx_wave_reduce_umin : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; def int_dx_wave_reduce_sum : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; def int_dx_wave_reduce_usum : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 49a182be98acd..bc51fb639fd75 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -122,6 +122,8 @@ def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty] def int_spv_wave_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i1_ty], [IntrConvergent, IntrNoMem]>; def int_spv_wave_reduce_umax : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; def int_spv_wave_reduce_max : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; + def int_spv_wave_reduce_min : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; + def int_spv_wave_reduce_umin : DefaultAttrsIntrinsic<[llvm_any_ty], 
[LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; def int_spv_wave_reduce_sum : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; def int_spv_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; def int_spv_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>; @@ -136,7 +138,7 @@ def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty] def int_spv_sclamp : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; def int_spv_nclamp : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - // Create resource handle given the binding information. Returns a + // Create resource handle given the binding information. Returns a // type appropriate for the kind of resource given the set id, binding id, // array size of the binding, as well as an index and an indicator // whether that index may be non-uniform. 
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 44c48305f2832..7ae500a55b92d 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -1058,6 +1058,16 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> { IntrinArgIndex<0>, IntrinArgI8, IntrinArgI8 ]>, + IntrinSelect, IntrinArgI8, + IntrinArgI8 + ]>, + IntrinSelect, IntrinArgI8, + IntrinArgI8 + ]>, ]; let arguments = [OverloadTy, Int8Ty, Int8Ty]; diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp index e7e7f2ce66ae8..ce6e8121b9d94 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp @@ -94,6 +94,8 @@ static bool checkWaveOps(Intrinsic::ID IID) { case Intrinsic::dx_wave_reduce_usum: case Intrinsic::dx_wave_reduce_max: case Intrinsic::dx_wave_reduce_umax: + case Intrinsic::dx_wave_reduce_min: + case Intrinsic::dx_wave_reduce_umin: return true; } } diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 68fd3e0bc74c7..60dfd9650937c 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -55,8 +55,10 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_splitdouble: case Intrinsic::dx_wave_readlane: case Intrinsic::dx_wave_reduce_max: + case Intrinsic::dx_wave_reduce_min: case Intrinsic::dx_wave_reduce_sum: case Intrinsic::dx_wave_reduce_umax: + case Intrinsic::dx_wave_reduce_umin: case Intrinsic::dx_wave_reduce_usum: case Intrinsic::dx_imad: case Intrinsic::dx_umad: diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 021353ab716f7..3fea21e6e694c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -222,6 +222,9 @@ class 
SPIRVInstructionSelector : public InstructionSelector { bool selectWaveReduceMax(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, bool IsUnsigned) const; + bool selectWaveReduceMin(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I, bool IsUnsigned) const; + bool selectWaveReduceSum(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; @@ -2456,6 +2459,35 @@ bool SPIRVInstructionSelector::selectWaveReduceMax(Register ResVReg, .constrainAllUses(TII, TRI, RBI); } +bool SPIRVInstructionSelector::selectWaveReduceMin(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I, + bool IsUnsigned) const { + assert(I.getNumOperands() == 3); + assert(I.getOperand(2).isReg()); + MachineBasicBlock &BB = *I.getParent(); + Register InputRegister = I.getOperand(2).getReg(); + SPIRVType *InputType = GR.getSPIRVTypeForVReg(InputRegister); + + if (!InputType) + report_fatal_error("Input Type could not be determined."); + + SPIRVType *IntTy = GR.getOrCreateSPIRVIntegerType(32, I, TII); + // Retreive the operation to use based on input type + bool IsFloatTy = GR.isScalarOrVectorOfType(InputRegister, SPIRV::OpTypeFloat); + auto IntegerOpcodeType = + IsUnsigned ? SPIRV::OpGroupNonUniformUMin : SPIRV::OpGroupNonUniformSMin; + auto Opcode = IsFloatTy ? 
SPIRV::OpGroupNonUniformFMin : IntegerOpcodeType; + return BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(GR.getOrCreateConstInt(SPIRV::Scope::Subgroup, I, IntTy, TII, + !STI.isShader())) + .addImm(SPIRV::GroupOperation::Reduce) + .addUse(I.getOperand(2).getReg()) + .constrainAllUses(TII, TRI, RBI); +} + bool SPIRVInstructionSelector::selectWaveReduceSum(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { @@ -3431,6 +3463,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectWaveReduceMax(ResVReg, ResType, I, /*IsUnsigned*/ true); case Intrinsic::spv_wave_reduce_max: return selectWaveReduceMax(ResVReg, ResType, I, /*IsUnsigned*/ false); + case Intrinsic::spv_wave_reduce_umin: + return selectWaveReduceMin(ResVReg, ResType, I, /*IsUnsigned*/ true); + case Intrinsic::spv_wave_reduce_min: + return selectWaveReduceMin(ResVReg, ResType, I, /*IsUnsigned*/ false); case Intrinsic::spv_wave_reduce_sum: return selectWaveReduceSum(ResVReg, ResType, I); case Intrinsic::spv_wave_readlane: diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll index 7a876f67615cd..3544017062e8e 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll @@ -76,6 +76,20 @@ entry: ret i32 %ret } +define noundef i32 @wave_reduce_min(i32 noundef %x) { +entry: + ; CHECK: Function wave_reduce_min : [[WAVE_FLAG]] + %ret = call i32 @llvm.dx.wave.reduce.min.i32(i32 %x) + ret i32 %ret +} + +define noundef i32 @wave_reduce_umin(i32 noundef %x) { +entry: + ; CHECK: Function wave_reduce_umin : [[WAVE_FLAG]] + %ret = call i32 @llvm.dx.wave.reduce.umin.i32(i32 %x) + ret i32 %ret +} + define void @wave_active_countbits(i1 %expr) { entry: ; CHECK: Function wave_active_countbits : [[WAVE_FLAG]] diff --git a/llvm/test/CodeGen/DirectX/WaveActiveMin.ll 
b/llvm/test/CodeGen/DirectX/WaveActiveMin.ll new file mode 100644 index 0000000000000..24fde48fadfeb --- /dev/null +++ b/llvm/test/CodeGen/DirectX/WaveActiveMin.ll @@ -0,0 +1,143 @@ +; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s + +; Test that for scalar values, WaveActiveMin maps down to the DirectX op + +define noundef half @wave_active_min_half(half noundef %expr) { +entry: +; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr, i8 2, i8 0){{$}} + %ret = call half @llvm.dx.wave.reduce.min.f16(half %expr) + ret half %ret +} + +define noundef float @wave_active_min_float(float noundef %expr) { +entry: +; CHECK: call float @dx.op.waveActiveOp.f32(i32 119, float %expr, i8 2, i8 0){{$}} + %ret = call float @llvm.dx.wave.reduce.min.f32(float %expr) + ret float %ret +} + +define noundef double @wave_active_min_double(double noundef %expr) { +entry: +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr, i8 2, i8 0){{$}} + %ret = call double @llvm.dx.wave.reduce.min.f64(double %expr) + ret double %ret +} + +define noundef i16 @wave_active_min_i16(i16 noundef %expr) { +entry: +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr, i8 2, i8 0){{$}} + %ret = call i16 @llvm.dx.wave.reduce.min.i16(i16 %expr) + ret i16 %ret +} + +define noundef i32 @wave_active_min_i32(i32 noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr, i8 2, i8 0){{$}} + %ret = call i32 @llvm.dx.wave.reduce.min.i32(i32 %expr) + ret i32 %ret +} + +define noundef i64 @wave_active_min_i64(i64 noundef %expr) { +entry: +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr, i8 2, i8 0){{$}} + %ret = call i64 @llvm.dx.wave.reduce.min.i64(i64 %expr) + ret i64 %ret +} + +define noundef i16 @wave_active_umin_i16(i16 noundef %expr) { +entry: +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr, i8 2, i8 1){{$}} + %ret = call i16 @llvm.dx.wave.reduce.umin.i16(i16 %expr) + ret i16 
%ret +} + +define noundef i32 @wave_active_umin_i32(i32 noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr, i8 2, i8 1){{$}} + %ret = call i32 @llvm.dx.wave.reduce.umin.i32(i32 %expr) + ret i32 %ret +} + +define noundef i64 @wave_active_umin_i64(i64 noundef %expr) { +entry: +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr, i8 2, i8 1){{$}} + %ret = call i64 @llvm.dx.wave.reduce.umin.i64(i64 %expr) + ret i64 %ret +} + +declare half @llvm.dx.wave.reduce.min.f16(half) +declare float @llvm.dx.wave.reduce.min.f32(float) +declare double @llvm.dx.wave.reduce.min.f64(double) + +declare i16 @llvm.dx.wave.reduce.min.i16(i16) +declare i32 @llvm.dx.wave.reduce.min.i32(i32) +declare i64 @llvm.dx.wave.reduce.min.i64(i64) + +declare i16 @llvm.dx.wave.reduce.umin.i16(i16) +declare i32 @llvm.dx.wave.reduce.umin.i32(i32) +declare i64 @llvm.dx.wave.reduce.umin.i64(i64) + +; Test that for vector values, WaveActiveMin scalarizes and maps down to the +; DirectX op + +define noundef <2 x half> @wave_active_min_v2half(<2 x half> noundef %expr) { +entry: +; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr.i0, i8 2, i8 0){{$}} +; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr.i1, i8 2, i8 0){{$}} + %ret = call <2 x half> @llvm.dx.wave.reduce.min.v2f16(<2 x half> %expr) + ret <2 x half> %ret +} + +define noundef <3 x i32> @wave_active_min_v3i32(<3 x i32> noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i0, i8 2, i8 0){{$}} +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i1, i8 2, i8 0){{$}} +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i2, i8 2, i8 0){{$}} + %ret = call <3 x i32> @llvm.dx.wave.reduce.min.v3i32(<3 x i32> %expr) + ret <3 x i32> %ret +} + +define noundef <4 x double> @wave_active_min_v4f64(<4 x double> noundef %expr) { +entry: +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i0, i8 2, i8 0){{$}} +; CHECK: call double 
@dx.op.waveActiveOp.f64(i32 119, double %expr.i1, i8 2, i8 0){{$}} +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i2, i8 2, i8 0){{$}} +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i3, i8 2, i8 0){{$}} + %ret = call <4 x double> @llvm.dx.wave.reduce.min.v4f64(<4 x double> %expr) + ret <4 x double> %ret +} + +declare <2 x half> @llvm.dx.wave.reduce.min.v2f16(<2 x half>) +declare <3 x i32> @llvm.dx.wave.reduce.min.v3i32(<3 x i32>) +declare <4 x double> @llvm.dx.wave.reduce.min.v4f64(<4 x double>) + +define noundef <2 x i16> @wave_active_umin_v2i16(<2 x i16> noundef %expr) { +entry: +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr.i0, i8 2, i8 1){{$}} +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr.i1, i8 2, i8 1){{$}} + %ret = call <2 x i16> @llvm.dx.wave.reduce.umin.v2f16(<2 x i16> %expr) + ret <2 x i16> %ret +} + +define noundef <3 x i32> @wave_active_umin_v3i32(<3 x i32> noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i0, i8 2, i8 1){{$}} +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i1, i8 2, i8 1){{$}} +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i2, i8 2, i8 1){{$}} + %ret = call <3 x i32> @llvm.dx.wave.reduce.umin.v3i32(<3 x i32> %expr) + ret <3 x i32> %ret +} + +define noundef <4 x i64> @wave_active_umin_v4f64(<4 x i64> noundef %expr) { +entry: +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i0, i8 2, i8 1){{$}} +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i1, i8 2, i8 1){{$}} +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i2, i8 2, i8 1){{$}} +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i3, i8 2, i8 1){{$}} + %ret = call <4 x i64> @llvm.dx.wave.reduce.umin.v4f64(<4 x i64> %expr) + ret <4 x i64> %ret +} + +declare <2 x i16> @llvm.dx.wave.reduce.umin.v2f16(<2 x i16>) +declare <3 x i32> @llvm.dx.wave.reduce.umin.v3i32(<3 x i32>) +declare <4 x i64> 
@llvm.dx.wave.reduce.umin.v4f64(<4 x i64>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll new file mode 100644 index 0000000000000..d121c1a937a9b --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll @@ -0,0 +1,57 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %} + +; Test lowering to spir-v backend for various types and scalar/vector + +; CHECK: OpCapability GroupNonUniformArithmetic + +; CHECK-DAG: %[[#f16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#f32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#v4_half:]] = OpTypeVector %[[#f16]] 4 +; CHECK-DAG: %[[#scope:]] = OpConstant %[[#uint]] 3 + +; CHECK-LABEL: Begin function test_float +; CHECK: %[[#fexpr:]] = OpFunctionParameter %[[#f32]] +define float @test_float(float %fexpr) { +entry: +; CHECK: %[[#fret:]] = OpGroupNonUniformFMin %[[#f32]] %[[#scope]] Reduce %[[#fexpr]] + %0 = call float @llvm.spv.wave.reduce.min.f32(float %fexpr) + ret float %0 +} + +; CHECK-LABEL: Begin function test_int_signed +; CHECK: %[[#iexpr:]] = OpFunctionParameter %[[#uint]] +define i32 @test_int_signed(i32 %iexpr) { +entry: +; CHECK: %[[#iret:]] = OpGroupNonUniformSMin %[[#uint]] %[[#scope]] Reduce %[[#iexpr]] + %0 = call i32 @llvm.spv.wave.reduce.min.i32(i32 %iexpr) + ret i32 %0 +} + +; CHECK-LABEL: Begin function test_int_unsigned +; CHECK: %[[#iexpr:]] = OpFunctionParameter %[[#uint]] +define i32 @test_int_unsigned(i32 %iexpr) { +entry: +; CHECK: %[[#iret:]] = OpGroupNonUniformUMin %[[#uint]] %[[#scope]] Reduce %[[#iexpr]] + %0 = call i32 @llvm.spv.wave.reduce.umin.i32(i32 %iexpr) + ret i32 %0 +} + +; CHECK-LABEL: Begin function test_vhalf +; CHECK: %[[#vbexpr:]] = OpFunctionParameter %[[#v4_half]] +define <4 x half> @test_vhalf(<4 x half> %vbexpr) { +entry: 
+; CHECK: %[[#vhalfret:]] = OpGroupNonUniformFMin %[[#v4_half]] %[[#scope]] Reduce %[[#vbexpr]] + %0 = call <4 x half> @llvm.spv.wave.reduce.min.v4half(<4 x half> %vbexpr) + ret <4 x half> %0 +} + +declare float @llvm.spv.wave.reduce.min.f32(float) +declare i32 @llvm.spv.wave.reduce.min.i32(i32) +declare <4 x half> @llvm.spv.wave.reduce.min.v4half(<4 x half>) + +declare float @llvm.spv.wave.reduce.umin.f32(float) +declare i32 @llvm.spv.wave.reduce.umin.i32(i32) +declare <4 x half> @llvm.spv.wave.reduce.umin.v4half(<4 x half>) + From b70363136afce36c4631c370403e48734c9a7f81 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 28 Oct 2025 06:34:56 -0700 Subject: [PATCH 046/539] [SLP]Check only instructions with unique parent instruction user Need to re-check the instruction with the non-schedulable parent, only if this parent has a user phi node (i.e. it is used only outside the block) and the user instruction has unique parent instruction. Fixes issue reported in https://github.com/llvm/llvm-project/commit/20675ee67d048a42482c246e25b284637d55347c#commitcomment-168863594 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 21 ++- ...t-node-schedulable-with-multi-copyables.ll | 170 ++++++++++++++++++ 2 files changed, 189 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/parent-node-schedulable-with-multi-copyables.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4fcaf6dabb513..43166c035fe7a 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5608,6 +5608,7 @@ class BoUpSLP { for (ScheduleBundle *Bundle : Bundles) { if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0) break; + SmallPtrSet ParentsUniqueUsers; // Need to search for the lane since the tree entry can be // reordered. 
auto *It = find(Bundle->getTreeEntry()->Scalars, In); @@ -5636,6 +5637,22 @@ class BoUpSLP { Bundle->getTreeEntry()->isCopyableElement(In)) && "Missed TreeEntry operands?"); + bool IsNonSchedulableWithParentPhiNode = + Bundle->getTreeEntry()->doesNotNeedToSchedule() && + Bundle->getTreeEntry()->UserTreeIndex && + Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() && + Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() == + Instruction::PHI; + // Count the number of unique phi nodes, which are the parent for + // parent entry, and exit, if all the unique phis are processed. + if (IsNonSchedulableWithParentPhiNode) { + const TreeEntry *ParentTE = + Bundle->getTreeEntry()->UserTreeIndex.UserTE; + Value *User = ParentTE->Scalars[Lane]; + if (!ParentsUniqueUsers.insert(User).second) + break; + } + for (unsigned OpIdx : seq(Bundle->getTreeEntry()->getNumOperands())) if (auto *I = dyn_cast( @@ -5644,8 +5661,8 @@ class BoUpSLP { << *I << "\n"); DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked); } - // If parent node is schedulable, it will be handle correctly. - if (!Bundle->getTreeEntry()->doesNotNeedToSchedule()) + // If parent node is schedulable, it will be handled correctly. 
+ if (!IsNonSchedulableWithParentPhiNode) break; It = std::find(std::next(It), Bundle->getTreeEntry()->Scalars.end(), In); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/parent-node-schedulable-with-multi-copyables.ll b/llvm/test/Transforms/SLPVectorizer/X86/parent-node-schedulable-with-multi-copyables.ll new file mode 100644 index 0000000000000..9e96e93a3205b --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/parent-node-schedulable-with-multi-copyables.ll @@ -0,0 +1,170 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i64 @test(ptr %arg1, i64 %alloca.promoted344, i8 %load.311.i, i1 %load1.i) { +; CHECK-LABEL: define i64 @test( +; CHECK-SAME: ptr [[ARG1:%.*]], i64 [[ALLOCA_PROMOTED344:%.*]], i8 [[LOAD_311_I:%.*]], i1 [[LOAD1_I:%.*]]) { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i8> , i8 [[LOAD_311_I]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> , i8 [[LOAD_311_I]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[ALLOCA_PROMOTED344]], i32 0 +; CHECK-NEXT: br label %[[BB2:.*]] +; CHECK: [[BB2]]: +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i64> [ zeroinitializer, %[[BB]] ], [ [[TMP28:%.*]], %[[BB12_8_I:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <8 x i8> [ zeroinitializer, %[[BB]] ], [ [[TMP29:%.*]], %[[BB12_8_I]] ] +; CHECK-NEXT: br i1 [[LOAD1_I]], label %[[SPAM_EXIT:.*]], label %[[BB4_LR_PH_I:.*]] +; CHECK: [[BB4_LR_PH_I]]: +; CHECK-NEXT: br i1 true, label %[[BB3_I_I_PEEL:.*]], label %[[EGGS_EXIT_I_PEEL:.*]] +; CHECK: [[BB3_I_I_PEEL]]: +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP3]], splat (i64 1) +; CHECK-NEXT: [[LOAD4_I_I_PEEL:%.*]] = load i64, ptr [[ARG1]], align 8 +; CHECK-NEXT: [[SHL_I_I_PEEL:%.*]] = shl i64 [[LOAD4_I_I_PEEL]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> 
poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[SHL_I_I_PEEL]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <2 x i64> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32> +; CHECK-NEXT: br label %[[EGGS_EXIT_I_PEEL]] +; CHECK: [[EGGS_EXIT_I_PEEL]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i64> [ [[TMP10]], %[[BB3_I_I_PEEL]] ], [ zeroinitializer, %[[BB4_LR_PH_I]] ] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i64> [[TMP12]] to <4 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: br label %[[SPAM_EXIT]] +; CHECK: [[SPAM_EXIT]]: +; CHECK-NEXT: [[GETELEMENTPTR_I_I_PROMOTED346:%.*]] = phi i64 [ [[TMP14]], %[[EGGS_EXIT_I_PEEL]] ], [ 0, %[[BB2]] ] +; CHECK-NEXT: [[LOAD_8_I:%.*]] = phi i8 [ 0, %[[EGGS_EXIT_I_PEEL]] ], [ 1, %[[BB2]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i8> [ [[TMP13]], %[[EGGS_EXIT_I_PEEL]] ], [ zeroinitializer, %[[BB2]] ] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <4 x i32> +; CHECK-NEXT: br i1 [[LOAD1_I]], label %[[BB12_8_I]], label %[[BB12_1_THREAD_I:.*]] +; CHECK: [[BB12_1_THREAD_I]]: +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i8> [[TMP4]], i32 0 +; CHECK-NEXT: [[ICMP5_3_I:%.*]] = icmp eq i8 [[TMP17]], 0 +; CHECK-NEXT: br i1 [[ICMP5_3_I]], label %[[BB12_3_I:.*]], label %[[BB8_3_I:.*]] +; CHECK: [[BB8_3_I]]: +; CHECK-NEXT: br label %[[BB12_3_I]] +; CHECK: [[BB12_3_I]]: +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i8> [[TMP4]], i32 1 +; CHECK-NEXT: [[ICMP5_4_I:%.*]] = icmp eq i8 [[TMP18]], 0 +; CHECK-NEXT: br i1 [[ICMP5_4_I]], label %[[BB12_4_I:.*]], label %[[BB8_4_I:.*]] +; CHECK: [[BB8_4_I]]: +; CHECK-NEXT: br label %[[BB12_4_I]] +; CHECK: [[BB12_4_I]]: +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i8> 
[[TMP4]], i32 2 +; CHECK-NEXT: [[ICMP5_5_I:%.*]] = icmp eq i8 [[TMP19]], 0 +; CHECK-NEXT: br i1 [[ICMP5_5_I]], label %[[BB12_5_I:.*]], label %[[BB8_5_I:.*]] +; CHECK: [[BB8_5_I]]: +; CHECK-NEXT: br label %[[BB12_5_I]] +; CHECK: [[BB12_5_I]]: +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i8> [[TMP4]], i32 3 +; CHECK-NEXT: [[ICMP5_7_I:%.*]] = icmp eq i8 [[TMP20]], 0 +; CHECK-NEXT: br i1 [[ICMP5_7_I]], label %[[BB12_7_I:.*]], label %[[BB8_7_I:.*]] +; CHECK: [[BB8_7_I]]: +; CHECK-NEXT: br label %[[BB12_7_I]] +; CHECK: [[BB12_7_I]]: +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i8> [[TMP4]], i32 4 +; CHECK-NEXT: [[ICMP5_8_I:%.*]] = icmp eq i8 [[TMP21]], 0 +; CHECK-NEXT: br i1 [[ICMP5_8_I]], label %[[BB12_8_I]], label %[[BB8_8_I:.*]] +; CHECK: [[BB8_8_I]]: +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[LOAD_8_I]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i8> poison, i8 [[LOAD_8_I]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP23]], <4 x i8> [[TMP24]], <4 x i32> +; CHECK-NEXT: br label %[[BB12_8_I]] +; CHECK: [[BB12_8_I]]: +; CHECK-NEXT: [[TMP26:%.*]] = phi <4 x i8> [ [[TMP0]], %[[BB12_7_I]] ], [ [[TMP22]], %[[BB8_8_I]] ], [ [[TMP15]], %[[SPAM_EXIT]] ] +; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i8> [ zeroinitializer, %[[BB12_7_I]] ], [ [[TMP25]], %[[BB8_8_I]] ], [ [[TMP16]], %[[SPAM_EXIT]] ] +; CHECK-NEXT: [[TMP28]] = insertelement <2 x i64> [[TMP2]], i64 [[GETELEMENTPTR_I_I_PROMOTED346]], i32 1 +; CHECK-NEXT: [[TMP29]] = shufflevector <4 x i8> [[TMP26]], <4 x i8> [[TMP27]], <8 x i32> +; CHECK-NEXT: br label %[[BB2]] +; +bb: + br label %bb2 + +bb2: + %getelementptr.i.i.promoted = phi i64 [ 0, %bb ], [ %getelementptr.i.i.promoted346, %bb12.8.i ] + %alloca.promoted = phi i64 [ 0, %bb ], [ %alloca.promoted344, %bb12.8.i ] + %load.8.i231 = phi i8 [ 0, %bb ], [ %load.8.i239, %bb12.8.i ] + %load.7.i217 = phi i8 [ 0, %bb ], [ 
%load.7.i225, %bb12.8.i ] + %load.626.i200 = phi i8 [ 0, %bb ], [ %load.626.i208, %bb12.8.i ] + %load.6.i183 = phi i8 [ 0, %bb ], [ %load.6.i191, %bb12.8.i ] + %load.5.i167 = phi i8 [ 0, %bb ], [ %load.5.i175, %bb12.8.i ] + %load.418.i148 = phi i8 [ 0, %bb ], [ %load.418.i156, %bb12.8.i ] + %load.4.i129 = phi i8 [ 0, %bb ], [ %load.4.i137, %bb12.8.i ] + %load.3.i111 = phi i8 [ 0, %bb ], [ %load.3.i119, %bb12.8.i ] + br i1 %load1.i, label %spam.exit, label %bb4.lr.ph.i + +bb4.lr.ph.i: + br i1 true, label %bb3.i.i.peel, label %eggs.exit.i.peel + +bb3.i.i.peel: + %and.i.i.peel = and i64 %alloca.promoted, 1 + %load4.i.i.peel = load i64, ptr %arg1, align 8 + %shl.i.i.peel = shl i64 %load4.i.i.peel, 1 + %or.i.i.peel = or i64 %shl.i.i.peel, %and.i.i.peel + %and6.i.i.peel = and i64 %getelementptr.i.i.promoted, 1 + %xor.i.i.peel = xor i64 %and6.i.i.peel, %alloca.promoted + br label %eggs.exit.i.peel + +eggs.exit.i.peel: + %load5.i.i93.peel = phi i64 [ %xor.i.i.peel, %bb3.i.i.peel ], [ 0, %bb4.lr.ph.i ] + %or.i.i91.peel = phi i64 [ %or.i.i.peel, %bb3.i.i.peel ], [ 0, %bb4.lr.ph.i ] + %0 = trunc i64 %or.i.i91.peel to i8 + %1 = trunc nuw i64 %or.i.i91.peel to i8 + %2 = trunc i64 %load5.i.i93.peel to i8 + br label %spam.exit + +spam.exit: + %getelementptr.i.i.promoted346 = phi i64 [ %load5.i.i93.peel, %eggs.exit.i.peel ], [ 0, %bb2 ] + %load.834.i = phi i8 [ %2, %eggs.exit.i.peel ], [ 0, %bb2 ] + %load.7.i25 = phi i8 [ %1, %eggs.exit.i.peel ], [ 0, %bb2 ] + %load.8.i = phi i8 [ 0, %eggs.exit.i.peel ], [ 1, %bb2 ] + %load.6.i18 = phi i8 [ %0, %eggs.exit.i.peel ], [ 0, %bb2 ] + br i1 %load1.i, label %bb12.8.i, label %bb12.1.thread.i + +bb12.1.thread.i: + %icmp5.3.i = icmp eq i8 %load.3.i111, 0 + br i1 %icmp5.3.i, label %bb12.3.i, label %bb8.3.i + +bb8.3.i: + br label %bb12.3.i + +bb12.3.i: + %icmp5.4.i = icmp eq i8 %load.4.i129, 0 + br i1 %icmp5.4.i, label %bb12.4.i, label %bb8.4.i + +bb8.4.i: + br label %bb12.4.i + +bb12.4.i: + %icmp5.5.i = icmp eq i8 %load.5.i167, 0 + br i1 
%icmp5.5.i, label %bb12.5.i, label %bb8.5.i + +bb8.5.i: + br label %bb12.5.i + +bb12.5.i: + %icmp5.7.i = icmp eq i8 %load.7.i217, 0 + br i1 %icmp5.7.i, label %bb12.7.i, label %bb8.7.i + +bb8.7.i: + br label %bb12.7.i + +bb12.7.i: + %icmp5.8.i = icmp eq i8 %load.8.i231, 0 + br i1 %icmp5.8.i, label %bb12.8.i, label %bb8.8.i + +bb8.8.i: + br label %bb12.8.i + +bb12.8.i: + %load.8.i239 = phi i8 [ 0, %bb12.7.i ], [ %load.8.i, %bb8.8.i ], [ %load.834.i, %spam.exit ] + %load.7.i225 = phi i8 [ 0, %bb12.7.i ], [ %load.311.i, %bb8.8.i ], [ %load.7.i25, %spam.exit ] + %load.626.i208 = phi i8 [ 0, %bb12.7.i ], [ %load.8.i, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + %load.6.i191 = phi i8 [ %load.311.i, %bb12.7.i ], [ 0, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + %load.5.i175 = phi i8 [ 0, %bb12.7.i ], [ %load.6.i183, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + %load.418.i156 = phi i8 [ 0, %bb12.7.i ], [ %load.626.i200, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + %load.4.i137 = phi i8 [ 0, %bb12.7.i ], [ %load.418.i148, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + %load.3.i119 = phi i8 [ 0, %bb12.7.i ], [ 0, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + br label %bb2 +} From f14c9bd08d8e8de3825f3af3b14b6a43f098a9d5 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Tue, 28 Oct 2025 19:27:18 +0100 Subject: [PATCH 047/539] [CIR] Fix building ClangIR after RegionBranchOpInterface revamp (#165441) Fix building ClangIR after RegionBranchOpInterface revamp (#165429) --- clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 23 +++++++++++-------- .../lib/CIR/Interfaces/CIRLoopOpInterface.cpp | 13 +++++++---- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index 2d2ef422bfaef..7ba03ce40140c 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -286,14 +286,14 @@ void cir::ConditionOp::getSuccessorRegions( // Parent is a loop: condition may branch to the body or to the 
parent op. if (auto loopOp = dyn_cast(getOperation()->getParentOp())) { regions.emplace_back(&loopOp.getBody(), loopOp.getBody().getArguments()); - regions.emplace_back(loopOp->getResults()); + regions.emplace_back(getOperation(), loopOp->getResults()); } assert(!cir::MissingFeatures::awaitOp()); } MutableOperandRange -cir::ConditionOp::getMutableSuccessorOperands(RegionBranchPoint point) { +cir::ConditionOp::getMutableSuccessorOperands(RegionSuccessor point) { // No values are yielded to the successor region. return MutableOperandRange(getOperation(), 0, 0); } @@ -989,7 +989,8 @@ void cir::IfOp::getSuccessorRegions(mlir::RegionBranchPoint point, SmallVectorImpl ®ions) { // The `then` and the `else` region branch back to the parent operation. if (!point.isParent()) { - regions.push_back(RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); return; } @@ -1039,7 +1040,7 @@ void cir::ScopeOp::getSuccessorRegions( mlir::RegionBranchPoint point, SmallVectorImpl ®ions) { // The only region always branch back to the parent operation. 
if (!point.isParent()) { - regions.push_back(RegionSuccessor(getODSResults(0))); + regions.push_back(RegionSuccessor(getOperation(), getODSResults(0))); return; } @@ -1124,7 +1125,8 @@ Block *cir::BrCondOp::getSuccessorForOperands(ArrayRef operands) { void cir::CaseOp::getSuccessorRegions( mlir::RegionBranchPoint point, SmallVectorImpl ®ions) { if (!point.isParent()) { - regions.push_back(RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); return; } regions.push_back(RegionSuccessor(&getCaseRegion())); @@ -1188,7 +1190,8 @@ static void printSwitchOp(OpAsmPrinter &p, cir::SwitchOp op, void cir::SwitchOp::getSuccessorRegions( mlir::RegionBranchPoint point, SmallVectorImpl ®ion) { if (!point.isParent()) { - region.push_back(RegionSuccessor()); + region.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); return; } @@ -1402,7 +1405,8 @@ void cir::GlobalOp::getSuccessorRegions( mlir::RegionBranchPoint point, SmallVectorImpl ®ions) { // The `ctor` and `dtor` regions always branch back to the parent operation. if (!point.isParent()) { - regions.push_back(RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); return; } @@ -1961,7 +1965,7 @@ void cir::TernaryOp::getSuccessorRegions( mlir::RegionBranchPoint point, SmallVectorImpl ®ions) { // The `true` and the `false` region branch back to the parent operation. if (!point.isParent()) { - regions.push_back(RegionSuccessor(this->getODSResults(0))); + regions.push_back(RegionSuccessor(getOperation(), this->getODSResults(0))); return; } @@ -2978,7 +2982,8 @@ void cir::TryOp::getSuccessorRegions( llvm::SmallVectorImpl ®ions) { // The `try` and the `catchers` region branch back to the parent operation. 
if (!point.isParent()) { - regions.push_back(mlir::RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); return; } diff --git a/clang/lib/CIR/Interfaces/CIRLoopOpInterface.cpp b/clang/lib/CIR/Interfaces/CIRLoopOpInterface.cpp index 0ce5017a399da..6de51f12837ba 100644 --- a/clang/lib/CIR/Interfaces/CIRLoopOpInterface.cpp +++ b/clang/lib/CIR/Interfaces/CIRLoopOpInterface.cpp @@ -17,7 +17,7 @@ namespace cir { void LoopOpInterface::getLoopOpSuccessorRegions( LoopOpInterface op, mlir::RegionBranchPoint point, llvm::SmallVectorImpl ®ions) { - assert(point.isParent() || point.getRegionOrNull()); + assert(point.isParent() || point.getTerminatorPredecessorOrNull()); // Branching to first region: go to condition or body (do-while). if (point.isParent()) { @@ -25,15 +25,18 @@ void LoopOpInterface::getLoopOpSuccessorRegions( return; } + mlir::Region *parentRegion = + point.getTerminatorPredecessorOrNull()->getParentRegion(); + // Branching from condition: go to body or exit. - if (&op.getCond() == point.getRegionOrNull()) { - regions.emplace_back(mlir::RegionSuccessor(op->getResults())); + if (&op.getCond() == parentRegion) { + regions.emplace_back(mlir::RegionSuccessor(op, op->getResults())); regions.emplace_back(&op.getBody(), op.getBody().getArguments()); return; } // Branching from body: go to step (for) or condition. - if (&op.getBody() == point.getRegionOrNull()) { + if (&op.getBody() == parentRegion) { // FIXME(cir): Should we consider break/continue statements here? mlir::Region *afterBody = (op.maybeGetStep() ? op.maybeGetStep() : &op.getCond()); @@ -42,7 +45,7 @@ void LoopOpInterface::getLoopOpSuccessorRegions( } // Branching from step: go to condition. 
- if (op.maybeGetStep() == point.getRegionOrNull()) { + if (op.maybeGetStep() == parentRegion) { regions.emplace_back(&op.getCond(), op.getCond().getArguments()); return; } From 849de0b22174d582f6d7194fd78b4663658b3a1d Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Tue, 28 Oct 2025 22:01:27 +0300 Subject: [PATCH 048/539] [CI] fix typo in code-format job (#165461) --- llvm/utils/git/code-format-helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py index 406a72817acb8..dff7f78ce64a2 100755 --- a/llvm/utils/git/code-format-helper.py +++ b/llvm/utils/git/code-format-helper.py @@ -508,7 +508,7 @@ def hook_main(): parser = argparse.ArgumentParser() parser.add_argument( - "--token", type=str, required=True, help="GitHub authentiation token" + "--token", type=str, required=True, help="GitHub authentication token" ) parser.add_argument( "--repo", From a07b8e1ebabf6583f6f96999b209db63178ffe71 Mon Sep 17 00:00:00 2001 From: Tarun Prabhu Date: Tue, 28 Oct 2025 13:04:01 -0600 Subject: [PATCH 049/539] [clang][Driver] Move test out of clang/include In 9865171e24961, a file named aarch64-mlr-for-calls-only.c was added to clang/include/clang/Driver. This file contains only llvm-lit directives. The file has been moved to clang/test/Driver where it ought to reside. 
--- clang/{include/clang => test}/Driver/aarch64-mlr-for-calls-only.c | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename clang/{include/clang => test}/Driver/aarch64-mlr-for-calls-only.c (100%) diff --git a/clang/include/clang/Driver/aarch64-mlr-for-calls-only.c b/clang/test/Driver/aarch64-mlr-for-calls-only.c similarity index 100% rename from clang/include/clang/Driver/aarch64-mlr-for-calls-only.c rename to clang/test/Driver/aarch64-mlr-for-calls-only.c From a1173854c4a16384ee9533d276130562e7424d69 Mon Sep 17 00:00:00 2001 From: Razvan Lupusoru Date: Tue, 28 Oct 2025 12:34:35 -0700 Subject: [PATCH 050/539] [acc] Add `acc.kernel_environment` to enable compute decomposition (#165455) Introduce `acc.kernel_environment` operation to capture data mapping and asynchronous behavior from OpenACC compute constructs. This enables decomposition by separating data movement and synchronization from kernel execution parallelism, facilitating lowering to GPU dialect. --- .../mlir/Dialect/OpenACC/OpenACCOps.td | 50 +++++++++++++ mlir/test/Dialect/OpenACC/ops.mlir | 73 +++++++++++++++++++ 2 files changed, 123 insertions(+) diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 2f87975ebaa04..a18c18af8a753 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -2116,6 +2116,56 @@ def OpenACC_KernelsOp : OpenACC_Op<"kernels", let hasVerifier = 1; } +//===----------------------------------------------------------------------===// +// acc.kernel_environment +//===----------------------------------------------------------------------===// + +def OpenACC_KernelEnvironmentOp : OpenACC_Op<"kernel_environment", + [AttrSizedOperandSegments, RecursiveMemoryEffects, SingleBlock, + NoTerminator, + MemoryEffects<[MemWrite, + MemRead]>]> { + let summary = "Decomposition of compute constructs to capture data mapping " + "and asynchronous behavior information"; + let 
description = [{ + The `acc.kernel_environment` operation represents a decomposition of + any OpenACC compute construct (acc.kernels, acc.parallel, or + acc.serial) that captures data mapping and asynchronous behavior: + - data clause operands + - async clause operands + - wait clause operands + + This allows kernel execution parallelism and privatization to be + handled separately, facilitating eventual lowering to GPU dialect where + kernel launching and compute offloading are handled separately. + }]; + + let arguments = (ins + Variadic:$dataClauseOperands, + Variadic:$asyncOperands, + OptionalAttr:$asyncOperandsDeviceType, + OptionalAttr:$asyncOnly, + Variadic:$waitOperands, + OptionalAttr:$waitOperandsSegments, + OptionalAttr:$waitOperandsDeviceType, + OptionalAttr:$hasWaitDevnum, + OptionalAttr:$waitOnly); + + let regions = (region SizedRegion<1>:$region); + + let assemblyFormat = [{ + oilist( + `dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)` + | `async` `` custom($asyncOperands, + type($asyncOperands), $asyncOperandsDeviceType, $asyncOnly) + | `wait` `` custom($waitOperands, type($waitOperands), + $waitOperandsDeviceType, $waitOperandsSegments, $hasWaitDevnum, + $waitOnly) + ) + $region attr-dict + }]; +} + //===----------------------------------------------------------------------===// // 2.6.5 data Construct //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index 77d18da49276a..042ee2503cb95 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -2243,3 +2243,76 @@ func.func @test_firstprivate_map(%arg0: memref<10xf32>) { // CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: return + +// ----- + +func.func @test_kernel_environment(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { + %c1 = arith.constant 1 : index + %c1024 = arith.constant 1024 : index + + // Create data clause 
operands for the kernel environment + %copyin = acc.copyin varPtr(%arg0 : memref<1024xf32>) -> memref<1024xf32> + %create = acc.create varPtr(%arg1 : memref<1024xf32>) -> memref<1024xf32> + + // Kernel environment wraps gpu.launch and captures data mapping + acc.kernel_environment dataOperands(%copyin, %create : memref<1024xf32>, memref<1024xf32>) { + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1) + threads(%tx, %ty, %tz) in (%block_x = %c1024, %block_y = %c1, %block_z = %c1) { + // Kernel body uses the mapped data + %val = memref.load %copyin[%tx] : memref<1024xf32> + %result = arith.mulf %val, %val : f32 + memref.store %result, %create[%tx] : memref<1024xf32> + gpu.terminator + } + } + + // Copy results back to host and deallocate device memory + acc.copyout accPtr(%create : memref<1024xf32>) to varPtr(%arg1 : memref<1024xf32>) + acc.delete accPtr(%copyin : memref<1024xf32>) + + return +} + +// CHECK-LABEL: func @test_kernel_environment +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr(%{{.*}} : memref<1024xf32>) -> memref<1024xf32> +// CHECK: %[[CREATE:.*]] = acc.create varPtr(%{{.*}} : memref<1024xf32>) -> memref<1024xf32> +// CHECK: acc.kernel_environment dataOperands(%[[COPYIN]], %[[CREATE]] : memref<1024xf32>, memref<1024xf32>) { +// CHECK: gpu.launch +// CHECK: memref.load %[[COPYIN]] +// CHECK: memref.store %{{.*}}, %[[CREATE]] +// CHECK: } +// CHECK: } +// CHECK: acc.copyout accPtr(%[[CREATE]] : memref<1024xf32>) to varPtr(%{{.*}} : memref<1024xf32>) +// CHECK: acc.delete accPtr(%[[COPYIN]] : memref<1024xf32>) + +// ----- + +func.func @test_kernel_environment_with_async(%arg0: memref<1024xf32>) { + %c1 = arith.constant 1 : index + %c1024 = arith.constant 1024 : index + %async_val = arith.constant 1 : i32 + + %create = acc.create varPtr(%arg0 : memref<1024xf32>) async(%async_val : i32) -> memref<1024xf32> + + // Kernel environment with async clause + acc.kernel_environment dataOperands(%create : memref<1024xf32>) 
async(%async_val : i32) { + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1) + threads(%tx, %ty, %tz) in (%block_x = %c1024, %block_y = %c1, %block_z = %c1) { + %f0 = arith.constant 0.0 : f32 + memref.store %f0, %create[%tx] : memref<1024xf32> + gpu.terminator + } + } + + acc.copyout accPtr(%create : memref<1024xf32>) async(%async_val : i32) to varPtr(%arg0 : memref<1024xf32>) + + return +} + +// CHECK-LABEL: func @test_kernel_environment_with_async +// CHECK: %[[ASYNC:.*]] = arith.constant 1 : i32 +// CHECK: %[[CREATE:.*]] = acc.create varPtr(%{{.*}} : memref<1024xf32>) async(%[[ASYNC]] : i32) -> memref<1024xf32> +// CHECK: acc.kernel_environment dataOperands(%[[CREATE]] : memref<1024xf32>) async(%[[ASYNC]] : i32) +// CHECK: gpu.launch +// CHECK: memref.store %{{.*}}, %[[CREATE]] +// CHECK: acc.copyout accPtr(%[[CREATE]] : memref<1024xf32>) async(%[[ASYNC]] : i32) to varPtr(%{{.*}} : memref<1024xf32>) From 5e8739fb49bbe06440a8023573d7925db80f00e0 Mon Sep 17 00:00:00 2001 From: slachowsky Date: Tue, 28 Oct 2025 12:51:11 -0700 Subject: [PATCH 051/539] [RISCV] 'Zalrsc' may permit non-base instructions (#165042) Provide shorter atomic LR/SC sequences with non-base instructions (eg. ''B'' extension instructions) when implementations opt in to FeaturePermissiveZalrsc. Currently this shortens `atomicrmw {min,max,umin,umax}` pseudo expansions. There is no functional change for machines when this target feature is not requested. 
--- .../RISCV/RISCVExpandAtomicPseudoInsts.cpp | 23 + llvm/lib/Target/RISCV/RISCVFeatures.td | 19 + llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll | 642 ++++++++++++++++++ llvm/test/CodeGen/RISCV/features-info.ll | 1 + 4 files changed, 685 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp index 98b636e8e0e55..9bd66a43717e7 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp @@ -373,6 +373,26 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, .addReg(ScratchReg) .addImm(-1); break; + case AtomicRMWInst::Max: + BuildMI(LoopMBB, DL, TII->get(RISCV::MAX), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::Min: + BuildMI(LoopMBB, DL, TII->get(RISCV::MIN), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::UMax: + BuildMI(LoopMBB, DL, TII->get(RISCV::MAXU), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::UMin: + BuildMI(LoopMBB, DL, TII->get(RISCV::MINU), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; } BuildMI(LoopMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg) .addReg(ScratchReg) @@ -682,6 +702,9 @@ bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width, MachineBasicBlock::iterator &NextMBBI) { + // Using MIN(U)/MAX(U) is preferrable if permitted + if (STI->hasPermissiveZalrsc() && STI->hasStdExtZbb() && !IsMasked) + return expandAtomicBinOp(MBB, MBBI, BinOp, IsMasked, Width, NextMBBI); MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 2754d789b9899..b4556f66473d6 100644 --- 
a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1906,6 +1906,25 @@ def FeatureForcedAtomics : SubtargetFeature< def HasAtomicLdSt : Predicate<"Subtarget->hasStdExtZalrsc() || Subtarget->hasForcedAtomics()">; +// The RISC-V Unprivileged Architecture - ISA Volume 1 (Version: 20250508) +// [https://docs.riscv.org/reference/isa/_attachments/riscv-unprivileged.pdf] +// in section 13.3. Eventual Success of Store-Conditional Instructions, defines +// _constrained_ LR/SC loops: +// The dynamic code executed between the LR and SC instructions can only +// contain instructions from the base ''I'' instruction set, excluding loads, +// stores, backward jumps, taken backward branches, JALR, FENCE, and SYSTEM +// instructions. Compressed forms of the aforementioned ''I'' instructions in +// the Zca and Zcb extensions are also permitted. +// LR/SC loops that do not adhere to the above are _unconstrained_ LR/SC loops, +// and success is implementation specific. For implementations which know that +// non-base instructions (such as the ''B'' extension) will not violate any +// forward progress guarantees, using these instructions to reduce the LR/SC +// sequence length is desirable. 
+def FeaturePermissiveZalrsc + : SubtargetFeature< + "permissive-zalrsc", "HasPermissiveZalrsc", "true", + "Implementation permits non-base instructions between LR/SC pairs">; + def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", "AllowTaggedGlobals", "true", "Use an instruction sequence for taking the address of a global " diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll b/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll new file mode 100644 index 0000000000000..b43555c6637c4 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll @@ -0,0 +1,642 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV32IB-COMMON,RV32IB-ZALRSC %s +; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc,+permissive-zalrsc -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV32IB-COMMON,RV32IB-ZALRSC-PERM %s +; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc,+permissive-zalrsc,+a -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV32IB-COMMON,RV32IAB %s +; +; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV64IB-ZALRSC %s +; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc,+permissive-zalrsc -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV64IB-ZALRSC-PERM %s +; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc,+permissive-zalrsc,+a -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV64IAB %s + +define i32 @atomicrmw_max_i32_seq_cst(ptr %a, i32 %b) nounwind { +; RV32IB-ZALRSC-LABEL: atomicrmw_max_i32_seq_cst: +; RV32IB-ZALRSC: # %bb.0: +; RV32IB-ZALRSC-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-NEXT: mv a3, a2 +; RV32IB-ZALRSC-NEXT: bge a3, a1, .LBB0_3 +; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB0_1 Depth=1 +; RV32IB-ZALRSC-NEXT: mv a3, a1 +; 
RV32IB-ZALRSC-NEXT: .LBB0_3: # in Loop: Header=BB0_1 Depth=1 +; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-NEXT: bnez a3, .LBB0_1 +; RV32IB-ZALRSC-NEXT: # %bb.4: +; RV32IB-ZALRSC-NEXT: mv a0, a2 +; RV32IB-ZALRSC-NEXT: ret +; +; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_max_i32_seq_cst: +; RV32IB-ZALRSC-PERM: # %bb.0: +; RV32IB-ZALRSC-PERM-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-PERM-NEXT: max a3, a2, a1 +; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB0_1 +; RV32IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV32IB-ZALRSC-PERM-NEXT: ret +; +; RV32IAB-LABEL: atomicrmw_max_i32_seq_cst: +; RV32IAB: # %bb.0: +; RV32IAB-NEXT: amomax.w.aqrl a0, a1, (a0) +; RV32IAB-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_max_i32_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: bge a3, a2, .LBB0_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB0_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: .LBB0_3: # in Loop: Header=BB0_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB0_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a1 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_max_i32_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-PERM-NEXT: max a3, a1, a2 +; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB0_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: 
atomicrmw_max_i32_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomax.w.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw max ptr %a, i32 %b seq_cst + ret i32 %1 +} + +define i32 @atomicrmw_min_i32_seq_cst(ptr %a, i32 %b) nounwind { +; RV32IB-ZALRSC-LABEL: atomicrmw_min_i32_seq_cst: +; RV32IB-ZALRSC: # %bb.0: +; RV32IB-ZALRSC-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-NEXT: mv a3, a2 +; RV32IB-ZALRSC-NEXT: bge a1, a3, .LBB1_3 +; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB1_1 Depth=1 +; RV32IB-ZALRSC-NEXT: mv a3, a1 +; RV32IB-ZALRSC-NEXT: .LBB1_3: # in Loop: Header=BB1_1 Depth=1 +; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-NEXT: bnez a3, .LBB1_1 +; RV32IB-ZALRSC-NEXT: # %bb.4: +; RV32IB-ZALRSC-NEXT: mv a0, a2 +; RV32IB-ZALRSC-NEXT: ret +; +; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_min_i32_seq_cst: +; RV32IB-ZALRSC-PERM: # %bb.0: +; RV32IB-ZALRSC-PERM-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-PERM-NEXT: min a3, a2, a1 +; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB1_1 +; RV32IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV32IB-ZALRSC-PERM-NEXT: ret +; +; RV32IAB-LABEL: atomicrmw_min_i32_seq_cst: +; RV32IAB: # %bb.0: +; RV32IAB-NEXT: amomin.w.aqrl a0, a1, (a0) +; RV32IAB-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_min_i32_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: bge a2, a3, .LBB1_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB1_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: .LBB1_3: # in Loop: Header=BB1_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB1_1 +; 
RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a1 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_min_i32_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-PERM-NEXT: min a3, a1, a2 +; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB1_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_min_i32_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomin.w.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw min ptr %a, i32 %b seq_cst + ret i32 %1 +} + +define i32 @atomicrmw_umax_i32_seq_cst(ptr %a, i32 %b) nounwind { +; RV32IB-ZALRSC-LABEL: atomicrmw_umax_i32_seq_cst: +; RV32IB-ZALRSC: # %bb.0: +; RV32IB-ZALRSC-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-NEXT: mv a3, a2 +; RV32IB-ZALRSC-NEXT: bgeu a3, a1, .LBB2_3 +; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB2_1 Depth=1 +; RV32IB-ZALRSC-NEXT: mv a3, a1 +; RV32IB-ZALRSC-NEXT: .LBB2_3: # in Loop: Header=BB2_1 Depth=1 +; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-NEXT: bnez a3, .LBB2_1 +; RV32IB-ZALRSC-NEXT: # %bb.4: +; RV32IB-ZALRSC-NEXT: mv a0, a2 +; RV32IB-ZALRSC-NEXT: ret +; +; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i32_seq_cst: +; RV32IB-ZALRSC-PERM: # %bb.0: +; RV32IB-ZALRSC-PERM-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-PERM-NEXT: maxu a3, a2, a1 +; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB2_1 +; RV32IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV32IB-ZALRSC-PERM-NEXT: ret +; +; RV32IAB-LABEL: atomicrmw_umax_i32_seq_cst: +; RV32IAB: # %bb.0: +; RV32IAB-NEXT: 
amomaxu.w.aqrl a0, a1, (a0) +; RV32IAB-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_umax_i32_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: bgeu a3, a2, .LBB2_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB2_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: .LBB2_3: # in Loop: Header=BB2_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB2_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a1 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i32_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-PERM-NEXT: maxu a3, a1, a2 +; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB2_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_umax_i32_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomaxu.w.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw umax ptr %a, i32 %b seq_cst + ret i32 %1 +} + +define i32 @atomicrmw_umin_i32_seq_cst(ptr %a, i32 %b) nounwind { +; RV32IB-ZALRSC-LABEL: atomicrmw_umin_i32_seq_cst: +; RV32IB-ZALRSC: # %bb.0: +; RV32IB-ZALRSC-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-NEXT: mv a3, a2 +; RV32IB-ZALRSC-NEXT: bgeu a1, a3, .LBB3_3 +; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1 +; RV32IB-ZALRSC-NEXT: mv a3, a1 +; RV32IB-ZALRSC-NEXT: .LBB3_3: # in Loop: Header=BB3_1 Depth=1 +; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-NEXT: bnez a3, .LBB3_1 +; RV32IB-ZALRSC-NEXT: # %bb.4: 
+; RV32IB-ZALRSC-NEXT: mv a0, a2 +; RV32IB-ZALRSC-NEXT: ret +; +; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i32_seq_cst: +; RV32IB-ZALRSC-PERM: # %bb.0: +; RV32IB-ZALRSC-PERM-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-PERM-NEXT: minu a3, a2, a1 +; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB3_1 +; RV32IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV32IB-ZALRSC-PERM-NEXT: ret +; +; RV32IAB-LABEL: atomicrmw_umin_i32_seq_cst: +; RV32IAB: # %bb.0: +; RV32IAB-NEXT: amominu.w.aqrl a0, a1, (a0) +; RV32IAB-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_umin_i32_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: bgeu a2, a3, .LBB3_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: .LBB3_3: # in Loop: Header=BB3_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB3_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a1 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i32_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-PERM-NEXT: minu a3, a1, a2 +; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB3_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_umin_i32_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amominu.w.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw umin ptr %a, i32 %b seq_cst + ret i32 %1 +} + 
+define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind { +; RV32IB-COMMON-LABEL: atomicrmw_max_i64_seq_cst: +; RV32IB-COMMON: # %bb.0: +; RV32IB-COMMON-NEXT: addi sp, sp, -32 +; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: mv s0, a2 +; RV32IB-COMMON-NEXT: mv s1, a0 +; RV32IB-COMMON-NEXT: lw a4, 0(a0) +; RV32IB-COMMON-NEXT: lw a5, 4(a0) +; RV32IB-COMMON-NEXT: mv s2, a1 +; RV32IB-COMMON-NEXT: j .LBB4_2 +; RV32IB-COMMON-NEXT: .LBB4_1: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: sw a4, 8(sp) +; RV32IB-COMMON-NEXT: sw a5, 12(sp) +; RV32IB-COMMON-NEXT: addi a1, sp, 8 +; RV32IB-COMMON-NEXT: li a4, 5 +; RV32IB-COMMON-NEXT: li a5, 5 +; RV32IB-COMMON-NEXT: mv a0, s1 +; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8 +; RV32IB-COMMON-NEXT: lw a4, 8(sp) +; RV32IB-COMMON-NEXT: lw a5, 12(sp) +; RV32IB-COMMON-NEXT: bnez a0, .LBB4_7 +; RV32IB-COMMON-NEXT: .LBB4_2: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IB-COMMON-NEXT: beq a5, s0, .LBB4_4 +; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: slt a0, s0, a5 +; RV32IB-COMMON-NEXT: j .LBB4_5 +; RV32IB-COMMON-NEXT: .LBB4_4: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, s2, a4 +; RV32IB-COMMON-NEXT: .LBB4_5: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, a4 +; RV32IB-COMMON-NEXT: mv a3, a5 +; RV32IB-COMMON-NEXT: bnez a0, .LBB4_1 +; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, s2 +; RV32IB-COMMON-NEXT: mv a3, s0 +; RV32IB-COMMON-NEXT: j .LBB4_1 +; RV32IB-COMMON-NEXT: .LBB4_7: # 
%atomicrmw.end +; RV32IB-COMMON-NEXT: mv a0, a4 +; RV32IB-COMMON-NEXT: mv a1, a5 +; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: addi sp, sp, 32 +; RV32IB-COMMON-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_max_i64_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: bge a3, a1, .LBB4_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB4_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: .LBB4_3: # in Loop: Header=BB4_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB4_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a2 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_max_i64_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-PERM-NEXT: max a3, a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB4_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_max_i64_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomax.d.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw max ptr %a, i64 %b seq_cst + ret i64 %1 +} + +define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind { +; RV32IB-COMMON-LABEL: atomicrmw_min_i64_seq_cst: +; RV32IB-COMMON: # %bb.0: +; RV32IB-COMMON-NEXT: addi sp, sp, -32 +; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded 
Spill +; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: mv s0, a2 +; RV32IB-COMMON-NEXT: mv s1, a0 +; RV32IB-COMMON-NEXT: lw a4, 0(a0) +; RV32IB-COMMON-NEXT: lw a5, 4(a0) +; RV32IB-COMMON-NEXT: mv s2, a1 +; RV32IB-COMMON-NEXT: j .LBB5_2 +; RV32IB-COMMON-NEXT: .LBB5_1: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: sw a4, 8(sp) +; RV32IB-COMMON-NEXT: sw a5, 12(sp) +; RV32IB-COMMON-NEXT: addi a1, sp, 8 +; RV32IB-COMMON-NEXT: li a4, 5 +; RV32IB-COMMON-NEXT: li a5, 5 +; RV32IB-COMMON-NEXT: mv a0, s1 +; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8 +; RV32IB-COMMON-NEXT: lw a4, 8(sp) +; RV32IB-COMMON-NEXT: lw a5, 12(sp) +; RV32IB-COMMON-NEXT: bnez a0, .LBB5_7 +; RV32IB-COMMON-NEXT: .LBB5_2: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IB-COMMON-NEXT: beq a5, s0, .LBB5_4 +; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: slt a0, a5, s0 +; RV32IB-COMMON-NEXT: j .LBB5_5 +; RV32IB-COMMON-NEXT: .LBB5_4: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, a4, s2 +; RV32IB-COMMON-NEXT: .LBB5_5: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, a4 +; RV32IB-COMMON-NEXT: mv a3, a5 +; RV32IB-COMMON-NEXT: bnez a0, .LBB5_1 +; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, s2 +; RV32IB-COMMON-NEXT: mv a3, s0 +; RV32IB-COMMON-NEXT: j .LBB5_1 +; RV32IB-COMMON-NEXT: .LBB5_7: # %atomicrmw.end +; RV32IB-COMMON-NEXT: mv a0, a4 +; RV32IB-COMMON-NEXT: mv a1, a5 +; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: addi sp, sp, 32 +; 
RV32IB-COMMON-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_min_i64_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: bge a1, a3, .LBB5_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB5_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: .LBB5_3: # in Loop: Header=BB5_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB5_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a2 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_min_i64_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-PERM-NEXT: min a3, a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB5_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_min_i64_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomin.d.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw min ptr %a, i64 %b seq_cst + ret i64 %1 +} + +define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind { +; RV32IB-COMMON-LABEL: atomicrmw_umax_i64_seq_cst: +; RV32IB-COMMON: # %bb.0: +; RV32IB-COMMON-NEXT: addi sp, sp, -32 +; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: mv s0, a2 +; RV32IB-COMMON-NEXT: mv s1, a0 +; RV32IB-COMMON-NEXT: lw a4, 0(a0) +; RV32IB-COMMON-NEXT: lw a5, 4(a0) +; RV32IB-COMMON-NEXT: mv s2, a1 +; RV32IB-COMMON-NEXT: j .LBB6_2 +; RV32IB-COMMON-NEXT: .LBB6_1: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: 
Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: sw a4, 8(sp) +; RV32IB-COMMON-NEXT: sw a5, 12(sp) +; RV32IB-COMMON-NEXT: addi a1, sp, 8 +; RV32IB-COMMON-NEXT: li a4, 5 +; RV32IB-COMMON-NEXT: li a5, 5 +; RV32IB-COMMON-NEXT: mv a0, s1 +; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8 +; RV32IB-COMMON-NEXT: lw a4, 8(sp) +; RV32IB-COMMON-NEXT: lw a5, 12(sp) +; RV32IB-COMMON-NEXT: bnez a0, .LBB6_7 +; RV32IB-COMMON-NEXT: .LBB6_2: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IB-COMMON-NEXT: beq a5, s0, .LBB6_4 +; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, s0, a5 +; RV32IB-COMMON-NEXT: j .LBB6_5 +; RV32IB-COMMON-NEXT: .LBB6_4: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, s2, a4 +; RV32IB-COMMON-NEXT: .LBB6_5: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, a4 +; RV32IB-COMMON-NEXT: mv a3, a5 +; RV32IB-COMMON-NEXT: bnez a0, .LBB6_1 +; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, s2 +; RV32IB-COMMON-NEXT: mv a3, s0 +; RV32IB-COMMON-NEXT: j .LBB6_1 +; RV32IB-COMMON-NEXT: .LBB6_7: # %atomicrmw.end +; RV32IB-COMMON-NEXT: mv a0, a4 +; RV32IB-COMMON-NEXT: mv a1, a5 +; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: addi sp, sp, 32 +; RV32IB-COMMON-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_umax_i64_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: bgeu a3, a1, .LBB6_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB6_1 
Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: .LBB6_3: # in Loop: Header=BB6_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB6_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a2 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i64_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-PERM-NEXT: maxu a3, a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB6_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_umax_i64_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomaxu.d.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw umax ptr %a, i64 %b seq_cst + ret i64 %1 +} + +define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind { +; RV32IB-COMMON-LABEL: atomicrmw_umin_i64_seq_cst: +; RV32IB-COMMON: # %bb.0: +; RV32IB-COMMON-NEXT: addi sp, sp, -32 +; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: mv s0, a2 +; RV32IB-COMMON-NEXT: mv s1, a0 +; RV32IB-COMMON-NEXT: lw a4, 0(a0) +; RV32IB-COMMON-NEXT: lw a5, 4(a0) +; RV32IB-COMMON-NEXT: mv s2, a1 +; RV32IB-COMMON-NEXT: j .LBB7_2 +; RV32IB-COMMON-NEXT: .LBB7_1: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: sw a4, 8(sp) +; RV32IB-COMMON-NEXT: sw a5, 12(sp) +; RV32IB-COMMON-NEXT: addi a1, sp, 8 +; RV32IB-COMMON-NEXT: li a4, 5 +; RV32IB-COMMON-NEXT: li a5, 5 +; RV32IB-COMMON-NEXT: mv a0, s1 +; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8 +; RV32IB-COMMON-NEXT: lw a4, 8(sp) +; RV32IB-COMMON-NEXT: lw a5, 
12(sp) +; RV32IB-COMMON-NEXT: bnez a0, .LBB7_7 +; RV32IB-COMMON-NEXT: .LBB7_2: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IB-COMMON-NEXT: beq a5, s0, .LBB7_4 +; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, a5, s0 +; RV32IB-COMMON-NEXT: j .LBB7_5 +; RV32IB-COMMON-NEXT: .LBB7_4: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, a4, s2 +; RV32IB-COMMON-NEXT: .LBB7_5: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, a4 +; RV32IB-COMMON-NEXT: mv a3, a5 +; RV32IB-COMMON-NEXT: bnez a0, .LBB7_1 +; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, s2 +; RV32IB-COMMON-NEXT: mv a3, s0 +; RV32IB-COMMON-NEXT: j .LBB7_1 +; RV32IB-COMMON-NEXT: .LBB7_7: # %atomicrmw.end +; RV32IB-COMMON-NEXT: mv a0, a4 +; RV32IB-COMMON-NEXT: mv a1, a5 +; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: addi sp, sp, 32 +; RV32IB-COMMON-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_umin_i64_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: bgeu a1, a3, .LBB7_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB7_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: .LBB7_3: # in Loop: Header=BB7_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB7_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a2 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i64_seq_cst: +; 
RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-PERM-NEXT: minu a3, a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB7_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_umin_i64_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amominu.d.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw umin ptr %a, i64 %b seq_cst + ret i64 %1 +} diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 5e5f2b78e8869..37e11dbb12731 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -81,6 +81,7 @@ ; CHECK-NEXT: optimized-nf7-segment-load-store - vlseg7eN.v and vsseg7eN.v are implemented as a wide memory op and shuffle. ; CHECK-NEXT: optimized-nf8-segment-load-store - vlseg8eN.v and vsseg8eN.v are implemented as a wide memory op and shuffle. ; CHECK-NEXT: optimized-zero-stride-load - Optimized (perform fewer memory operations)zero-stride vector load. +; CHECK-NEXT: permissive-zalrsc - Implementation permits non-base instructions between LR/SC pairs. ; CHECK-NEXT: predictable-select-expensive - Prefer likely predicted branches over selects. ; CHECK-NEXT: prefer-vsetvli-over-read-vlenb - Prefer vsetvli over read vlenb CSR to calculate VLEN. ; CHECK-NEXT: prefer-w-inst - Prefer instructions with W suffix. From d3f894ba2abc6720a0df8d85831ae6c37e403f15 Mon Sep 17 00:00:00 2001 From: Justin Rosner Date: Tue, 28 Oct 2025 16:05:30 -0400 Subject: [PATCH 052/539] [mlir][ROCDL] Add tensor load and store instructions to ROCDL (#165016) Add support for `tensor.load.to.lds` and `tensor.store.from.lds` instructions in ROCDL. 
--- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 115 +++++++++++++++---- mlir/test/Dialect/LLVMIR/rocdl.mlir | 30 +++++ mlir/test/Target/LLVMIR/rocdl.mlir | 30 +++++ 3 files changed, 151 insertions(+), 24 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index d2df244eb9363..5241f9a6f2b43 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -146,6 +146,35 @@ class ROCDL_DimGetterFunctionOp : + FixedVectorOfLengthAndType<[length], [elem]>, + BuildableType< + "::mlir::VectorType::get({" # length # "} ," + # elem.builderCall # ")">; + +def ROCDL_V2I16Type : ROCDL_ConcreteVector; +def ROCDL_V2F16Type : ROCDL_ConcreteVector; +def ROCDL_V2I32Type : ROCDL_ConcreteVector; +def ROCDL_V2BF16Type : ROCDL_ConcreteVector; +def ROCDL_V2F32Type : ROCDL_ConcreteVector; +def ROCDL_V3I32Type : ROCDL_ConcreteVector; +def ROCDL_V4I32Type : ROCDL_ConcreteVector; +def ROCDL_V6I32Type : ROCDL_ConcreteVector; +def ROCDL_V8I32Type : ROCDL_ConcreteVector; +def ROCDL_V8BF16Type : ROCDL_ConcreteVector; +def ROCDL_V8F16Type : ROCDL_ConcreteVector; +def ROCDL_V8F32Type : ROCDL_ConcreteVector; +def ROCDL_V16BF16Type : ROCDL_ConcreteVector; +def ROCDL_V16F16Type : ROCDL_ConcreteVector; +def ROCDL_V16F32Type : ROCDL_ConcreteVector; +def ROCDL_V32F16Type : ROCDL_ConcreteVector; +def ROCDL_V32BF16Type : ROCDL_ConcreteVector; +def ROCDL_V32F32Type : ROCDL_ConcreteVector; + //===----------------------------------------------------------------------===// // Wave-level primitives //===----------------------------------------------------------------------===// @@ -663,6 +692,68 @@ def ROCDL_GlobalLoadLDSOp : }]; } +//===---------------------------------------------------------------------===// +// Tensor load/store intrinsics (available in GFX1250) +//===---------------------------------------------------------------------===// + +// Base class for tensor load/store operations 
with 4 descriptor groups. +class ROCDL_TensorLDSIntrOp : + ROCDL_IntrOp { + dag args = (ins ROCDL_V4I32Type:$dgroup0, ROCDL_V8I32Type:$dgroup1, + ROCDL_V4I32Type:$dgroup2, ROCDL_V4I32Type:$dgroup3, + I32Attr:$cachePolicy); + let arguments = !con(args, baseArgs); + let summary = "Base class for ROCDL tensor load/store to/from LDS."; + let description = [{ + Moves tiles of tensor data between global memory and LDS. The tile is + described by the $dgroup descriptors. 4 $dgroup descriptors allows for + movement of up to 5D tensors. $cachePolicy describes the memory scope and an + indicator of expected data re-use. + + This op is for gfx1250+ architectures. + }]; + let assemblyFormat = [{ + attr-dict operands `cachepolicy` $cachePolicy `:` type($dgroup0) `,` type($dgroup1) + }]; + let extraClassDefinition = [{ + SmallVector $cppClass::getAccessedOperands() { + return {getDgroup0(), getDgroup1(), getDgroup2(), getDgroup3()}; + } + }]; +} + +// Base class for tensor load/store operations with 2 descriptor groups +// (D2 variant). +class ROCDL_TensorLDSIntrD2Op : + ROCDL_IntrOp { + dag args = (ins ROCDL_V4I32Type:$dgroup0, ROCDL_V8I32Type:$dgroup1, + I32Attr:$cachePolicy); + let arguments = !con(args, baseArgs); + let summary = "Base class for ROCDL tensor load/store to/from LDS (D2 variant)."; + let description = [{ + Moves tiles of tensor data between global memory and LDS. The tile is + described by the $dgroup descriptors. 2 $dgroup descriptors allows for + movement of up to 2D tensors. $cachePolicy describes the memory scope and an + indicator of expected data re-use. + + This op is for gfx1250+ architectures. 
+ }]; + let assemblyFormat = [{ + attr-dict operands `cachepolicy` $cachePolicy `:` type($dgroup0) `,` type($dgroup1) + }]; + let extraClassDefinition = [{ + SmallVector $cppClass::getAccessedOperands() { + return {getDgroup0(), getDgroup1()}; + } + }]; +} + +// Tensor load and store operations +def ROCDL_TensorLoadToLDSOp : ROCDL_TensorLDSIntrOp<"tensor.load.to.lds">; +def ROCDL_TensorStoreFromLDSOp : ROCDL_TensorLDSIntrOp<"tensor.store.from.lds">; +def ROCDL_TensorLoadToLDSD2Op : ROCDL_TensorLDSIntrD2Op<"tensor.load.to.lds.d2">; +def ROCDL_TensorStoreFromLDSD2Op : ROCDL_TensorLDSIntrD2Op<"tensor.store.from.lds.d2">; + //===---------------------------------------------------------------------===// // Operations on raw buffer resources (stride of 0, bounds checks either off or in // raw buffer mode). @@ -932,30 +1023,6 @@ def ROCDL_Permlane32SwapOp : ROCDL_IntrOp<"permlane32.swap", [], [], }]; } -class ROCDL_ConcreteVector : - FixedVectorOfLengthAndType<[length], [elem]>, - BuildableType< - "::mlir::VectorType::get({" # length # "} ," - # elem.builderCall # ")">; - -def ROCDL_V2I16Type : ROCDL_ConcreteVector; -def ROCDL_V2F16Type : ROCDL_ConcreteVector; -def ROCDL_V2I32Type : ROCDL_ConcreteVector; -def ROCDL_V2BF16Type : ROCDL_ConcreteVector; -def ROCDL_V2F32Type : ROCDL_ConcreteVector; -def ROCDL_V3I32Type : ROCDL_ConcreteVector; -def ROCDL_V6I32Type : ROCDL_ConcreteVector; -def ROCDL_V8I32Type : ROCDL_ConcreteVector; -def ROCDL_V8BF16Type : ROCDL_ConcreteVector; -def ROCDL_V8F16Type : ROCDL_ConcreteVector; -def ROCDL_V8F32Type : ROCDL_ConcreteVector; -def ROCDL_V16BF16Type : ROCDL_ConcreteVector; -def ROCDL_V16F16Type : ROCDL_ConcreteVector; -def ROCDL_V16F32Type : ROCDL_ConcreteVector; -def ROCDL_V32F16Type : ROCDL_ConcreteVector; -def ROCDL_V32BF16Type : ROCDL_ConcreteVector; -def ROCDL_V32F32Type : ROCDL_ConcreteVector; - //===---------------------------------------------------------------------===// // 16-bit float intrinsics 
//===---------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index d270ee8b089aa..e703600c71c8e 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -664,6 +664,36 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { llvm.return } +// CHECK-LABEL @rocdl.tensor.load.to.lds +llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>, + %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) { + // CHECK: rocdl.tensor.load.to.lds %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} cachepolicy 0 : vector<4xi32>, vector<8xi32> + rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32> + llvm.return +} + +// CHECK-LABEL @rocdl.tensor.store.from.lds +llvm.func @rocdl.tensor.store.from.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>, + %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) { + // CHECK: rocdl.tensor.store.from.lds %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} cachepolicy 0 : vector<4xi32>, vector<8xi32> + rocdl.tensor.store.from.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32> + llvm.return +} + +// CHECK-LABEL @rocdl.tensor.load.to.lds.d2 +llvm.func @rocdl.tensor.load.to.lds.d2(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>) { + // CHECK: rocdl.tensor.load.to.lds.d2 %{{.*}}, %{{.*}} cachepolicy 0 : vector<4xi32>, vector<8xi32> + rocdl.tensor.load.to.lds.d2 %dgroup0, %dgroup1 cachepolicy 0 : vector<4xi32>, vector<8xi32> + llvm.return +} + +// CHECK-LABEL @rocdl.tensor.store.from.lds.d2 +llvm.func @rocdl.tensor.store.from.lds.d2(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>) { + // CHECK: rocdl.tensor.store.from.lds.d2 %{{.*}}, %{{.*}} cachepolicy 0 : vector<4xi32>, vector<8xi32> + rocdl.tensor.store.from.lds.d2 %dgroup0, %dgroup1 cachepolicy 0 : vector<4xi32>, vector<8xi32> + llvm.return 
+} + llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr, %stride : i16, %numRecords : i64, diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 30126f6bff05a..8a848221a50dd 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -1040,6 +1040,36 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { llvm.return } +// CHECK-LABEL: rocdl.tensor.load.to.lds +llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>, + %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) { + // CHECK: call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 0) + rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32> + llvm.return +} + +// CHECK-LABEL: rocdl.tensor.store.from.lds +llvm.func @rocdl.tensor.store.from.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>, + %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) { + // CHECK: call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 0) + rocdl.tensor.store.from.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32> + llvm.return +} + +// CHECK-LABEL: rocdl.tensor.load.to.lds.d2 +llvm.func @rocdl.tensor.load.to.lds.d2(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>) { + // CHECK: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i32 0) + rocdl.tensor.load.to.lds.d2 %dgroup0, %dgroup1 cachepolicy 0 : vector<4xi32>, vector<8xi32> + llvm.return +} + +// CHECK-LABEL: rocdl.tensor.store.from.lds.d2 +llvm.func @rocdl.tensor.store.from.lds.d2(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>) { + // CHECK: call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i32 0) + rocdl.tensor.store.from.lds.d2 %dgroup0, %dgroup1 
cachepolicy 0 : vector<4xi32>, vector<8xi32> + llvm.return +} + llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr, %stride : i16, %numRecords : i64, From 733b304c6d021dcbc71c2f68a5c964fae97b6506 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 28 Oct 2025 13:16:20 -0700 Subject: [PATCH 053/539] [libc][hdrgen] Fix `includes` sorting in JSON emission (#165460) The JSON output support in hdrgen had a bug that tripped when used with headers that use special-case headers like to supply some times, as well as llvm-libc-types/*.h headers. --- libc/utils/hdrgen/hdrgen/header.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/libc/utils/hdrgen/hdrgen/header.py b/libc/utils/hdrgen/hdrgen/header.py index 2118db6e5fb75..715d4b7c9b7ed 100644 --- a/libc/utils/hdrgen/hdrgen/header.py +++ b/libc/utils/hdrgen/hdrgen/header.py @@ -241,7 +241,5 @@ def json_data(self): return { "name": self.name, "standards": self.standards, - "includes": [ - str(file) for file in sorted({COMMON_HEADER} | self.includes()) - ], + "includes": sorted(str(file) for file in {COMMON_HEADER} | self.includes()), } From 57ce6e7a30d8d3dbbcd735611392f4163d3cd015 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 28 Oct 2025 14:02:54 -0700 Subject: [PATCH 054/539] [Clang] Restore SafeStack support for x86-32 Fuchsia (#165471) Fuchsia does not fully support an x86-32 (i?86-fuchsia) target. But the x86_64-fuchsia target in -m32 mode is used when building some kernel / boot-loader related code. This narrow use of an (effective) i?86-fuchsia target still supports SafeStack using the same Fuchsia-specific ABI as x86_64-fuchsia. 
--- clang/lib/Driver/ToolChains/Fuchsia.cpp | 4 +++- clang/test/Driver/fuchsia.c | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp index 31c2f3f7e1be4..507cc03b27513 100644 --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp @@ -483,7 +483,8 @@ SanitizerMask Fuchsia::getSupportedSanitizers() const { Res |= SanitizerKind::Leak; Res |= SanitizerKind::Scudo; Res |= SanitizerKind::Thread; - if (getTriple().getArch() == llvm::Triple::x86_64) { + if (getTriple().getArch() == llvm::Triple::x86_64 || + getTriple().getArch() == llvm::Triple::x86) { Res |= SanitizerKind::SafeStack; } return Res; @@ -496,6 +497,7 @@ SanitizerMask Fuchsia::getDefaultSanitizers() const { case llvm::Triple::riscv64: Res |= SanitizerKind::ShadowCallStack; break; + case llvm::Triple::x86: case llvm::Triple::x86_64: Res |= SanitizerKind::SafeStack; break; diff --git a/clang/test/Driver/fuchsia.c b/clang/test/Driver/fuchsia.c index d0fec18e13a20..99e5018117924 100644 --- a/clang/test/Driver/fuchsia.c +++ b/clang/test/Driver/fuchsia.c @@ -130,6 +130,11 @@ // RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ // RUN: -fuse-ld=ld \ // RUN: | FileCheck %s -check-prefix=CHECK-SAFESTACK +// RUN: %clang -### %s --target=x86_64-unknown-fuchsia -m32 \ +// RUN: -fsanitize=safe-stack 2>&1 \ +// RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ +// RUN: -fuse-ld=ld \ +// RUN: | FileCheck %s -check-prefix=CHECK-SAFESTACK // CHECK-SAFESTACK: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]" // CHECK-SAFESTACK: "-fsanitize=safe-stack" // CHECK-SAFESTACK-NOT: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}x86_64-unknown-fuchsia{{/|\\\\}}libclang_rt.safestack.a" From 53cd32ad64f862697d670c127bf55666413ede17 Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Tue, 28 Oct 2025 14:46:22 -0700 Subject: [PATCH 055/539] [lld][WebAssembly] LTO: Use PIC reloc model with 
dynamic imports (#165342) --- lld/test/wasm/lto/relocation-model.ll | 5 +++++ lld/wasm/LTO.cpp | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/lld/test/wasm/lto/relocation-model.ll b/lld/test/wasm/lto/relocation-model.ll index 8fe198d0c64e6..a042615b8fe1c 100644 --- a/lld/test/wasm/lto/relocation-model.ll +++ b/lld/test/wasm/lto/relocation-model.ll @@ -8,6 +8,11 @@ ; RUN: wasm-ld %t.o -o %t_static.wasm -save-temps -r -mllvm -relocation-model=static ; RUN: llvm-readobj -r %t_static.wasm.lto.o | FileCheck %s --check-prefix=STATIC +;; Linking with --unresolved-symbols=import-dynamic should also generate PIC +;; code for external references. +; RUN: wasm-ld %t.o -o %t_import.wasm -save-temps --experimental-pic --unresolved-symbols=import-dynamic +; RUN: llvm-readobj -r %t_import.wasm.lto.o | FileCheck %s --check-prefix=PIC + ; PIC: R_WASM_GLOBAL_INDEX_LEB foo ; STATIC: R_WASM_MEMORY_ADDR_LEB foo diff --git a/lld/wasm/LTO.cpp b/lld/wasm/LTO.cpp index ae85f4693214b..668cdf21ea3ed 100644 --- a/lld/wasm/LTO.cpp +++ b/lld/wasm/LTO.cpp @@ -63,6 +63,12 @@ static lto::Config createConfig() { c.RelocModel = std::nullopt; else if (ctx.isPic) c.RelocModel = Reloc::PIC_; + else if (ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic) + // With ImportDynamic we also need to use the PIC relocation model so that + // external symbols are references via the GOT. + // TODO(sbc): This should probably be Reloc::DynamicNoPIC, but the backend + // doesn't currently support that. 
+ c.RelocModel = Reloc::PIC_; else c.RelocModel = Reloc::Static; From 8054ff28db8f503bb0560038c8f4872f7adcc8ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 28 Oct 2025 11:54:13 -1000 Subject: [PATCH 056/539] [flang][cuda] Add instructions for tma_bulk_s2g (#165480) --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 5 +++++ flang/test/Lower/CUDA/cuda-device-proc.cuf | 2 ++ 2 files changed, 7 insertions(+) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 0d225532f2460..6e8b411644758 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -9287,6 +9287,11 @@ void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef args) { mlir::NVVM::NVVMMemorySpace::Global); mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( builder, loc, dst, src, fir::getBase(args[2]), {}, {}); + + mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, + "cp.async.bulk.commit_group", {}); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, + builder.getI32IntegerAttr(0), {}); } // TMA_BULK_WAIT_GROUP (CUDA) diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 99b1a2fc0cbf7..8bf506b0518ed 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -479,6 +479,8 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_bulk_s2g ! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! 
CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(device) subroutine testAtomicCasLoop(aa, n) integer :: a From a05e8b462e6317a66127b04b4adc9b1d9b17f06d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 28 Oct 2025 11:54:42 -1000 Subject: [PATCH 057/539] [flang][cuda] Add interfaces and lowering for tma_bulk_load (#165474) As defined in https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/#load-and-store-functions-using-bulk-tma-operations --- .../flang/Optimizer/Builder/IntrinsicCall.h | 7 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 136 ++++++++++++++++++ flang/module/cudadevice.f90 | 61 ++++++++ flang/test/Lower/CUDA/cuda-device-proc.cuf | 133 +++++++++++++++++ 4 files changed, 337 insertions(+) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index ed0cbd3bdf16b..f5ff6626da654 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -461,6 +461,13 @@ struct IntrinsicLibrary { mlir::Value genTime(mlir::Type, llvm::ArrayRef); void genTMABulkCommitGroup(llvm::ArrayRef); void genTMABulkG2S(llvm::ArrayRef); + void genTMABulkLoadC4(llvm::ArrayRef); + void genTMABulkLoadC8(llvm::ArrayRef); + void genTMABulkLoadI4(llvm::ArrayRef); + void genTMABulkLoadI8(llvm::ArrayRef); + void genTMABulkLoadR2(llvm::ArrayRef); + void genTMABulkLoadR4(llvm::ArrayRef); + void genTMABulkLoadR8(llvm::ArrayRef); void genTMABulkS2G(llvm::ArrayRef); void genTMABulkWaitGroup(llvm::ArrayRef); mlir::Value genTrailz(mlir::Type, llvm::ArrayRef); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 6e8b411644758..65317599ecd35 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ 
b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -1045,6 +1045,55 @@ static constexpr IntrinsicHandler handlers[]{ {"dst", asAddr}, {"nbytes", asValue}}}, /*isElemental=*/false}, + {"tma_bulk_ldc4", + &I::genTMABulkLoadC4, + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldc8", + &I::genTMABulkLoadC8, + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldi4", + &I::genTMABulkLoadI4, + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldi8", + &I::genTMABulkLoadI8, + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr2", + &I::genTMABulkLoadR2, + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr4", + &I::genTMABulkLoadR4, + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr8", + &I::genTMABulkLoadR8, + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, {"tma_bulk_s2g", &I::genTMABulkS2G, {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, @@ -9278,6 +9327,93 @@ void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef args) { builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); } +static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value barrier, mlir::Value src, + mlir::Value dst, mlir::Value nelem, + mlir::Value eleSize) { + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + barrier = builder.createConvert(loc, llvmPtrTy, barrier); + mlir::NVVM::InlinePtxOp::create( + builder, loc, mlir::TypeRange{}, 
{dst, src, size, barrier}, {}, + "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " + "[%1], %2, [%3];", + {}); + mlir::NVVM::InlinePtxOp::create( + builder, loc, mlir::TypeRange{}, {barrier, size}, {}, + "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {}); +} + +// TMA_BULK_LOADC4 +void IntrinsicLibrary::genTMABulkLoadC4( + llvm::ArrayRef args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADC8 +void IntrinsicLibrary::genTMABulkLoadC8( + llvm::ArrayRef args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 16); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADI4 +void IntrinsicLibrary::genTMABulkLoadI4( + llvm::ArrayRef args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADI8 +void IntrinsicLibrary::genTMABulkLoadI8( + llvm::ArrayRef args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR2 +void IntrinsicLibrary::genTMABulkLoadR2( + llvm::ArrayRef args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 2); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), 
eleSize); +} + +// TMA_BULK_LOADR4 +void IntrinsicLibrary::genTMABulkLoadR4( + llvm::ArrayRef args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR8 +void IntrinsicLibrary::genTMABulkLoadR8( + llvm::ArrayRef args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + // TMA_BULK_S2G (CUDA) void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef args) { assert(args.size() == 3); diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index ea54c974c9e7c..e6decbb96c271 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -2067,6 +2067,67 @@ attributes(device) subroutine tma_bulk_s2g(src, dst, nbytes) end subroutine end interface + ! Load specific types, count is in elements + ! 
----------------------------------------- + interface tma_bulk_load + attributes(device) subroutine tma_bulk_ldc4(barrier, src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + integer(8), shared :: barrier + complex(4), device :: src(*) + complex(4), shared :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_ldc8(barrier, src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + integer(8), shared :: barrier + complex(8), device :: src(*) + complex(8), shared :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_ldi4(barrier, src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + integer(8), shared :: barrier + integer(4), device :: src(*) + integer(4), shared :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_ldi8(barrier, src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + integer(8), shared :: barrier + integer(8), device :: src(*) + integer(8), shared :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_ldr2(barrier, src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + integer(8), shared :: barrier + real(2), device :: src(*) + real(2), shared :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_ldr4(barrier, src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + integer(8), shared :: barrier + real(4), device :: src(*) + real(4), shared :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_ldr8(barrier, src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + integer(8), shared :: barrier + real(8), device :: src(*) + real(8), shared :: dst(*) + integer(4), value :: nelems + end subroutine + end interface + + contains attributes(device) subroutine syncthreads() diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 
8bf506b0518ed..71d3d1ef2e2e9 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -516,3 +516,136 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_barrier_try_wait_sleep() ! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %0, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 + +attributes(global) subroutine test_tma_bulk_load_c4(a, n) + integer(8), shared :: barrier1 + integer, value :: n + complex(4), device :: r8(n) + complex(4), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c4 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, uniq_name = "_QFtest_tma_bulk_load_c4Ebarrier1"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, uniq_name = "_QFtest_tma_bulk_load_c4Eelem_count"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref>>, !fir.ref>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_c8(a, n) + integer(8), shared :: barrier1 + integer, value :: n + complex(8), device :: r8(n) + complex(8), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! 
CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c8 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, uniq_name = "_QFtest_tma_bulk_load_c8Ebarrier1"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, uniq_name = "_QFtest_tma_bulk_load_c8Eelem_count"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 16 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref>>, !fir.ref>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_i4(a, n) + integer(8), shared :: barrier1 + integer, value :: n + integer(4), device :: r8(n) + integer(4), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i4 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, uniq_name = "_QFtest_tma_bulk_load_i4Ebarrier1"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, uniq_name = "_QFtest_tma_bulk_load_i4Eelem_count"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr +! 
CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref>, !fir.ref, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_i8(a, n) + integer(8), shared :: barrier1 + integer, value :: n + integer(8), device :: r8(n) + integer(8), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i8 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, uniq_name = "_QFtest_tma_bulk_load_i8Ebarrier1"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, uniq_name = "_QFtest_tma_bulk_load_i8Eelem_count"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref>, !fir.ref, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_r2(a, n) + integer(8), shared :: barrier1 + integer, value :: n + real(2), device :: r8(n) + real(2), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r2 +! 
CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, uniq_name = "_QFtest_tma_bulk_load_r2Ebarrier1"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, uniq_name = "_QFtest_tma_bulk_load_r2Eelem_count"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 2 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref>, !fir.ref, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_r4(a, n) + integer(8), shared :: barrier1 + integer, value :: n + real(4), device :: r8(n) + real(4), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r4 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, uniq_name = "_QFtest_tma_bulk_load_r4Ebarrier1"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, uniq_name = "_QFtest_tma_bulk_load_r4Eelem_count"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr +! 
CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref>, !fir.ref, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_r8(a, n) + integer(8), shared :: barrier1 + integer, value :: n + real(8), device :: r8(n) + real(8), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r8 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, uniq_name = "_QFtest_tma_bulk_load_r8Ebarrier1"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, uniq_name = "_QFtest_tma_bulk_load_r8Eelem_count"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref>, !fir.ref, i32, !llvm.ptr) +! 
CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) From bdfb3a4541c0915710f1f89533da5a44b9c7b6b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 28 Oct 2025 12:10:26 -1000 Subject: [PATCH 058/539] [flang][cuda] Add interfaces and lowering for tma_bulk_store (#165482) As defined in https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/#load-and-store-functions-using-bulk-tma-operations --- .../flang/Optimizer/Builder/IntrinsicCall.h | 7 ++ flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 114 ++++++++++++++++++ flang/module/cudadevice.f90 | 85 +++++++++++-- flang/test/Lower/CUDA/cuda-device-proc.cuf | 92 ++++++++++++++ 4 files changed, 289 insertions(+), 9 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index f5ff6626da654..3407dd01dd504 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -469,6 +469,13 @@ struct IntrinsicLibrary { void genTMABulkLoadR4(llvm::ArrayRef); void genTMABulkLoadR8(llvm::ArrayRef); void genTMABulkS2G(llvm::ArrayRef); + void genTMABulkStoreI4(llvm::ArrayRef); + void genTMABulkStoreI8(llvm::ArrayRef); + void genTMABulkStoreR2(llvm::ArrayRef); + void genTMABulkStoreR4(llvm::ArrayRef); + void genTMABulkStoreR8(llvm::ArrayRef); + void genTMABulkStoreC4(llvm::ArrayRef); + void genTMABulkStoreC8(llvm::ArrayRef); void genTMABulkWaitGroup(llvm::ArrayRef); mlir::Value genTrailz(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genTransfer(mlir::Type, diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 65317599ecd35..53fe9c0d2f6f0 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp 
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -1098,6 +1098,34 @@ static constexpr IntrinsicHandler handlers[]{ &I::genTMABulkS2G, {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, /*isElemental=*/false}, + {"tma_bulk_store_c4", + &I::genTMABulkStoreC4, + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_c8", + &I::genTMABulkStoreC8, + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_i4", + &I::genTMABulkStoreI4, + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_i8", + &I::genTMABulkStoreI8, + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r2", + &I::genTMABulkStoreR2, + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r4", + &I::genTMABulkStoreR4, + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r8", + &I::genTMABulkStoreR8, + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, {"tma_bulk_wait_group", &I::genTMABulkWaitGroup, {{}}, @@ -9430,6 +9458,92 @@ void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef args) { builder.getI32IntegerAttr(0), {}); } +static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value src, mlir::Value dst, mlir::Value count, + mlir::Value eleSize) { + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count); + src = convertPtrToNVVMSpace(builder, loc, src, + mlir::NVVM::NVVMMemorySpace::Shared); + dst = convertPtrToNVVMSpace(builder, loc, dst, + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, + size, {}, {}); + mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, + "cp.async.bulk.commit_group", {}); + 
mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, + builder.getI32IntegerAttr(0), {}); +} + +// TMA_BULK_STORE_C4 (CUDA) +void IntrinsicLibrary::genTMABulkStoreC4( + llvm::ArrayRef args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_C8 (CUDA) +void IntrinsicLibrary::genTMABulkStoreC8( + llvm::ArrayRef args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 16); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_I4 (CUDA) +void IntrinsicLibrary::genTMABulkStoreI4( + llvm::ArrayRef args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_I8 (CUDA) +void IntrinsicLibrary::genTMABulkStoreI8( + llvm::ArrayRef args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R2 (CUDA) +void IntrinsicLibrary::genTMABulkStoreR2( + llvm::ArrayRef args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 2); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R4 (CUDA) +void IntrinsicLibrary::genTMABulkStoreR4( + llvm::ArrayRef args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkStore(builder, loc, 
fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R8 (CUDA) +void IntrinsicLibrary::genTMABulkStoreR8( + llvm::ArrayRef args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + // TMA_BULK_WAIT_GROUP (CUDA) void IntrinsicLibrary::genTMABulkWaitGroup( llvm::ArrayRef args) { diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index e6decbb96c271..59af58ddcd32e 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -2047,7 +2047,13 @@ attributes(device) subroutine tma_bulk_wait_group() end subroutine end interface + ! -------------------- + ! Bulk load functions + ! -------------------- + ! Generic load, count is in bytes + ! ------------------------------- + interface attributes(device) subroutine tma_bulk_g2s(barrier, src, dst, nbytes) !dir$ ignore_tkr src, dst @@ -2058,17 +2064,9 @@ attributes(device) subroutine tma_bulk_g2s(barrier, src, dst, nbytes) end subroutine end interface - interface - attributes(device) subroutine tma_bulk_s2g(src, dst, nbytes) - !dir$ ignore_tkr src, dst - integer(4), shared :: src(*) - integer(4), device :: dst(*) - integer(4), value :: nbytes - end subroutine - end interface - ! Load specific types, count is in elements ! ----------------------------------------- + interface tma_bulk_load attributes(device) subroutine tma_bulk_ldc4(barrier, src, dst, nelems) !dir$ ignore_tkr (r) src, (r) dst @@ -2127,6 +2125,75 @@ attributes(device) subroutine tma_bulk_ldr8(barrier, src, dst, nelems) end subroutine end interface + ! -------------------- + ! Bulk Store functions + ! -------------------- + + ! Generic store, count is in bytes + ! 
-------------------------------- + + interface + attributes(device) subroutine tma_bulk_s2g(src, dst, nbytes) + !dir$ ignore_tkr src, dst + integer(4), shared :: src(*) + integer(4), device :: dst(*) + integer(4), value :: nbytes + end subroutine + end interface + + ! Load specific types, count is in elements + ! ----------------------------------------- + + interface tma_bulk_store + attributes(device) subroutine tma_bulk_store_c4(src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + complex(4), shared :: src(*) + complex(4), device :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_store_c8(src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + complex(8), shared :: src(*) + complex(8), device :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_store_i4(src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + integer(4), shared :: src(*) + integer(4), device :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_store_i8(src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + integer(8), shared :: src(*) + integer(8), device :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_store_r2(src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + real(2), shared :: src(*) + real(2), device :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_store_r4(src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + real(4), shared :: src(*) + real(4), device :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_store_r8(src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + real(8), shared :: src(*) + real(8), device :: dst(*) + integer(4), value :: nelems + end subroutine + end interface contains diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf 
index 71d3d1ef2e2e9..8f355217899b3 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -649,3 +649,95 @@ end subroutine ! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr ! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref>, !fir.ref, i32, !llvm.ptr) ! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_store_c4(c, n) + integer, value :: n + complex(4), device :: c(n) + complex(4), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c4 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 + +attributes(global) subroutine test_tma_bulk_store_c8(c, n) + integer, value :: n + complex(8), device :: c(n) + complex(8), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c8 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 + +attributes(global) subroutine test_tma_bulk_store_i4(c, n) + integer, value :: n + integer(4), device :: c(n) + integer(4), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i4 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! 
CHECK: nvvm.cp.async.bulk.wait_group 0 + +attributes(global) subroutine test_tma_bulk_store_i8(c, n) + integer, value :: n + integer(8), device :: c(n) + integer(8), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i8 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 + + +attributes(global) subroutine test_tma_bulk_store_r2(c, n) + integer, value :: n + real(2), device :: c(n) + real(2), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r2 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 + +attributes(global) subroutine test_tma_bulk_store_r4(c, n) + integer, value :: n + real(4), device :: c(n) + real(4), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r4 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 + +attributes(global) subroutine test_tma_bulk_store_r8(c, n) + integer, value :: n + real(8), device :: c(n) + real(8), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r8 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! 
CHECK: nvvm.cp.async.bulk.wait_group 0 From cab741adcb23bc9362d942f4c14aa809a0078fe0 Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Tue, 28 Oct 2025 15:20:36 -0700 Subject: [PATCH 059/539] [Clang][BPF] Add __BPF_FEATURE_GOTOX (#165456) Add a macro __BPF_FEATURE_GOTOX for bpf target for cpu v4. So the developer can easily detect whether insn gotox is supported or not. --- clang/lib/Basic/Targets/BPF.cpp | 1 + clang/test/Preprocessor/bpf-predefined-macros.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/clang/lib/Basic/Targets/BPF.cpp b/clang/lib/Basic/Targets/BPF.cpp index 0411bcca51789..8de1083d758c7 100644 --- a/clang/lib/Basic/Targets/BPF.cpp +++ b/clang/lib/Basic/Targets/BPF.cpp @@ -75,6 +75,7 @@ void BPFTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__BPF_FEATURE_GOTOL"); Builder.defineMacro("__BPF_FEATURE_ST"); Builder.defineMacro("__BPF_FEATURE_LOAD_ACQ_STORE_REL"); + Builder.defineMacro("__BPF_FEATURE_GOTOX"); } } diff --git a/clang/test/Preprocessor/bpf-predefined-macros.c b/clang/test/Preprocessor/bpf-predefined-macros.c index cd8a2ec031925..a9ae8c58c3ba7 100644 --- a/clang/test/Preprocessor/bpf-predefined-macros.c +++ b/clang/test/Preprocessor/bpf-predefined-macros.c @@ -70,6 +70,9 @@ int u; #ifdef __BPF_FEATURE_LOAD_ACQ_STORE_REL int v; #endif +#ifdef __BPF_FEATURE_GOTOX +int w; +#endif // CHECK: int b; // CHECK: int c; @@ -110,6 +113,7 @@ int v; // CPU_V4: int u; // CPU_V4: int v; +// CPU_V4: int w; // CPU_GENERIC: int g; From 7628698fd28c35e8fc006c814abeed8424934873 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Tue, 28 Oct 2025 22:21:43 +0000 Subject: [PATCH 060/539] [MemRef] Fix-forward use-after-scope in #164955 (#165478) https://github.com/llvm/llvm-project/pull/164955 has a use-after-scope (https://lab.llvm.org/buildbot/#/builders/169/builds/16454): ``` ==mlir-opt==3940651==ERROR: AddressSanitizer: stack-use-after-scope on address 0x6e1f6ba5c878 at pc 0x6336b214912a bp 0x7ffe607f1670 sp 0x7ffe607f1668 
READ of size 4 at 0x6e1f6ba5c878 thread T0 #0 0x6336b2149129 in size /home/b/sanitizer-x86_64-linux-fast/build/llvm-project/llvm/include/llvm/ADT/SmallVector.h:80:32 #1 0x6336b2149129 in operator[] /home/b/sanitizer-x86_64-linux-fast/build/llvm-project/llvm/include/llvm/ADT/SmallVector.h:299:5 #2 0x6336b2149129 in populateBoundsForShapedValueDim /home/b/sanitizer-x86_64-linux-fast/build/llvm-project/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp:113:43 ... ``` This patch attempts to fix-forward by stack-allocating reassocIndices, instead of taking a reference to a return value. --- mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp index 3aa801b48a2e9..69afbcadb0b07 100644 --- a/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp @@ -107,7 +107,7 @@ struct CollapseShapeOpInterface assert(value == collapseOp.getResult() && "invalid value"); // Multiply the expressions for the dimensions in the reassociation group. - const ReassociationIndices &reassocIndices = + const ReassociationIndices reassocIndices = collapseOp.getReassociationIndices()[dim]; AffineExpr productExpr = cstr.getExpr(collapseOp.getSrc(), reassocIndices[0]); From 5585745f66a19fe90260db60fb99ea7096c66174 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 28 Oct 2025 16:14:46 -0700 Subject: [PATCH 061/539] [Analysis] Use std::clamp in getHeatColor (NFC) (#165394) This patch uses std::clamp to simplify manual clamping in getHeatColor. 
--- llvm/lib/Analysis/HeatUtils.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/HeatUtils.cpp b/llvm/lib/Analysis/HeatUtils.cpp index a1cc7071f0e22..08e9428059e7e 100644 --- a/llvm/lib/Analysis/HeatUtils.cpp +++ b/llvm/lib/Analysis/HeatUtils.cpp @@ -64,10 +64,7 @@ std::string llvm::getHeatColor(uint64_t Freq, uint64_t MaxFreq) { } std::string llvm::getHeatColor(double Percent) { - if (Percent > 1.0) - Percent = 1.0; - if (Percent < 0.0) - Percent = 0.0; + Percent = std::clamp(Percent, 0.0, 1.0); unsigned ColorID = unsigned(round(Percent * (HeatSize - 1.0))); return HeatPalette[ColorID]; } From 477f11cebdccb9c8ee1d6b26c8d6686d76fc8e68 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 28 Oct 2025 16:14:53 -0700 Subject: [PATCH 062/539] [Analysis] Use "= default" in a constructor (NFC) (#165395) Note that all of the members are properly initialized a few lines above the constructor. --- llvm/include/llvm/Analysis/IR2Vec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h index 5ad62880a779c..71055dd16a378 100644 --- a/llvm/include/llvm/Analysis/IR2Vec.h +++ b/llvm/include/llvm/Analysis/IR2Vec.h @@ -161,7 +161,7 @@ class VocabStorage { public: /// Default constructor creates empty storage (invalid state) - VocabStorage() : Sections(), TotalSize(0), Dimension(0) {} + VocabStorage() = default; /// Create a VocabStorage with pre-organized section data VocabStorage(std::vector> &&SectionData); From 21247146b565706123e05d23726762f9275a8a31 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 28 Oct 2025 16:15:01 -0700 Subject: [PATCH 063/539] [llvm] Use nullptr instead of 0 or NULL (NFC) (#165396) Identified with modernize-use-nullptr. 
--- llvm/include/llvm/CodeGen/TargetLowering.h | 2 +- llvm/lib/DebugInfo/GSYM/GsymCreator.cpp | 2 +- llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp | 2 +- llvm/lib/SandboxIR/Context.cpp | 2 +- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 2 +- llvm/lib/TargetParser/Host.cpp | 2 +- llvm/lib/Transforms/IPO/ExpandVariadics.cpp | 2 +- llvm/unittests/ProfileData/InstrProfTest.cpp | 4 ++-- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 4058dd728e5d1..1920b98c8a1ef 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3761,7 +3761,7 @@ class LLVM_ABI TargetLoweringBase { /// register class is the largest legal super-reg register class of the /// register class of the specified type. e.g. On x86, i8, i16, and i32's /// representative class would be GR32. - const TargetRegisterClass *RepRegClassForVT[MVT::VALUETYPE_SIZE] = {0}; + const TargetRegisterClass *RepRegClassForVT[MVT::VALUETYPE_SIZE] = {nullptr}; /// This indicates the "cost" of the "representative" register class for each /// ValueType. The cost is used by the scheduler to approximate register diff --git a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp index 93ff3b924db32..d87cb4d2210a6 100644 --- a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp +++ b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp @@ -552,7 +552,7 @@ llvm::Error GsymCreator::saveSegments(StringRef Path, createSegment(SegmentSize, FuncIdx); if (ExpectedGC) { GsymCreator *GC = ExpectedGC->get(); - if (GC == NULL) + if (!GC) break; // We had not more functions to encode. 
// Don't collect any messages at all OutputAggregator Out(nullptr); diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp index 1a61d3188a820..e609a7d3dc08e 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp @@ -55,7 +55,7 @@ struct PerfState { std::unique_ptr Dumpstream; // perf mmap marker - void *MarkerAddr = NULL; + void *MarkerAddr = nullptr; }; // prevent concurrent dumps from messing up the output file diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp index 70ac68abbcb0d..fb6ff6203567a 100644 --- a/llvm/lib/SandboxIR/Context.cpp +++ b/llvm/lib/SandboxIR/Context.cpp @@ -443,7 +443,7 @@ Argument *Context::getOrCreateArgument(llvm::Argument *LLVMArg) { } Constant *Context::getOrCreateConstant(llvm::Constant *LLVMC) { - return cast(getOrCreateValueInternal(LLVMC, 0)); + return cast(getOrCreateValueInternal(LLVMC, nullptr)); } BasicBlock *Context::createBasicBlock(llvm::BasicBlock *LLVMBB) { diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 6181abb281cc6..47022b3f89a8b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -745,7 +745,7 @@ Register SPIRVGlobalRegistry::buildGlobalVariable( .addDef(ResVReg) .addUse(getSPIRVTypeID(BaseType)) .addImm(static_cast(Storage)); - if (Init != 0) + if (Init) MIB.addUse(Init->getOperand(0).getReg()); // ISel may introduce a new register on this step, so we need to add it to // DT and correct its type avoiding fails on the next stage. 
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index c8d193887d92f..0849fc7d55a32 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1179,7 +1179,7 @@ static const char *getAMDProcessorTypeAndSubtype(unsigned Family, const unsigned *Features, unsigned *Type, unsigned *Subtype) { - const char *CPU = 0; + const char *CPU = nullptr; switch (Family) { case 4: diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp index 042578d26818a..6a11aec6c5cb0 100644 --- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp +++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp @@ -380,7 +380,7 @@ bool ExpandVariadics::runOnModule(Module &M) { if (CB->isIndirectCall()) { FunctionType *FTy = CB->getFunctionType(); if (FTy->isVarArg()) - Changed |= expandCall(M, Builder, CB, FTy, 0); + Changed |= expandCall(M, Builder, CB, FTy, /*NF=*/nullptr); } } } diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp index dd17844aef8a6..8641b939dd35d 100644 --- a/llvm/unittests/ProfileData/InstrProfTest.cpp +++ b/llvm/unittests/ProfileData/InstrProfTest.cpp @@ -914,7 +914,7 @@ TEST_P(MaybeSparseInstrProfTest, annotate_vp_data) { ASSERT_THAT(ValueData, SizeIs(0)); // Remove the MD_prof metadata - Inst->setMetadata(LLVMContext::MD_prof, 0); + Inst->setMetadata(LLVMContext::MD_prof, nullptr); // Annotate 5 records this time. annotateValueSite(*M, *Inst, R.get(), IPVK_IndirectCallTarget, 0, 5); ValueData = getValueProfDataFromInst(*Inst, IPVK_IndirectCallTarget, 5, T); @@ -932,7 +932,7 @@ TEST_P(MaybeSparseInstrProfTest, annotate_vp_data) { ASSERT_EQ(2U, ValueData[4].Count); // Remove the MD_prof metadata - Inst->setMetadata(LLVMContext::MD_prof, 0); + Inst->setMetadata(LLVMContext::MD_prof, nullptr); // Annotate with 4 records. 
InstrProfValueData VD0Sorted[] = {{1000, 6}, {2000, 5}, {3000, 4}, {4000, 3}, {5000, 2}, {6000, 1}}; From 6015fcea14c76a123572ab3920a83eae5ce0ddd9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 28 Oct 2025 16:15:08 -0700 Subject: [PATCH 064/539] [llvm] Use StringRef::contains (NFC) (#165397) Identified with readability-container-contains --- llvm/include/llvm/IR/Mangler.h | 3 +-- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 2 +- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/IR/Mangler.h b/llvm/include/llvm/IR/Mangler.h index 232101a8926b7..4d387ba73127d 100644 --- a/llvm/include/llvm/IR/Mangler.h +++ b/llvm/include/llvm/IR/Mangler.h @@ -80,8 +80,7 @@ getArm64ECDemangledFunctionName(StringRef Name); /// Check if an ARM64EC function name is mangled. bool inline isArm64ECMangledFunctionName(StringRef Name) { - return Name[0] == '#' || - (Name[0] == '?' && Name.find("@$$h") != StringRef::npos); + return Name[0] == '#' || (Name[0] == '?' && Name.contains("@$$h")); } } // End llvm namespace diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 4029e143ae2a4..729c077884f3a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -493,7 +493,7 @@ NVPTXTTIImpl::getInstructionCost(const User *U, // predicate ("@"). 
return !AsmInst.empty() && (AsmInst[0] == '@' || isAlpha(AsmInst[0]) || - AsmInst.find(".pragma") != StringRef::npos); + AsmInst.contains(".pragma")); }); return InstCount * TargetTransformInfo::TCC_Basic; } diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 3da720f54e6ab..58109acc92015 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -8973,8 +8973,7 @@ SystemZTargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc, if (const auto *CB = dyn_cast(RHSVal)) { if (CB->isInlineAsm()) { const InlineAsm *IA = cast(CB->getCalledOperand()); - return IA && - IA->getConstraintString().find("{@cc}") != std::string::npos; + return IA && IA->getConstraintString().contains("{@cc}"); } } } From 74989bb0bc1d8cd15973774049b58517d83e3d01 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 28 Oct 2025 16:15:16 -0700 Subject: [PATCH 065/539] [clang] Proofread ThreadSafetyAnalysis.rst (#165398) --- clang/docs/ThreadSafetyAnalysis.rst | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/clang/docs/ThreadSafetyAnalysis.rst b/clang/docs/ThreadSafetyAnalysis.rst index 853a8fae4a907..d0f96f58dac17 100644 --- a/clang/docs/ThreadSafetyAnalysis.rst +++ b/clang/docs/ThreadSafetyAnalysis.rst @@ -118,7 +118,7 @@ require exclusive access, while read operations require only shared access. At any given moment during program execution, a thread holds a specific set of capabilities (e.g. the set of mutexes that it has locked.) These act like keys or tokens that allow the thread to access a given resource. Just like physical -security keys, a thread cannot make copy of a capability, nor can it destroy +security keys, a thread cannot make a copy of a capability, nor can it destroy one. A thread can only release a capability to another thread, or acquire one from another thread. 
The annotations are deliberately agnostic about the exact mechanism used to acquire and release capabilities; it assumes that the @@ -131,7 +131,7 @@ by calculating an approximation of that set, called the *capability environment*. The capability environment is calculated for every program point, and describes the set of capabilities that are statically known to be held, or not held, at that particular point. This environment is a conservative -approximation of the full set of capabilities that will actually held by a +approximation of the full set of capabilities that will actually be held by a thread at run-time. @@ -369,7 +369,7 @@ thread-safe, but too complicated for the analysis to understand. Reasons for void unsafeIncrement() NO_THREAD_SAFETY_ANALYSIS { a++; } }; -Unlike the other attributes, NO_THREAD_SAFETY_ANALYSIS is not part of the +Unlike the other attributes, ``NO_THREAD_SAFETY_ANALYSIS`` is not part of the interface of a function, and should thus be placed on the function definition (in the ``.cc`` or ``.cpp`` file) rather than on the function declaration (in the header). @@ -509,7 +509,7 @@ ASSERT_CAPABILITY(...) and ASSERT_SHARED_CAPABILITY(...) *Previously:* ``ASSERT_EXCLUSIVE_LOCK``, ``ASSERT_SHARED_LOCK`` These are attributes on a function or method which asserts the calling thread -already holds the given capability, for example by performing a run-time test +already holds the given capability, for example, by performing a run-time test and terminating if the capability is not held. Presence of this annotation causes the analysis to assume the capability is held after calls to the annotated function. See :ref:`mutexheader`, below, for example uses. @@ -554,19 +554,19 @@ Negative Capabilities ===================== Thread Safety Analysis is designed to prevent both race conditions and -deadlock. The GUARDED_BY and REQUIRES attributes prevent race conditions, by +deadlock. 
The ``GUARDED_BY`` and ``REQUIRES`` attributes prevent race conditions, by ensuring that a capability is held before reading or writing to guarded data, -and the EXCLUDES attribute prevents deadlock, by making sure that a mutex is +and the ``EXCLUDES`` attribute prevents deadlock, by making sure that a mutex is *not* held. -However, EXCLUDES is an optional attribute, and does not provide the same -safety guarantee as REQUIRES. In particular: +However, ``EXCLUDES`` is an optional attribute, and does not provide the same +safety guarantee as ``REQUIRES``. In particular: * A function which acquires a capability does not have to exclude it. * A function which calls a function that excludes a capability does not - have transitively exclude that capability. + have to transitively exclude that capability. -As a result, EXCLUDES can easily produce false negatives: +As a result, ``EXCLUDES`` can easily produce false negatives: .. code-block:: c++ @@ -594,8 +594,8 @@ As a result, EXCLUDES can easily produce false negatives: }; -Negative requirements are an alternative EXCLUDES that provide -a stronger safety guarantee. A negative requirement uses the REQUIRES +Negative requirements are an alternative to ``EXCLUDES`` that provide +a stronger safety guarantee. A negative requirement uses the ``REQUIRES`` attribute, in conjunction with the ``!`` operator, to indicate that a capability should *not* be held. @@ -642,7 +642,7 @@ Frequently Asked Questions (A) Attributes are part of the formal interface of a function, and should always go in the header, where they are visible to anything that includes -the header. Attributes in the .cpp file are not visible outside of the +the header. Attributes in the ``.cpp`` file are not visible outside of the immediate translation unit, which leads to false negatives and false positives. 
@@ -684,7 +684,7 @@ Private Mutexes --------------- Good software engineering practice dictates that mutexes should be private -members, because the locking mechanism used by a thread-safe class is part of +members because the locking mechanism used by a thread-safe class is part of its internal implementation. However, private mutexes can sometimes leak into the public interface of a class. Thread safety attributes follow normal C++ access restrictions, so if ``mu`` From b6ddaee44bf710c5b3bbdca79ee3dfeffa49f93b Mon Sep 17 00:00:00 2001 From: Charles Zablit Date: Tue, 28 Oct 2025 16:26:04 -0700 Subject: [PATCH 066/539] [docs][lldb] update the Windows tools instructions (#164491) This patch updates the instructions explaining how to install the required tools for building `lldb` on Windows, mostly removing the duplication and removing references to `GnuWin32` which still has executables built in 2009 and is no longer functional per their website. --- lldb/docs/resources/build.rst | 66 +++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst index 0db8c92ad49d6..2eb167709dbda 100644 --- a/lldb/docs/resources/build.rst +++ b/lldb/docs/resources/build.rst @@ -95,37 +95,31 @@ commands below. Windows ******* -* Visual Studio 2019. -* The latest Windows SDK. -* The Active Template Library (ATL). -* `GnuWin32 `_ for CoreUtils and Make. -* `Python 3 `_. Make sure to (1) get - the x64 variant if that's what you're targeting and (2) install the debug - library if you want to build a debug lldb. The standalone installer is the - easiest way to get the debug library. -* `Python Tools for Visual Studio - `_. If you plan to debug test failures - or even write new tests at all, PTVS is an indispensable debugging - extension to VS that enables full editing and debugging support for Python - (including mixed native/managed debugging). 
-* `SWIG for Windows `_ - -The steps outlined here describes how to set up your system and install the -required dependencies such that they can be found when needed during the build -process. They only need to be performed once. - -#. Install Visual Studio with the "Desktop Development with C++" workload and - the "Python Development" workload. -#. Install GnuWin32, making sure ``\bin`` is added to - your PATH environment variable. Verify that utilities like ``dirname`` and - ``make`` are available from your terminal. -#. Install SWIG for Windows, making sure ```` is added to - your PATH environment variable. Verify that ``swig`` is available from your - terminal. -#. Install Python 3 from the standalone installer and include the debug libraries - in the install, making sure the Python install path is added to your PATH - environment variable. -#. Register the Debug Interface Access DLLs with the Registry from a privileged +The steps outlined here describe how to set up your system and install the +required dependencies for building and testing LLDB on Windows. They only need +to be performed once. + +Build Requirements +^^^^^^^^^^^^^^^^^^ + +Please follow the steps below if you only want to **build** lldb. + +1. Install `Visual Studio ` with the + "Desktop Development with C++" workload. Make sure that the latest Windows + SDK and the Active Template Library (ATL) are installed. +2. Install `Git Bash `_ and add + ``\usr\bin`` to your ``PATH``. Verify that utilities like + ``dirname`` are available from your terminal. +3. Install `make `_ and + verify that it's in your ``PATH``. +4. Install `Python 3 `_ from the + GUI installer. If you will be building LLDB in Debug mode, **include the + debug libraries** during the install. Make sure ``python`` is added to your + ``PATH``. +5. Install `SWIG for Windows `_. Make sure + ``swig`` is added to your ``PATH`` and that ``swig -swiglib`` points to the + correct directory. +6. 
Register the Debug Interface Access DLLs with the Registry from a privileged terminal. :: @@ -139,6 +133,16 @@ Prompt for VS `_, + an indispensable debugging extension to Visual Studio which enables full + editing and debugging support for Python (including mixed native/managed + debugging). + macOS ***** From 0fd270a1a08a3db42b05b9905a7749beed9af6b4 Mon Sep 17 00:00:00 2001 From: Slava Gurevich Date: Tue, 28 Oct 2025 17:01:01 -0700 Subject: [PATCH 067/539] [MLIR] Fix use-after-move in debug logging (#165208) 1. In `Transforms.cpp` the debug macro is accessing a SmallVector variable that has been moved-from and reset. Fixed by reordering code for the move-from to happen last. 2. `IterationGraphSorter` Refine the previous use-after-move fix for style/readability by renaming the private constructor args to resolve naming ambiguity with the class members. Testing: `ninja check-mlir` --- mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp | 3 ++- .../Transforms/Utils/IterationGraphSorter.cpp | 15 ++++++++------- .../Transforms/Utils/IterationGraphSorter.h | 6 +++--- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index eb2d825e17e44..bd25e946908b6 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -495,13 +495,14 @@ FailureOr linalg::pack(RewriterBase &rewriter, if (failed(maybePackedDimForEachOperand)) return failure(); packedOperandsDims.packedDimForEachOperand = *maybePackedDimForEachOperand; - listOfPackedOperandsDim.pushBack(std::move(packedOperandsDims)); LDBG() << "++++ After pack size #" << i << ": " << packedSizes[i]; LDBG() << "maps: " << llvm::interleaved(indexingMaps); LDBG() << "iterators: " << llvm::interleaved(iteratorTypes); LDBG() << "packedDimForEachOperand: " << llvm::interleaved(packedOperandsDims.packedDimForEachOperand); + + 
listOfPackedOperandsDim.pushBack(std::move(packedOperandsDims)); } // Step 2. Propagate packing to all LinalgOp operands. diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp index f53d2727c9b00..ffa8b402e0b6b 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp @@ -152,19 +152,20 @@ IterationGraphSorter IterationGraphSorter::fromGenericOp( } IterationGraphSorter::IterationGraphSorter( - SmallVector &&ins, SmallVector &&loop2InsLvl, Value out, - AffineMap loop2OutLvl, SmallVector &&iterTypes, + SmallVector &&insArg, SmallVector &&loop2InsLvlArg, + Value out, AffineMap loop2OutLvl, + SmallVector &&iterTypesArg, sparse_tensor::LoopOrderingStrategy strategy) - : ins(std::move(ins)), loop2InsLvl(std::move(loop2InsLvl)), out(out), - loop2OutLvl(loop2OutLvl), iterTypes(std::move(iterTypes)), + : ins(std::move(insArg)), loop2InsLvl(std::move(loop2InsLvlArg)), out(out), + loop2OutLvl(loop2OutLvl), iterTypes(std::move(iterTypesArg)), strategy(strategy) { // One map per tensor. - assert(this->loop2InsLvl.size() == this->ins.size()); + assert(loop2InsLvl.size() == ins.size()); // All the affine maps have the same number of dimensions (loops). assert(llvm::all_equal(llvm::map_range( - this->loop2InsLvl, [](AffineMap m) { return m.getNumDims(); }))); + loop2InsLvl, [](AffineMap m) { return m.getNumDims(); }))); // The number of results of the map should match the rank of the tensor. - assert(llvm::all_of(llvm::zip(this->loop2InsLvl, this->ins), [](auto mvPair) { + assert(llvm::all_of(llvm::zip(loop2InsLvl, ins), [](auto mvPair) { auto [m, v] = mvPair; // For ranked types the rank must match. 
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.h index b2a16e9382758..35e58edeb2562 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.h @@ -59,10 +59,10 @@ class IterationGraphSorter { private: // Private constructor. - IterationGraphSorter(SmallVector &&ins, - SmallVector &&loop2InsLvl, Value out, + IterationGraphSorter(SmallVector &&insArg, + SmallVector &&loop2InsLvlArg, Value out, AffineMap loop2OutLvl, - SmallVector &&iterTypes, + SmallVector &&iterTypesArg, sparse_tensor::LoopOrderingStrategy strategy = sparse_tensor::LoopOrderingStrategy::kDefault); From 0527ef4e6c97acb437569635bf7cae8991c0a898 Mon Sep 17 00:00:00 2001 From: Andres-Salamanca Date: Tue, 28 Oct 2025 19:26:33 -0500 Subject: [PATCH 068/539] [CIR][NFC] Update TypeCache file to use MLIR-style camel case (#165060) This PR updates the file `CIRGenTypeCache` to use MLIR-style camel case naming.The change was inspired by the discussion here: https://github.com/llvm/llvm-project/pull/164180#discussion_r2461444730 --- clang/lib/CIR/CodeGen/CIRGenBuilder.h | 34 ++++++------- clang/lib/CIR/CodeGen/CIRGenClass.cpp | 4 +- clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp | 8 ++-- clang/lib/CIR/CodeGen/CIRGenDecl.cpp | 4 +- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 2 +- clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp | 8 ++-- clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp | 4 +- clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp | 2 +- clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 18 +++---- clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 8 ++-- clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp | 10 ++-- clang/lib/CIR/CodeGen/CIRGenModule.cpp | 48 +++++++++---------- clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp | 6 +-- clang/lib/CIR/CodeGen/CIRGenTypeCache.h | 48 +++++++++---------- clang/lib/CIR/CodeGen/CIRGenTypes.cpp | 32 
++++++------- 15 files changed, 118 insertions(+), 118 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index 50d585dca3b8c..e5066fac19185 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -108,11 +108,11 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { cir::LongDoubleType getLongDoubleTy(const llvm::fltSemantics &format) const { if (&format == &llvm::APFloat::IEEEdouble()) - return cir::LongDoubleType::get(getContext(), typeCache.DoubleTy); + return cir::LongDoubleType::get(getContext(), typeCache.doubleTy); if (&format == &llvm::APFloat::x87DoubleExtended()) - return cir::LongDoubleType::get(getContext(), typeCache.FP80Ty); + return cir::LongDoubleType::get(getContext(), typeCache.fP80Ty); if (&format == &llvm::APFloat::IEEEquad()) - return cir::LongDoubleType::get(getContext(), typeCache.FP128Ty); + return cir::LongDoubleType::get(getContext(), typeCache.fP128Ty); if (&format == &llvm::APFloat::PPCDoubleDouble()) llvm_unreachable("NYI: PPC double-double format for long double"); llvm_unreachable("Unsupported format for long double"); @@ -258,17 +258,17 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { } } - cir::VoidType getVoidTy() { return typeCache.VoidTy; } + cir::VoidType getVoidTy() { return typeCache.voidTy; } - cir::IntType getSInt8Ty() { return typeCache.SInt8Ty; } - cir::IntType getSInt16Ty() { return typeCache.SInt16Ty; } - cir::IntType getSInt32Ty() { return typeCache.SInt32Ty; } - cir::IntType getSInt64Ty() { return typeCache.SInt64Ty; } + cir::IntType getSInt8Ty() { return typeCache.sInt8Ty; } + cir::IntType getSInt16Ty() { return typeCache.sInt16Ty; } + cir::IntType getSInt32Ty() { return typeCache.sInt32Ty; } + cir::IntType getSInt64Ty() { return typeCache.sInt64Ty; } - cir::IntType getUInt8Ty() { return typeCache.UInt8Ty; } - cir::IntType getUInt16Ty() { return typeCache.UInt16Ty; } - cir::IntType getUInt32Ty() { return 
typeCache.UInt32Ty; } - cir::IntType getUInt64Ty() { return typeCache.UInt64Ty; } + cir::IntType getUInt8Ty() { return typeCache.uInt8Ty; } + cir::IntType getUInt16Ty() { return typeCache.uInt16Ty; } + cir::IntType getUInt32Ty() { return typeCache.uInt32Ty; } + cir::IntType getUInt64Ty() { return typeCache.uInt64Ty; } cir::ConstantOp getConstInt(mlir::Location loc, llvm::APSInt intVal); @@ -280,21 +280,21 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { llvm::APFloat fpVal); bool isInt8Ty(mlir::Type i) { - return i == typeCache.UInt8Ty || i == typeCache.SInt8Ty; + return i == typeCache.uInt8Ty || i == typeCache.sInt8Ty; } bool isInt16Ty(mlir::Type i) { - return i == typeCache.UInt16Ty || i == typeCache.SInt16Ty; + return i == typeCache.uInt16Ty || i == typeCache.sInt16Ty; } bool isInt32Ty(mlir::Type i) { - return i == typeCache.UInt32Ty || i == typeCache.SInt32Ty; + return i == typeCache.uInt32Ty || i == typeCache.sInt32Ty; } bool isInt64Ty(mlir::Type i) { - return i == typeCache.UInt64Ty || i == typeCache.SInt64Ty; + return i == typeCache.uInt64Ty || i == typeCache.sInt64Ty; } bool isInt(mlir::Type i) { return mlir::isa(i); } // Fetch the type representing a pointer to unsigned int8 values. - cir::PointerType getUInt8PtrTy() { return typeCache.UInt8PtrTy; } + cir::PointerType getUInt8PtrTy() { return typeCache.uInt8PtrTy; } /// Get a CIR anonymous record type. cir::RecordType getAnonRecordTy(llvm::ArrayRef members, diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp index 5046e0945002f..a8296782ebc40 100644 --- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp @@ -362,7 +362,7 @@ static Address applyNonVirtualAndVirtualOffset( // not bytes. So the pointer must be cast to a byte pointer and back. 
mlir::Value ptr = addr.getPointer(); - mlir::Type charPtrType = cgf.cgm.UInt8PtrTy; + mlir::Type charPtrType = cgf.cgm.uInt8PtrTy; mlir::Value charPtr = cgf.getBuilder().createBitcast(ptr, charPtrType); mlir::Value adjusted = cir::PtrStrideOp::create( cgf.getBuilder(), loc, charPtrType, charPtr, baseOffset); @@ -1105,7 +1105,7 @@ mlir::Value CIRGenFunction::getVTTParameter(GlobalDecl gd, bool forVirtualBase, // We're the complete constructor, so get the VTT by name. cir::GlobalOp vtt = cgm.getVTables().getAddrOfVTT(rd); return builder.createVTTAddrPoint( - loc, builder.getPointerTo(cgm.VoidPtrTy), + loc, builder.getPointerTo(cgm.voidPtrTy), mlir::FlatSymbolRefAttr::get(vtt.getSymNameAttr()), subVTTIndex); } } diff --git a/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp b/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp index 8723a6e502b38..930ae55405756 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp @@ -55,7 +55,7 @@ cir::CallOp CIRGenFunction::emitCoroIDBuiltinCall(mlir::Location loc, if (!builtin) { fnOp = cgm.createCIRBuiltinFunction( loc, cgm.builtinCoroId, - cir::FuncType::get({int32Ty, VoidPtrTy, VoidPtrTy, VoidPtrTy}, int32Ty), + cir::FuncType::get({int32Ty, voidPtrTy, voidPtrTy, voidPtrTy}, int32Ty), /*FD=*/nullptr); assert(fnOp && "should always succeed"); } else { @@ -75,7 +75,7 @@ cir::CallOp CIRGenFunction::emitCoroAllocBuiltinCall(mlir::Location loc) { cir::FuncOp fnOp; if (!builtin) { fnOp = cgm.createCIRBuiltinFunction(loc, cgm.builtinCoroAlloc, - cir::FuncType::get({UInt32Ty}, boolTy), + cir::FuncType::get({uInt32Ty}, boolTy), /*fd=*/nullptr); assert(fnOp && "should always succeed"); } else { @@ -95,7 +95,7 @@ CIRGenFunction::emitCoroBeginBuiltinCall(mlir::Location loc, if (!builtin) { fnOp = cgm.createCIRBuiltinFunction( loc, cgm.builtinCoroBegin, - cir::FuncType::get({UInt32Ty, VoidPtrTy}, VoidPtrTy), + cir::FuncType::get({uInt32Ty, voidPtrTy}, voidPtrTy), /*fd=*/nullptr); assert(fnOp && "should always 
succeed"); } else { @@ -110,7 +110,7 @@ CIRGenFunction::emitCoroBeginBuiltinCall(mlir::Location loc, mlir::LogicalResult CIRGenFunction::emitCoroutineBody(const CoroutineBodyStmt &s) { mlir::Location openCurlyLoc = getLoc(s.getBeginLoc()); - cir::ConstantOp nullPtrCst = builder.getNullPtr(VoidPtrTy, openCurlyLoc); + cir::ConstantOp nullPtrCst = builder.getNullPtr(voidPtrTy, openCurlyLoc); auto fn = mlir::cast(curFn); fn.setCoroutine(true); diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp index 5667273c00daf..aeea0efeb77c3 100644 --- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp @@ -80,13 +80,13 @@ CIRGenFunction::emitAutoVarAlloca(const VarDecl &d, assert(!cir::MissingFeatures::openMP()); if (!didCallStackSave) { // Save the stack. - cir::PointerType defaultTy = AllocaInt8PtrTy; + cir::PointerType defaultTy = allocaInt8PtrTy; CharUnits align = CharUnits::fromQuantity( cgm.getDataLayout().getAlignment(defaultTy, false)); Address stack = createTempAlloca(defaultTy, align, loc, "saved_stack"); mlir::Value v = builder.createStackSave(loc, defaultTy); - assert(v.getType() == AllocaInt8PtrTy); + assert(v.getType() == allocaInt8PtrTy); builder.createStore(loc, v, stack); didCallStackSave = true; diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index df6ee56eac30b..5ccb431e626ae 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -2529,7 +2529,7 @@ CIRGenFunction::emitConditionalBlocks(const AbstractConditionalOperator *e, // If both arms are void, so be it. if (!yieldTy) - yieldTy = VoidTy; + yieldTy = voidTy; // Insert required yields. 
for (mlir::OpBuilder::InsertPoint &toInsert : insertPoints) { diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index 8fe0d9b4a69ef..3d3030ca87e2a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -490,7 +490,7 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy, for (uint64_t i = 0; i != numInitElements; ++i) { // Advance to the next element. if (i > 0) { - one = builder.getConstantInt(loc, cgf.PtrDiffTy, i); + one = builder.getConstantInt(loc, cgf.ptrDiffTy, i); element = builder.createPtrStride(loc, begin, one); } @@ -512,7 +512,7 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy, cgf.getTypes().isZeroInitializable(elementType))) { // Advance to the start of the rest of the array. if (numInitElements) { - one = builder.getConstantInt(loc, cgf.PtrDiffTy, 1); + one = builder.getConstantInt(loc, cgf.ptrDiffTy, 1); element = cir::PtrStrideOp::create(builder, loc, cirElementPtrType, element, one); } @@ -526,7 +526,7 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy, // Compute the end of array cir::ConstantOp numArrayElementsConst = builder.getConstInt( - loc, mlir::cast(cgf.PtrDiffTy), numArrayElements); + loc, mlir::cast(cgf.ptrDiffTy), numArrayElements); mlir::Value end = cir::PtrStrideOp::create(builder, loc, cirElementPtrType, begin, numArrayElementsConst); @@ -563,7 +563,7 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy, // Advance pointer and store them to temporary variable cir::ConstantOp one = builder.getConstInt( - loc, mlir::cast(cgf.PtrDiffTy), 1); + loc, mlir::cast(cgf.ptrDiffTy), 1); auto nextElement = cir::PtrStrideOp::create( builder, loc, cirElementPtrType, currentElement, one); cgf.emitStoreThroughLValue(RValue::get(nextElement), tmpLV); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp 
b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp index 7a35382e79a93..9dd9b6d550763 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp @@ -257,12 +257,12 @@ static mlir::Value emitCXXNewAllocSize(CIRGenFunction &cgf, const CXXNewExpr *e, if (!e->isArray()) { CharUnits typeSize = cgf.getContext().getTypeSizeInChars(type); sizeWithoutCookie = cgf.getBuilder().getConstant( - loc, cir::IntAttr::get(cgf.SizeTy, typeSize.getQuantity())); + loc, cir::IntAttr::get(cgf.sizeTy, typeSize.getQuantity())); return sizeWithoutCookie; } // The width of size_t. - unsigned sizeWidth = cgf.cgm.getDataLayout().getTypeSizeInBits(cgf.SizeTy); + unsigned sizeWidth = cgf.cgm.getDataLayout().getTypeSizeInBits(cgf.sizeTy); // The number of elements can be have an arbitrary integer type; // essentially, we need to multiply it by a constant factor, add a diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp index 928e5aa821bb5..6af87a0159f0a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp @@ -46,7 +46,7 @@ namespace { class ConstExprEmitter; static mlir::TypedAttr computePadding(CIRGenModule &cgm, CharUnits size) { - mlir::Type eltTy = cgm.UCharTy; + mlir::Type eltTy = cgm.uCharTy; clang::CharUnits::QuantityType arSize = size.getQuantity(); CIRGenBuilderTy &bld = cgm.getBuilder(); if (size > CharUnits::One()) { diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index db6878d479366..119314fe27dce 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -762,9 +762,9 @@ class ScalarExprEmitter : public StmtVisitor { // FIXME(cir): For now lets pretend we shouldn't use the conversion // intrinsics and insert a cast here unconditionally. 
src = builder.createCast(cgf.getLoc(loc), cir::CastKind::floating, src, - cgf.FloatTy); + cgf.floatTy); srcType = cgf.getContext().FloatTy; - mlirSrcType = cgf.FloatTy; + mlirSrcType = cgf.floatTy; } } @@ -1738,7 +1738,7 @@ mlir::Value ScalarExprEmitter::emitSub(const BinOpInfo &ops) { // // See more in `EmitSub` in CGExprScalar.cpp. assert(!cir::MissingFeatures::llvmLoweringPtrDiffConsidersPointee()); - return cir::PtrDiffOp::create(builder, cgf.getLoc(ops.loc), cgf.PtrDiffTy, + return cir::PtrDiffOp::create(builder, cgf.getLoc(ops.loc), cgf.ptrDiffTy, ops.lhs, ops.rhs); } @@ -2220,7 +2220,7 @@ mlir::Value ScalarExprEmitter::VisitUnaryExprOrTypeTraitExpr( "sizeof operator for VariableArrayType", e->getStmtClassName()); return builder.getConstant( - loc, cir::IntAttr::get(cgf.cgm.UInt64Ty, + loc, cir::IntAttr::get(cgf.cgm.uInt64Ty, llvm::APSInt(llvm::APInt(64, 1), true))); } } else if (e->getKind() == UETT_OpenMPRequiredSimdAlign) { @@ -2228,12 +2228,12 @@ mlir::Value ScalarExprEmitter::VisitUnaryExprOrTypeTraitExpr( e->getSourceRange(), "sizeof operator for OpenMpRequiredSimdAlign", e->getStmtClassName()); return builder.getConstant( - loc, cir::IntAttr::get(cgf.cgm.UInt64Ty, + loc, cir::IntAttr::get(cgf.cgm.uInt64Ty, llvm::APSInt(llvm::APInt(64, 1), true))); } return builder.getConstant( - loc, cir::IntAttr::get(cgf.cgm.UInt64Ty, + loc, cir::IntAttr::get(cgf.cgm.uInt64Ty, e->EvaluateKnownConstInt(cgf.getContext()))); } @@ -2329,14 +2329,14 @@ mlir::Value ScalarExprEmitter::VisitAbstractConditionalOperator( mlir::Value lhs = Visit(lhsExpr); if (!lhs) { - lhs = builder.getNullValue(cgf.VoidTy, loc); + lhs = builder.getNullValue(cgf.voidTy, loc); lhsIsVoid = true; } mlir::Value rhs = Visit(rhsExpr); if (lhsIsVoid) { assert(!rhs && "lhs and rhs types must match"); - rhs = builder.getNullValue(cgf.VoidTy, loc); + rhs = builder.getNullValue(cgf.voidTy, loc); } return builder.createSelect(loc, condV, lhs, rhs); @@ -2381,7 +2381,7 @@ mlir::Value 
ScalarExprEmitter::VisitAbstractConditionalOperator( if (!insertPoints.empty()) { // If both arms are void, so be it. if (!yieldTy) - yieldTy = cgf.VoidTy; + yieldTy = cgf.voidTy; // Insert required yields. for (mlir::OpBuilder::InsertPoint &toInsert : insertPoints) { diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index 58feb36f78f23..71ff20a3b0e43 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -1008,7 +1008,7 @@ CIRGenFunction::emitArrayLength(const clang::ArrayType *origArrayType, if (isa(arrayType)) { assert(cir::MissingFeatures::vlas()); cgm.errorNYI(*currSrcLoc, "VLAs"); - return builder.getConstInt(*currSrcLoc, SizeTy, 0); + return builder.getConstInt(*currSrcLoc, sizeTy, 0); } uint64_t countFromCLAs = 1; @@ -1037,7 +1037,7 @@ CIRGenFunction::emitArrayLength(const clang::ArrayType *origArrayType, } baseType = eltType; - return builder.getConstInt(*currSrcLoc, SizeTy, countFromCLAs); + return builder.getConstInt(*currSrcLoc, sizeTy, countFromCLAs); } mlir::Value CIRGenFunction::emitAlignmentAssumption( @@ -1074,7 +1074,7 @@ CIRGenFunction::getVLASize(const VariableArrayType *type) { elementType = type->getElementType(); mlir::Value vlaSize = vlaSizeMap[type->getSizeExpr()]; assert(vlaSize && "no size for VLA!"); - assert(vlaSize.getType() == SizeTy); + assert(vlaSize.getType() == sizeTy); if (!numElements) { numElements = vlaSize; @@ -1188,7 +1188,7 @@ void CIRGenFunction::emitVariablyModifiedType(QualType type) { // Always zexting here would be wrong if it weren't // undefined behavior to have a negative bound. // FIXME: What about when size's type is larger than size_t? 
- entry = builder.createIntCast(size, SizeTy); + entry = builder.createIntCast(size, sizeTy); } } type = vat->getElementType(); diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp index 88fedf1acc6a1..f603f5ec4383d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp @@ -1846,13 +1846,13 @@ mlir::Value CIRGenItaniumCXXABI::getVirtualBaseClassOffset( const CXXRecordDecl *classDecl, const CXXRecordDecl *baseClassDecl) { CIRGenBuilderTy &builder = cgf.getBuilder(); mlir::Value vtablePtr = cgf.getVTablePtr(loc, thisAddr, classDecl); - mlir::Value vtableBytePtr = builder.createBitcast(vtablePtr, cgm.UInt8PtrTy); + mlir::Value vtableBytePtr = builder.createBitcast(vtablePtr, cgm.uInt8PtrTy); CharUnits vbaseOffsetOffset = cgm.getItaniumVTableContext().getVirtualBaseOffsetOffset(classDecl, baseClassDecl); mlir::Value offsetVal = builder.getSInt64(vbaseOffsetOffset.getQuantity(), loc); - auto vbaseOffsetPtr = cir::PtrStrideOp::create(builder, loc, cgm.UInt8PtrTy, + auto vbaseOffsetPtr = cir::PtrStrideOp::create(builder, loc, cgm.uInt8PtrTy, vtableBytePtr, offsetVal); mlir::Value vbaseOffset; @@ -1861,9 +1861,9 @@ mlir::Value CIRGenItaniumCXXABI::getVirtualBaseClassOffset( cgm.errorNYI(loc, "getVirtualBaseClassOffset: relative layout"); } else { mlir::Value offsetPtr = builder.createBitcast( - vbaseOffsetPtr, builder.getPointerTo(cgm.PtrDiffTy)); + vbaseOffsetPtr, builder.getPointerTo(cgm.ptrDiffTy)); vbaseOffset = builder.createLoad( - loc, Address(offsetPtr, cgm.PtrDiffTy, cgf.getPointerAlign())); + loc, Address(offsetPtr, cgm.ptrDiffTy, cgf.getPointerAlign())); } return vbaseOffset; } @@ -2244,7 +2244,7 @@ Address CIRGenItaniumCXXABI::initializeArrayCookie(CIRGenFunction &cgf, // Write the number of elements into the appropriate slot. 
Address numElementsPtr = - cookiePtr.withElementType(cgf.getBuilder(), cgf.SizeTy); + cookiePtr.withElementType(cgf.getBuilder(), cgf.sizeTy); cgf.getBuilder().createStore(loc, numElements, numElementsPtr); // Finally, compute a pointer to the actual data buffer by skipping diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 46adfe28e377a..9f9b2db4771df 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -67,28 +67,28 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext, abi(createCXXABI(*this)), genTypes(*this), vtables(*this) { // Initialize cached types - VoidTy = cir::VoidType::get(&getMLIRContext()); - VoidPtrTy = cir::PointerType::get(VoidTy); - SInt8Ty = cir::IntType::get(&getMLIRContext(), 8, /*isSigned=*/true); - SInt16Ty = cir::IntType::get(&getMLIRContext(), 16, /*isSigned=*/true); - SInt32Ty = cir::IntType::get(&getMLIRContext(), 32, /*isSigned=*/true); - SInt64Ty = cir::IntType::get(&getMLIRContext(), 64, /*isSigned=*/true); - SInt128Ty = cir::IntType::get(&getMLIRContext(), 128, /*isSigned=*/true); - UInt8Ty = cir::IntType::get(&getMLIRContext(), 8, /*isSigned=*/false); - UInt8PtrTy = cir::PointerType::get(UInt8Ty); + voidTy = cir::VoidType::get(&getMLIRContext()); + voidPtrTy = cir::PointerType::get(voidTy); + sInt8Ty = cir::IntType::get(&getMLIRContext(), 8, /*isSigned=*/true); + sInt16Ty = cir::IntType::get(&getMLIRContext(), 16, /*isSigned=*/true); + sInt32Ty = cir::IntType::get(&getMLIRContext(), 32, /*isSigned=*/true); + sInt64Ty = cir::IntType::get(&getMLIRContext(), 64, /*isSigned=*/true); + sInt128Ty = cir::IntType::get(&getMLIRContext(), 128, /*isSigned=*/true); + uInt8Ty = cir::IntType::get(&getMLIRContext(), 8, /*isSigned=*/false); + uInt8PtrTy = cir::PointerType::get(uInt8Ty); cirAllocaAddressSpace = getTargetCIRGenInfo().getCIRAllocaAddressSpace(); - UInt16Ty = cir::IntType::get(&getMLIRContext(), 16, /*isSigned=*/false); - UInt32Ty = 
cir::IntType::get(&getMLIRContext(), 32, /*isSigned=*/false); - UInt64Ty = cir::IntType::get(&getMLIRContext(), 64, /*isSigned=*/false); - UInt128Ty = cir::IntType::get(&getMLIRContext(), 128, /*isSigned=*/false); - FP16Ty = cir::FP16Type::get(&getMLIRContext()); - BFloat16Ty = cir::BF16Type::get(&getMLIRContext()); - FloatTy = cir::SingleType::get(&getMLIRContext()); - DoubleTy = cir::DoubleType::get(&getMLIRContext()); - FP80Ty = cir::FP80Type::get(&getMLIRContext()); - FP128Ty = cir::FP128Type::get(&getMLIRContext()); - - AllocaInt8PtrTy = cir::PointerType::get(UInt8Ty, cirAllocaAddressSpace); + uInt16Ty = cir::IntType::get(&getMLIRContext(), 16, /*isSigned=*/false); + uInt32Ty = cir::IntType::get(&getMLIRContext(), 32, /*isSigned=*/false); + uInt64Ty = cir::IntType::get(&getMLIRContext(), 64, /*isSigned=*/false); + uInt128Ty = cir::IntType::get(&getMLIRContext(), 128, /*isSigned=*/false); + fP16Ty = cir::FP16Type::get(&getMLIRContext()); + bFloat16Ty = cir::BF16Type::get(&getMLIRContext()); + floatTy = cir::SingleType::get(&getMLIRContext()); + doubleTy = cir::DoubleType::get(&getMLIRContext()); + fP80Ty = cir::FP80Type::get(&getMLIRContext()); + fP128Ty = cir::FP128Type::get(&getMLIRContext()); + + allocaInt8PtrTy = cir::PointerType::get(uInt8Ty, cirAllocaAddressSpace); PointerAlignInBytes = astContext @@ -97,16 +97,16 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext, .getQuantity(); const unsigned charSize = astContext.getTargetInfo().getCharWidth(); - UCharTy = cir::IntType::get(&getMLIRContext(), charSize, /*isSigned=*/false); + uCharTy = cir::IntType::get(&getMLIRContext(), charSize, /*isSigned=*/false); // TODO(CIR): Should be updated once TypeSizeInfoAttr is upstreamed const unsigned sizeTypeSize = astContext.getTypeSize(astContext.getSignedSizeType()); SizeSizeInBytes = astContext.toCharUnitsFromBits(sizeTypeSize).getQuantity(); // In CIRGenTypeCache, UIntPtrTy and SizeType are fields of the same union - UIntPtrTy = + uIntPtrTy = 
cir::IntType::get(&getMLIRContext(), sizeTypeSize, /*isSigned=*/false); - PtrDiffTy = + ptrDiffTy = cir::IntType::get(&getMLIRContext(), sizeTypeSize, /*isSigned=*/true); std::optional sourceLanguage = getCIRSourceLanguage(); diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp index be063033ddcfc..890f8a6c8339d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp @@ -617,11 +617,11 @@ void OpenACCRecipeBuilderBase::createReductionRecipeCombiner( if (const auto *cat = cgf.getContext().getAsConstantArrayType(origType)) { // If we're in an array, we have to emit the combiner for each element of // the array. - auto itrTy = mlir::cast(cgf.PtrDiffTy); + auto itrTy = mlir::cast(cgf.ptrDiffTy); auto itrPtrTy = cir::PointerType::get(itrTy); mlir::Value zero = - builder.getConstInt(loc, mlir::cast(cgf.PtrDiffTy), 0); + builder.getConstInt(loc, mlir::cast(cgf.ptrDiffTy), 0); mlir::Value itr = cir::AllocaOp::create(builder, loc, itrPtrTy, itrTy, "itr", cgf.cgm.getSize(cgf.getPointerAlign())); @@ -633,7 +633,7 @@ void OpenACCRecipeBuilderBase::createReductionRecipeCombiner( [&](mlir::OpBuilder &b, mlir::Location loc) { auto loadItr = cir::LoadOp::create(builder, loc, {itr}); mlir::Value arraySize = builder.getConstInt( - loc, mlir::cast(cgf.PtrDiffTy), cat->getZExtSize()); + loc, mlir::cast(cgf.ptrDiffTy), cat->getZExtSize()); auto cmp = builder.createCompare(loc, cir::CmpOpKind::lt, loadItr, arraySize); builder.createCondition(cmp); diff --git a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h index ff5842cd86e04..0f63e91f45564 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h +++ b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h @@ -26,47 +26,47 @@ struct CIRGenTypeCache { CIRGenTypeCache() {} // ClangIR void type - cir::VoidType VoidTy; + cir::VoidType voidTy; // ClangIR signed integral types of common sizes - cir::IntType SInt8Ty; - 
cir::IntType SInt16Ty; - cir::IntType SInt32Ty; - cir::IntType SInt64Ty; - cir::IntType SInt128Ty; + cir::IntType sInt8Ty; + cir::IntType sInt16Ty; + cir::IntType sInt32Ty; + cir::IntType sInt64Ty; + cir::IntType sInt128Ty; // ClangIR unsigned integral type of common sizes - cir::IntType UInt8Ty; - cir::IntType UInt16Ty; - cir::IntType UInt32Ty; - cir::IntType UInt64Ty; - cir::IntType UInt128Ty; + cir::IntType uInt8Ty; + cir::IntType uInt16Ty; + cir::IntType uInt32Ty; + cir::IntType uInt64Ty; + cir::IntType uInt128Ty; // ClangIR floating-point types with fixed formats - cir::FP16Type FP16Ty; - cir::BF16Type BFloat16Ty; - cir::SingleType FloatTy; - cir::DoubleType DoubleTy; - cir::FP80Type FP80Ty; - cir::FP128Type FP128Ty; + cir::FP16Type fP16Ty; + cir::BF16Type bFloat16Ty; + cir::SingleType floatTy; + cir::DoubleType doubleTy; + cir::FP80Type fP80Ty; + cir::FP128Type fP128Ty; /// ClangIR char - mlir::Type UCharTy; + mlir::Type uCharTy; /// intptr_t, size_t, and ptrdiff_t, which we assume are the same size. union { - mlir::Type UIntPtrTy; - mlir::Type SizeTy; + mlir::Type uIntPtrTy; + mlir::Type sizeTy; }; - mlir::Type PtrDiffTy; + mlir::Type ptrDiffTy; /// void* in address space 0 - cir::PointerType VoidPtrTy; - cir::PointerType UInt8PtrTy; + cir::PointerType voidPtrTy; + cir::PointerType uInt8PtrTy; /// void* in alloca address space - cir::PointerType AllocaInt8PtrTy; + cir::PointerType allocaInt8PtrTy; /// The size and alignment of a pointer into the generic address space. 
union { diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp index d1b91d0c73c04..03618d4a8a8a6 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp @@ -71,7 +71,7 @@ mlir::Type CIRGenTypes::convertFunctionTypeInternal(QualType qft) { if (!isFuncTypeConvertible(ft)) { cgm.errorNYI(SourceLocation(), "function type involving an incomplete type", qft); - return cir::FuncType::get(SmallVector{}, cgm.VoidTy); + return cir::FuncType::get(SmallVector{}, cgm.voidTy); } const CIRGenFunctionInfo *fi; @@ -298,7 +298,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) { switch (cast(ty)->getKind()) { // void case BuiltinType::Void: - resultType = cgm.VoidTy; + resultType = cgm.voidTy; break; // bool @@ -338,42 +338,42 @@ mlir::Type CIRGenTypes::convertType(QualType type) { // Floating-point types case BuiltinType::Float16: - resultType = cgm.FP16Ty; + resultType = cgm.fP16Ty; break; case BuiltinType::Half: if (astContext.getLangOpts().NativeHalfType || !astContext.getTargetInfo().useFP16ConversionIntrinsics()) { - resultType = cgm.FP16Ty; + resultType = cgm.fP16Ty; } else { cgm.errorNYI(SourceLocation(), "processing of built-in type", type); - resultType = cgm.SInt32Ty; + resultType = cgm.sInt32Ty; } break; case BuiltinType::BFloat16: - resultType = cgm.BFloat16Ty; + resultType = cgm.bFloat16Ty; break; case BuiltinType::Float: assert(&astContext.getFloatTypeSemantics(type) == &llvm::APFloat::IEEEsingle() && "ClangIR NYI: 'float' in a format other than IEEE 32-bit"); - resultType = cgm.FloatTy; + resultType = cgm.floatTy; break; case BuiltinType::Double: assert(&astContext.getFloatTypeSemantics(type) == &llvm::APFloat::IEEEdouble() && "ClangIR NYI: 'double' in a format other than IEEE 64-bit"); - resultType = cgm.DoubleTy; + resultType = cgm.doubleTy; break; case BuiltinType::LongDouble: resultType = builder.getLongDoubleTy(astContext.getFloatTypeSemantics(type)); break; case 
BuiltinType::Float128: - resultType = cgm.FP128Ty; + resultType = cgm.fP128Ty; break; case BuiltinType::Ibm128: cgm.errorNYI(SourceLocation(), "processing of built-in type", type); - resultType = cgm.SInt32Ty; + resultType = cgm.sInt32Ty; break; case BuiltinType::NullPtr: @@ -386,7 +386,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) { default: cgm.errorNYI(SourceLocation(), "processing of built-in type", type); - resultType = cgm.SInt32Ty; + resultType = cgm.sInt32Ty; break; } break; @@ -439,7 +439,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) { // int X[] -> [0 x int], unless the element type is not sized. If it is // unsized (e.g. an incomplete record) just use [0 x i8]. if (!cir::isSized(elemTy)) { - elemTy = cgm.SInt8Ty; + elemTy = cgm.sInt8Ty; } resultType = cir::ArrayType::get(elemTy, 0); @@ -454,7 +454,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) { // i8 just to have a concrete type" if (!cir::isSized(elemTy)) { cgm.errorNYI(SourceLocation(), "arrays of undefined struct type", type); - resultType = cgm.UInt32Ty; + resultType = cgm.uInt32Ty; break; } @@ -477,7 +477,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) { // Return a placeholder 'i32' type. This can be changed later when the // type is defined (see UpdateCompletedType), but is likely to be the // "right" answer. 
- resultType = cgm.UInt32Ty; + resultType = cgm.uInt32Ty; break; } @@ -490,7 +490,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) { const auto *bitIntTy = cast(type); if (bitIntTy->getNumBits() > cir::IntType::maxBitwidth()) { cgm.errorNYI(SourceLocation(), "large _BitInt type", type); - resultType = cgm.SInt32Ty; + resultType = cgm.sInt32Ty; } else { resultType = cir::IntType::get(&getMLIRContext(), bitIntTy->getNumBits(), bitIntTy->isSigned()); @@ -515,7 +515,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) { default: cgm.errorNYI(SourceLocation(), "processing of type", type->getTypeClassName()); - resultType = cgm.SInt32Ty; + resultType = cgm.sInt32Ty; break; } From 9019372e06b88f2a8aee846428b37ad8ef9b4a8b Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 09:21:38 -0700 Subject: [PATCH 069/539] [MLIR] Apply clang-tidy fixes for llvm-qualified-auto in SwapExtractSliceWithProducerPatterns.cpp (NFC) --- .../Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp index 1e3b377ab85c7..549ac7afca8ca 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp @@ -77,7 +77,7 @@ FailureOr tensor::replaceInsertSlicesWithTiledConsumer( dyn_cast(consumerOperands.front()->getOwner()); if (!consumerOp) return failure(); - for (auto opOperand : consumerOperands.drop_front()) { + for (auto *opOperand : consumerOperands.drop_front()) { if (opOperand->getOwner() != consumerOp) { LLVM_DEBUG({ llvm::dbgs() From 4539306fc67fb09c0b9151b62ace9335c43c6916 Mon Sep 17 00:00:00 2001 From: Sirraide Date: Wed, 29 Oct 2025 02:50:55 +0100 Subject: [PATCH 070/539] [Clang] [NFC] Fix trailing whitespace in Parser.h (#165498) 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many editors and IDEs automatically delete trailing whitespace on save, and this particular one has shown up as an unrelated change in several of my patches that I then had to remove later (and I’ve seen it in other people’s patches too); this has wasted too much of my time, so I’m removing it separately. --- clang/include/clang/Parse/Parser.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 0d2316f73fb62..dad8efd0f017f 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -7677,7 +7677,7 @@ class Parser : public CodeCompletionHandler { /// [GNU] asm-clobbers: /// asm-string-literal /// asm-clobbers ',' asm-string-literal - /// \endverbatim + /// \endverbatim /// StmtResult ParseAsmStatement(bool &msAsm); From 9c78533d3d923d86c62c4a7fcc90f5e5c1aa0063 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 28 Oct 2025 19:03:17 -0700 Subject: [PATCH 071/539] [Clang] Freeze padded vectors before storing. (#164821) Currently Clang usually leaves padding bits uninitialized, which means they are undef at the moment. When expanding stores of vector types to include padding, the padding lanes will be poison, hence the padding bits will be poison. This interacts badly with coercion of arguments and return values, where 3 x float vectors will be loaded as i128 integer; poisoning the padding bits will make the whole value poison. Not sure if there's a better way, but I think we have a number of places that currently rely on the padding being undef, not poison. 
PR: https://github.com/llvm/llvm-project/pull/164821 --- clang/lib/CodeGen/CGExpr.cpp | 8 +++- .../CodeGen/AArch64/ext-vector-coercion.c | 42 +++++++++++++++++++ .../test/CodeGenCXX/matrix-vector-bit-int.cpp | 8 ++-- clang/test/CodeGenOpenCL/preserve_vec3.cl | 20 +++++---- 4 files changed, 63 insertions(+), 15 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/ext-vector-coercion.c diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 301d5770cf78f..01f2161f27555 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -2297,9 +2297,13 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *Value, Address Addr, CGM.getABIInfo().getOptimalVectorMemoryType(VecTy, getLangOpts()); if (!ClangVecTy->isPackedVectorBoolType(getContext()) && VecTy != NewVecTy) { - SmallVector Mask(NewVecTy->getNumElements(), -1); + SmallVector Mask(NewVecTy->getNumElements(), + VecTy->getNumElements()); std::iota(Mask.begin(), Mask.begin() + VecTy->getNumElements(), 0); - Value = Builder.CreateShuffleVector(Value, Mask, "extractVec"); + // Use undef instead of poison for the padding lanes, to make sure no + // padding bits are poisoned, which may break coercion. 
+ Value = Builder.CreateShuffleVector(Value, llvm::UndefValue::get(VecTy), + Mask, "extractVec"); SrcTy = NewVecTy; } if (Addr.getElementType() != SrcTy) diff --git a/clang/test/CodeGen/AArch64/ext-vector-coercion.c b/clang/test/CodeGen/AArch64/ext-vector-coercion.c new file mode 100644 index 0000000000000..354980afe06d7 --- /dev/null +++ b/clang/test/CodeGen/AArch64/ext-vector-coercion.c @@ -0,0 +1,42 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -fenable-matrix -triple arm64-apple-macosx %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s + +typedef float float3 __attribute__((ext_vector_type(3))); +struct Vec3 { + union { + struct { + float x; + float y; + float z; + }; + float vec __attribute__((ext_vector_type(3))); + }; +}; + +// CHECK-LABEL: define i128 @add( +// CHECK-SAME: i128 [[A_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_VEC3:%.*]], align 16 +// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_VEC3]], align 16 +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_VEC3]], ptr [[A]], i32 0, i32 0 +// CHECK-NEXT: store i128 [[A_COERCE]], ptr [[COERCE_DIVE]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_VEC3]], ptr [[A]], i32 0, i32 0 +// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr [[TMP0]], align 16 +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <3 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_VEC3]], ptr [[A]], i32 0, i32 0 +// CHECK-NEXT: [[LOADVECN1:%.*]] = load <4 x float>, ptr [[TMP1]], align 16 +// CHECK-NEXT: [[EXTRACTVEC2:%.*]] = shufflevector <4 x float> [[LOADVECN1]], <4 x float> poison, <3 x i32> +// CHECK-NEXT: [[ADD:%.*]] = fadd <3 x float> [[EXTRACTVEC]], [[EXTRACTVEC2]] +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_VEC3]], ptr [[RETVAL]], i32 0, 
i32 0 +// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <3 x float> [[ADD]], <3 x float> undef, <4 x i32> +// CHECK-NEXT: store <4 x float> [[EXTRACTVEC3]], ptr [[TMP2]], align 16 +// CHECK-NEXT: [[COERCE_DIVE4:%.*]] = getelementptr inbounds nuw [[STRUCT_VEC3]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = load i128, ptr [[COERCE_DIVE4]], align 16 +// CHECK-NEXT: ret i128 [[TMP3]] +// +struct Vec3 add(struct Vec3 a) { + struct Vec3 res; + res.vec = a.vec + a.vec; + return res; +} + diff --git a/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp b/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp index 2e7531b334ecb..4be1cb3067c2f 100644 --- a/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp +++ b/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp @@ -19,7 +19,7 @@ using i4x3x3 = _BitInt(4) __attribute__((matrix_type(3, 3))); // CHECK-NEXT: store i32 [[A_COERCE]], ptr [[A]], align 4 // CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i8>, ptr [[A]], align 4 // CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i8> [[LOADVECN]], <4 x i8> poison, <3 x i32> -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A1]], <3 x i8> poison, <4 x i32> +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A1]], <3 x i8> undef, <4 x i32> // CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr [[A_ADDR]], align 4 // CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4 // CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i8> [[LOADVECN2]], <4 x i8> poison, <3 x i32> @@ -38,7 +38,7 @@ i8x3 v1(i8x3 a) { // CHECK-SAME: <3 x i32> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i32>, align 16 -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[A]], <3 x i32> poison, <4 x i32> +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[A]], <3 x i32> undef, <4 x i32> // CHECK-NEXT: store <4 x i32> [[EXTRACTVEC]], ptr [[A_ADDR]], align 16 // CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i32>, 
ptr [[A_ADDR]], align 16 // CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x i32> [[LOADVECN]], <4 x i32> poison, <3 x i32> @@ -57,7 +57,7 @@ i32x3 v2(i32x3 a) { // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i512>, align 256 // CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i512>, ptr [[TMP0]], align 256 // CHECK-NEXT: [[A:%.*]] = shufflevector <4 x i512> [[LOADVECN]], <4 x i512> poison, <3 x i32> -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i512> [[A]], <3 x i512> poison, <4 x i32> +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i512> [[A]], <3 x i512> undef, <4 x i32> // CHECK-NEXT: store <4 x i512> [[EXTRACTVEC]], ptr [[A_ADDR]], align 256 // CHECK-NEXT: [[LOADVECN1:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256 // CHECK-NEXT: [[EXTRACTVEC2:%.*]] = shufflevector <4 x i512> [[LOADVECN1]], <4 x i512> poison, <3 x i32> @@ -80,7 +80,7 @@ i512x3 v3(i512x3 a) { // CHECK-NEXT: store i32 [[A_COERCE]], ptr [[A]], align 4 // CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i4>, ptr [[A]], align 4 // CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i4> [[LOADVECN]], <4 x i4> poison, <3 x i32> -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i4> [[A1]], <3 x i4> poison, <4 x i32> +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i4> [[A1]], <3 x i4> undef, <4 x i32> // CHECK-NEXT: store <4 x i4> [[EXTRACTVEC]], ptr [[A_ADDR]], align 4 // CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i4>, ptr [[A_ADDR]], align 4 // CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i4> [[LOADVECN2]], <4 x i4> poison, <3 x i32> diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl index e76aa81f918cb..0017169b8cf48 100644 --- a/clang/test/CodeGenOpenCL/preserve_vec3.cl +++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl @@ -12,7 +12,7 @@ typedef float float4 __attribute__((ext_vector_type(4))); // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) 
initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META7:![0-9]+]] !kernel_arg_access_qual [[META8:![0-9]+]] !kernel_arg_type [[META9:![0-9]+]] !kernel_arg_base_type [[META10:![0-9]+]] !kernel_arg_type_qual [[META11:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 -// CHECK-NEXT: [[EXTRACTVEC1_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> +// CHECK-NEXT: [[EXTRACTVEC1_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> , <4 x i32> // CHECK-NEXT: store <4 x float> [[EXTRACTVEC1_I]], ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA12:![0-9]+]] // CHECK-NEXT: ret void // @@ -24,7 +24,7 @@ void kernel foo(global float3 *a, global float3 *b) { // CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META11]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA12]] -// CHECK-NEXT: [[EXTRACTVEC_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> +// CHECK-NEXT: [[EXTRACTVEC_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> , <4 x i32> // CHECK-NEXT: store <4 x float> [[EXTRACTVEC_I]], ptr addrspace(1) [[A]], align 16, !tbaa [[CHAR_TBAA12]] // CHECK-NEXT: ret void // @@ -60,7 +60,7 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) { // CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META7]] 
!kernel_arg_access_qual [[META8]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META18:![0-9]+]] !kernel_arg_type_qual [[META11]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[CHAR_TBAA12]] -// CHECK-NEXT: [[EXTRACTVEC_I:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> +// CHECK-NEXT: [[EXTRACTVEC_I:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> , <4 x i32> // CHECK-NEXT: store <4 x i16> [[EXTRACTVEC_I]], ptr addrspace(1) [[A]], align 8, !tbaa [[CHAR_TBAA12]] // CHECK-NEXT: ret void // @@ -71,8 +71,8 @@ void kernel char8_to_short3(global short3 *a, global char8 *b) { // CHECK-LABEL: define dso_local spir_func void @from_char3( // CHECK-SAME: <3 x i8> noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A]], <3 x i8> poison, <4 x i32> -// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[INT_TBAA3:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i8> [[A]], <3 x i8> poison, <4 x i32> +// CHECK-NEXT: store <4 x i8> [[TMP0]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[INT_TBAA3:![0-9]+]] // CHECK-NEXT: ret void // void from_char3(char3 a, global int *out) { @@ -82,8 +82,8 @@ void from_char3(char3 a, global int *out) { // CHECK-LABEL: define dso_local spir_func void @from_short3( // CHECK-SAME: <3 x i16> noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[A]], <3 x i16> poison, <4 x i32> -// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[LONG_TBAA19:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[A]], <3 x 
i16> poison, <4 x i32> +// CHECK-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[LONG_TBAA19:![0-9]+]] // CHECK-NEXT: ret void // void from_short3(short3 a, global long *out) { @@ -94,7 +94,8 @@ void from_short3(short3 a, global long *out) { // CHECK-SAME: i32 noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[A]] to <4 x i8> -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> +// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <3 x i32> +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[ASTYPE]], <3 x i8> , <4 x i32> // CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[CHAR_TBAA12]] // CHECK-NEXT: ret void // @@ -106,7 +107,8 @@ void scalar_to_char3(int a, global char3 *out) { // CHECK-SAME: i64 noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16> -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <3 x i32> +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[ASTYPE]], <3 x i16> , <4 x i32> // CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA12]] // CHECK-NEXT: ret void // From 6891df4edd4d9445d94427a503271e8058101fd0 Mon Sep 17 00:00:00 2001 From: Harrison Hao <57025411+harrisonGPU@users.noreply.github.com> Date: Wed, 29 Oct 2025 10:42:15 +0800 Subject: [PATCH 072/539] [AMDGPU] Support image atomic no return instructions (#150742) Add support for no-return variants of image atomic operations (e.g. 
IMAGE_ATOMIC_ADD_NORTN, IMAGE_ATOMIC_CMPSWAP_NORTN). These variants are generated when the return value of the intrinsic is unused, allowing the backend to select no return type instructions. --- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h | 1 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 22 +- llvm/lib/Target/AMDGPU/MIMGInstructions.td | 185 +++--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 37 +- .../llvm.amdgcn.image.atomic.dim.ll | 24 +- .../llvm.amdgcn.image.atomic.dim.mir | 36 +- .../llvm.amdgcn.image.atomic.dim.gfx90a.ll | 22 +- .../AMDGPU/llvm.amdgcn.image.atomic.noret.ll | 581 ++++++++++++++++++ .../AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll | 30 +- llvm/test/MC/AMDGPU/buffer-op-swz-operand.s | 8 +- 10 files changed, 786 insertions(+), 160 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 0eb00cbc2f466..529da8d28a3c1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -50,6 +50,7 @@ const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr); struct ImageDimIntrinsicInfo { unsigned Intr; unsigned BaseOpcode; + unsigned AtomicNoRetBaseOpcode; MIMGDim Dim; uint8_t NumOffsetArgs; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 97c2c9c5316b3..9ce12243016f4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2006,19 +2006,27 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); + unsigned IntrOpcode = Intr->BaseOpcode; + + // For image atomic: use no-return opcode if result is unused. 
+ if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) { + Register ResultDef = MI.getOperand(0).getReg(); + if (MRI->use_nodbg_empty(ResultDef)) + IntrOpcode = Intr->AtomicNoRetBaseOpcode; + } const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - unsigned IntrOpcode = Intr->BaseOpcode; const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI); const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI); const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; - Register VDataIn, VDataOut; + Register VDataIn = AMDGPU::NoRegister; + Register VDataOut = AMDGPU::NoRegister; LLT VDataTy; int NumVDataDwords = -1; bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 || @@ -2049,7 +2057,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( unsigned DMaskLanes = 0; if (BaseOpcode->Atomic) { - VDataOut = MI.getOperand(0).getReg(); + if (!BaseOpcode->NoReturn) + VDataOut = MI.getOperand(0).getReg(); VDataIn = MI.getOperand(2).getReg(); LLT Ty = MRI->getType(VDataIn); @@ -2099,8 +2108,9 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(); - if (BaseOpcode->Atomic) - CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + // Keep GLC only when the atomic's result is actually used. + if (BaseOpcode->Atomic && !BaseOpcode->NoReturn) + CPol |= AMDGPU::CPol::GLC; if (CPol & ~((IsGFX12Plus ? 
AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | AMDGPU::CPol::VOLATILE)) return false; diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 5f6d742d245ec..d95013123aced 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -877,69 +877,69 @@ multiclass MIMG_Store { } class MIMG_Atomic_gfx6789_base op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, string dns=""> - : MIMG_gfx6789 { - let Constraints = "$vdst = $vdata"; - + RegisterClass addr_rc, bit noRtn, string dns=""> + : MIMG_gfx6789 { + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; } class MIMG_Atomic_gfx90a_base op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, string dns=""> - : MIMG_gfx90a .ret:$vdst), dns> { - let Constraints = "$vdst = $vdata"; - + RegisterClass addr_rc, bit noRtn, string dns=""> + : MIMG_gfx90a .ret:$vdst)), dns> { + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins getAlign2RegOp.ret:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; } class MIMG_Atomic_si - : MIMG_Atomic_gfx6789_base + : MIMG_Atomic_gfx6789_base { let AssemblerPredicate = isGFX6GFX7; } class MIMG_Atomic_vi - : MIMG_Atomic_gfx6789_base { + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base { let AssemblerPredicate = 
isGFX8GFX9NotGFX90A; let MIMGEncoding = MIMGEncGfx8; } class MIMG_Atomic_gfx90a - : MIMG_Atomic_gfx90a_base { + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx90a_base { let AssemblerPredicate = isGFX90APlus; let MIMGEncoding = MIMGEncGfx90a; } class MIMG_Atomic_gfx10 - : MIMG_gfx10 + : MIMG_gfx10 { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx10 - : MIMG_nsa_gfx10 + : MIMG_nsa_gfx10 { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -950,24 +950,24 @@ class MIMG_Atomic_nsa_gfx10 - : MIMG_gfx11 + : MIMG_gfx11 { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx11 - : MIMG_nsa_gfx11 + : MIMG_nsa_gfx11 { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -977,11 +977,11 @@ class 
MIMG_Atomic_nsa_gfx11 - : VIMAGE_gfx12 + : VIMAGE_gfx12 { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim, @@ -994,95 +994,96 @@ multiclass MIMG_Atomic_Addr_Helper_m { let hasSideEffects = 1, // FIXME: remove this mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1, - FPAtomic = isFP in { + FPAtomic = isFP, IsAtomicNoRet = noRtn in { let VAddrDwords = 1 in { let ssamp = 0 in { if op.HAS_SI then { - def _V1_si : MIMG_Atomic_si ; + def _V1_si : MIMG_Atomic_si ; } if op.HAS_VI then { - def _V1_vi : MIMG_Atomic_vi ; + def _V1_vi : MIMG_Atomic_vi ; let hasPostISelHook = 1 in - def _V1_gfx90a : MIMG_Atomic_gfx90a ; + def _V1_gfx90a : MIMG_Atomic_gfx90a ; } if op.HAS_GFX10M then { - def _V1_gfx10 : MIMG_Atomic_gfx10 ; + def _V1_gfx10 : MIMG_Atomic_gfx10 ; } if op.HAS_GFX11 then { - def _V1_gfx11 : MIMG_Atomic_gfx11 ; + def _V1_gfx11 : MIMG_Atomic_gfx11 ; } } if op.HAS_GFX12 then { - def _V1_gfx12 : VIMAGE_Atomic_gfx12 ; + def _V1_gfx12 : VIMAGE_Atomic_gfx12 ; } } let VAddrDwords = 2 in { let ssamp = 0 in { if op.HAS_SI then { - def _V2_si : MIMG_Atomic_si ; + def _V2_si : MIMG_Atomic_si ; } if op.HAS_VI then { - def _V2_vi : MIMG_Atomic_vi ; - def _V2_gfx90a : MIMG_Atomic_gfx90a ; + def _V2_vi : MIMG_Atomic_vi ; + def _V2_gfx90a : MIMG_Atomic_gfx90a ; } if op.HAS_GFX10M then { - def _V2_gfx10 : MIMG_Atomic_gfx10 ; - def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 ; + def _V2_gfx10 : MIMG_Atomic_gfx10 ; + def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 ; } if op.HAS_GFX11 then { - def _V2_gfx11 : MIMG_Atomic_gfx11 ; - def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 ; + def _V2_gfx11 : MIMG_Atomic_gfx11 ; + def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 ; } } if op.HAS_GFX12 then { - def _V2_gfx12 : VIMAGE_Atomic_gfx12 ; + def _V2_gfx12 : VIMAGE_Atomic_gfx12 ; } } let VAddrDwords = 3 in { let ssamp = 0 in { 
if op.HAS_SI then { - def _V3_si : MIMG_Atomic_si ; + def _V3_si : MIMG_Atomic_si ; } if op.HAS_VI then { - def _V3_vi : MIMG_Atomic_vi ; - def _V3_gfx90a : MIMG_Atomic_gfx90a ; + def _V3_vi : MIMG_Atomic_vi ; + def _V3_gfx90a : MIMG_Atomic_gfx90a ; } if op.HAS_GFX10M then { - def _V3_gfx10 : MIMG_Atomic_gfx10 ; - def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 ; + def _V3_gfx10 : MIMG_Atomic_gfx10 ; + def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 ; } if op.HAS_GFX11 then { - def _V3_gfx11 : MIMG_Atomic_gfx11 ; - def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 ; + def _V3_gfx11 : MIMG_Atomic_gfx11 ; + def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 ; } } if op.HAS_GFX12 then { - def _V3_gfx12 : VIMAGE_Atomic_gfx12 ; + def _V3_gfx12 : VIMAGE_Atomic_gfx12 ; } } let VAddrDwords = 4 in { let ssamp = 0 in { if op.HAS_SI then { - def _V4_si : MIMG_Atomic_si ; + def _V4_si : MIMG_Atomic_si ; } if op.HAS_VI then { - def _V4_vi : MIMG_Atomic_vi ; - def _V4_gfx90a : MIMG_Atomic_gfx90a ; + def _V4_vi : MIMG_Atomic_vi ; + def _V4_gfx90a : MIMG_Atomic_gfx90a ; } if op.HAS_GFX10M then { - def _V4_gfx10 : MIMG_Atomic_gfx10 ; - def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 ; + def _V4_gfx10 : MIMG_Atomic_gfx10 ; + def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 ; } if op.HAS_GFX11 then { - def _V4_gfx11 : MIMG_Atomic_gfx11 ; - def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 ; + def _V4_gfx11 : MIMG_Atomic_gfx11 ; + def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 ; } } if op.HAS_GFX12 then { - def _V4_gfx12 : VIMAGE_Atomic_gfx12 ; + def _V4_gfx12 : VIMAGE_Atomic_gfx12 ; } } } @@ -1095,12 +1096,13 @@ multiclass MIMG_Atomic_Addr_Helper_m { // 64-bit atomics - let IsAtomicRet = 1 in { +multiclass MIMG_Atomic_Base { // 64-bit atomics + let IsAtomicRet = !not(noRtn) in { def "" : MIMGBaseOpcode { let Atomic = 1; let AtomicX2 = isCmpSwap; + let NoReturn = noRtn; } let BaseOpcode = !cast(NAME) in { @@ -1109,22 +1111,28 @@ multiclass MIMG_Atomic ; + defm _V1 : MIMG_Atomic_Addr_Helper_m ; } let VDataDwords = 2 in - defm _V2 : 
MIMG_Atomic_Addr_Helper_m ; + defm _V2 : MIMG_Atomic_Addr_Helper_m ; let VDataDwords = 3 in - defm _V3 : MIMG_Atomic_Addr_Helper_m ; + defm _V3 : MIMG_Atomic_Addr_Helper_m ; if isCmpSwap then { let VDataDwords = 4 in - defm _V4 : MIMG_Atomic_Addr_Helper_m ; + defm _V4 : MIMG_Atomic_Addr_Helper_m ; let VDataDwords = 5 in - defm _V5 : MIMG_Atomic_Addr_Helper_m ; + defm _V5 : MIMG_Atomic_Addr_Helper_m ; } } - } // End IsAtomicRet = 1 + } +} + +multiclass MIMG_Atomic { + defm "" : MIMG_Atomic_Base ; + defm "_NORTN" : MIMG_Atomic_Base ; } multiclass MIMG_Atomic_Renamed { Intrinsic Intr = I; MIMGBaseOpcode BaseOpcode = !cast(!strconcat("IMAGE_", I.P.OpMod)); + MIMGBaseOpcode AtomicNoRetBaseOpcode = BaseOpcode; AMDGPUDimProps Dim = I.P.Dim; AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval; @@ -1855,13 +1864,20 @@ class ImageDimIntrinsicInfo { bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0)); } +class ImageDimAtomicIntrinsicInfo + : ImageDimIntrinsicInfo { + MIMGBaseOpcode AtomicNoRetBaseOpcode = + !cast(!strconcat("IMAGE_", I.P.OpMod, "_NORTN")); +} + def ImageDimIntrinsicTable : GenericTable { let FilterClass = "ImageDimIntrinsicInfo"; - let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs", - "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd", - "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", + let Fields = ["Intr", "BaseOpcode", "AtomicNoRetBaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", + "NumVAddrs", "NumArgs", "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", + "VAddrEnd", "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", "BiasTyArg", 
"GradientTyArg", "CoordTyArg"]; string TypeOf_BaseOpcode = "MIMGBaseOpcode"; + string TypeOf_AtomicNoRetBaseOpcode = "MIMGBaseOpcode"; string TypeOf_Dim = "MIMGDim"; let PrimaryKey = ["Intr"]; @@ -1874,11 +1890,14 @@ def getImageDimIntrinsicByBaseOpcode : SearchIndex { let Key = ["BaseOpcode", "Dim"]; } -foreach intr = !listconcat(AMDGPUImageDimIntrinsics, - AMDGPUImageDimAtomicIntrinsics) in { +foreach intr = AMDGPUImageDimIntrinsics in { def : ImageDimIntrinsicInfo; } +foreach intr = AMDGPUImageDimAtomicIntrinsics in { + def : ImageDimAtomicIntrinsicInfo; +} + // L to LZ Optimization Mapping def : MIMGLZMapping; def : MIMGLZMapping; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index be4229155c983..b34ab2a7e08e5 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9134,16 +9134,23 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SDLoc DL(Op); MachineFunction &MF = DAG.getMachineFunction(); const GCNSubtarget *ST = &MF.getSubtarget(); + unsigned IntrOpcode = Intr->BaseOpcode; + // For image atomic: use no-return opcode if result is unused. 
+ if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode && + !Op.getNode()->hasAnyUseOfValue(0)) + IntrOpcode = Intr->AtomicNoRetBaseOpcode; const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - unsigned IntrOpcode = Intr->BaseOpcode; bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); SmallVector ResultTypes(Op->values()); SmallVector OrigResultTypes(Op->values()); + if (BaseOpcode->NoReturn && BaseOpcode->Atomic) + ResultTypes.erase(&ResultTypes[0]); + bool IsD16 = false; bool IsG16 = false; bool IsA16 = false; @@ -9162,8 +9169,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, VData = Op.getOperand(2); IsAtomicPacked16Bit = - (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || - Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); + (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN); bool Is64Bit = VData.getValueSizeInBits() == 64; if (BaseOpcode->AtomicX2) { @@ -9173,7 +9182,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, if (Is64Bit) VData = DAG.getBitcast(MVT::v4i32, VData); - ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + if (!BaseOpcode->NoReturn) + ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + DMask = Is64Bit ? 0xf : 0x3; NumVDataDwords = Is64Bit ? 4 : 2; } else { @@ -9399,8 +9410,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex); - if (BaseOpcode->Atomic) - CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + // Keep GLC only when the atomic's result is actually used. 
+ if (BaseOpcode->Atomic && !BaseOpcode->NoReturn) + CPol |= AMDGPU::CPol::GLC; if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | AMDGPU::CPol::VOLATILE)) return Op; @@ -9512,13 +9524,20 @@ SDValue SITargetLowering::lowerImage(SDValue Op, DAG.setNodeMemRefs(NewNode, {MemRef}); } + if (BaseOpcode->NoReturn) { + if (BaseOpcode->Atomic) + return DAG.getMergeValues( + {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL); + + return SDValue(NewNode, 0); + } + if (BaseOpcode->AtomicX2) { SmallVector Elt; DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); } - if (BaseOpcode->NoReturn) - return SDValue(NewNode, 0); + return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail, Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes, NumVDataDwords, IsAtomicPacked16Bit, DL); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll index 221e2fd4f00f7..09e1fca3f2677 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll @@ -1200,7 +1200,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc +; GFX6-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1213,7 +1213,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX8-NEXT: s_mov_b32 s5, s7 ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 -; GFX8-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc +; GFX8-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm ; GFX8-NEXT: s_endpgm ; ; 
GFX900-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1226,7 +1226,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX900-NEXT: s_mov_b32 s5, s7 ; GFX900-NEXT: s_mov_b32 s6, s8 ; GFX900-NEXT: s_mov_b32 s7, s9 -; GFX900-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc +; GFX900-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm ; GFX900-NEXT: s_endpgm ; ; GFX90A-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1239,7 +1239,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX90A-NEXT: s_mov_b32 s5, s7 ; GFX90A-NEXT: s_mov_b32 s6, s8 ; GFX90A-NEXT: s_mov_b32 s7, s9 -; GFX90A-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm ; GFX90A-NEXT: s_endpgm ; ; GFX10PLUS-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1252,7 +1252,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX10PLUS-NEXT: s_mov_b32 s5, s7 ; GFX10PLUS-NEXT: s_mov_b32 s6, s8 ; GFX10PLUS-NEXT: s_mov_b32 s7, s9 -; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc +; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm ; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1265,7 +1265,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX12-NEXT: s_mov_b32 s5, s7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 -; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN +; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-NEXT: s_endpgm main_body: %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -3194,7 +3194,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> 
inreg %rsrc, i6 ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc +; GFX6-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3207,7 +3207,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX8-NEXT: s_mov_b32 s5, s7 ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 -; GFX8-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc +; GFX8-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3220,7 +3220,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX900-NEXT: s_mov_b32 s5, s7 ; GFX900-NEXT: s_mov_b32 s6, s8 ; GFX900-NEXT: s_mov_b32 s7, s9 -; GFX900-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc +; GFX900-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX900-NEXT: s_endpgm ; ; GFX90A-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3233,7 +3233,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX90A-NEXT: s_mov_b32 s5, s7 ; GFX90A-NEXT: s_mov_b32 s6, s8 ; GFX90A-NEXT: s_mov_b32 s7, s9 -; GFX90A-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc +; GFX90A-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX90A-NEXT: s_endpgm ; ; GFX10PLUS-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3246,7 +3246,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX10PLUS-NEXT: s_mov_b32 s5, s7 ; GFX10PLUS-NEXT: s_mov_b32 s6, s8 ; GFX10PLUS-NEXT: s_mov_b32 s7, s9 -; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc +; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; GFX10PLUS-NEXT: 
s_endpgm ; ; GFX12-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3259,7 +3259,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX12-NEXT: s_mov_b32 s5, s7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 -; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN +; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX12-NEXT: s_endpgm main_body: %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir index 292fa4be1ca1d..4f160b6cb4b1b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir @@ -25,6 +25,7 @@ body: | ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_si]].sub0 ; GFX6-NEXT: $vgpr0 = COPY [[COPY3]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX8-LABEL: name: atomic_cmpswap_i32_1d ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} @@ -35,6 +36,7 @@ body: | ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_vi]].sub0 ; GFX8-NEXT: $vgpr0 = COPY [[COPY3]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX10-LABEL: name: atomic_cmpswap_i32_1d ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -45,6 +47,7 @@ body: | ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10_]].sub0 ; GFX10-NEXT: $vgpr0 = COPY [[COPY3]] ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: atomic_cmpswap_i32_1d ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, 
$vgpr2 ; GFX11-NEXT: {{ $}} @@ -55,6 +58,7 @@ body: | ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11_]].sub0 ; GFX11-NEXT: $vgpr0 = COPY [[COPY3]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX12-LABEL: name: atomic_cmpswap_i32_1d ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX12-NEXT: {{ $}} @@ -89,39 +93,43 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_si:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_si [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX6-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_si [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX6-NEXT: S_ENDPGM 0 + ; ; GFX8-LABEL: name: atomic_cmpswap_i32_1d_no_return ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V1_V1_vi:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX8-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX8-NEXT: S_ENDPGM 0 + ; ; GFX10-LABEL: name: atomic_cmpswap_i32_1d_no_return ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, 
$vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX10-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX10-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: atomic_cmpswap_i32_1d_no_return ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX11-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: atomic_cmpswap_i32_1d_no_return ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX12-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx12_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx12 
[[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX12-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 %1:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 @@ -150,6 +158,7 @@ body: | ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_si]].sub0_sub1 ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1 + ; ; GFX8-LABEL: name: atomic_cmpswap_i64_1d ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX8-NEXT: {{ $}} @@ -160,6 +169,7 @@ body: | ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_vi]].sub0_sub1 ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1 + ; ; GFX10-LABEL: name: atomic_cmpswap_i64_1d ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX10-NEXT: {{ $}} @@ -170,6 +180,7 @@ body: | ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10_]].sub0_sub1 ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]] ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1 + ; ; GFX11-LABEL: name: atomic_cmpswap_i64_1d ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX11-NEXT: {{ $}} @@ -180,6 +191,7 @@ body: | ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11_]].sub0_sub1 ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1 + ; ; GFX12-LABEL: name: atomic_cmpswap_i64_1d ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, 
$vgpr4 ; GFX12-NEXT: {{ $}} @@ -214,39 +226,43 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX6-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_si:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_si [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX6-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_si [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX6-NEXT: S_ENDPGM 0 + ; ; GFX8-LABEL: name: atomic_cmpswap_i64_1d_no_return ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX8-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_vi:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX8-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX8-NEXT: S_ENDPGM 0 + ; ; GFX10-LABEL: name: atomic_cmpswap_i64_1d_no_return ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX10-NEXT: 
[[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX10-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX10-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: atomic_cmpswap_i64_1d_no_return ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX11-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX11-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: atomic_cmpswap_i64_1d_no_return ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX12-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX12-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx12 [[COPY1]], [[COPY2]], 
[[COPY]], 15, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 %1:vgpr(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll index 49607e320bd0a..83f0229aea326 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll @@ -92,8 +92,7 @@ define amdgpu_ps void @atomic_swap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 -; GFX90A-NEXT: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc +; GFX90A-NEXT: image_atomic_swap a0, v0, s[0:7] dmask:0x1 unorm ; GFX90A-NEXT: s_endpgm %data = call i32 asm "; def $0", "=a"() %unused = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -106,8 +105,7 @@ define amdgpu_ps void @atomic_add_2d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s, i ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: image_atomic_add v2, v[0:1], s[0:7] dmask:0x1 unorm glc +; GFX90A-NEXT: image_atomic_add a0, v[0:1], s[0:7] dmask:0x1 unorm ; GFX90A-NEXT: s_endpgm %data = call i32 asm "; def $0", "=a"() %unused = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -123,9 +121,7 @@ define amdgpu_ps void @atomic_cmpswap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 % ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a1 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: 
image_atomic_cmpswap a[0:1], v0, s[0:7] dmask:0x3 unorm ; GFX90A-NEXT: s_endpgm %cmp = call i32 asm "; def $0", "=a"() %swap = call i32 asm "; def $0", "=a"() @@ -139,9 +135,7 @@ define amdgpu_ps void @atomic_swap_1d_i64_agpr_noret(<8 x i32> inreg %rsrc, i32 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: image_atomic_swap v[2:3], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: image_atomic_swap a[0:1], v0, s[0:7] dmask:0x3 unorm ; GFX90A-NEXT: s_endpgm %data = call i64 asm "; def $0", "=a"() %unused = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -154,14 +148,10 @@ define amdgpu_ps void @atomic_cmpswap_1d_64_agpr_noret(<8 x i32> inreg %rsrc, i3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ; def a[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf unorm glc +; GFX90A-NEXT: image_atomic_cmpswap a[0:3], v0, s[0:7] dmask:0xf unorm ; GFX90A-NEXT: s_endpgm %cmp = call i64 asm "; def $0", "=a"() %swap = call i64 asm "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll new file mode 100644 index 0000000000000..6c58a1a30bd4c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll @@ -0,0 +1,581 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS-GISE %s +; RUN: llc -global-isel=0 -mtriple=amdgcn 
-mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISE %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s + +define amdgpu_ps void @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_swap_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_swap_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_swap_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_swap_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_swap_1d_i64(<8 x i32> inreg %rsrc, i64 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_swap_1d_i64: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_swap_1d_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_swap_1d_i64: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_swap_1d_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = 
call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_swap_1d_float(<8 x i32> inreg %rsrc, float %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_swap_1d_float: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_swap_1d_float: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_swap_1d_float: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_swap_1d_float: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.image.atomic.swap.1d.f32.i32(float %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_add_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void 
@atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_sub_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_sub v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_sub_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_sub v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_sub_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_sub_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_sub_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_smin_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_smin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_smin_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_smin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_smin_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_min_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_smin_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_min_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_umin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_umin_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_umin v0, 
v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_umin_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_umin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_umin_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_min_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_min_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_smax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_smax_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_smax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_smax_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_smax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_smax_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_max_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_smax_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_max_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_umax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_umax_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_umax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_umax_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: 
image_atomic_umax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_umax_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_max_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_max_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_and_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_and_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_and_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_and_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_or_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_or_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_or_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_or_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_or 
v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_xor_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_xor_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_xor_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_xor_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_inc_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_inc_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_inc v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_inc_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_inc v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_inc_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_inc_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_inc_uint v0, v1, s[0:7] dmask:0x1 
dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_dec_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_dec_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_dec v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_dec_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_dec v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_dec_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_dec_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_dec_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %swap, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_cmpswap_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_cmpswap_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_cmpswap_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpswap_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 
%s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_cmpswap_1d_64(<8 x i32> inreg %rsrc, i64 %cmp, i64 %swap, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_cmpswap_1d_64: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_cmpswap_1d_64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_cmpswap_1d_64: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpswap_1d_64: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_2d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t) { +; GFX10PLUS-GISE-LABEL: atomic_add_2d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_2d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_2d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_2d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define 
amdgpu_ps void @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %r) { +; GFX10PLUS-GISE-LABEL: atomic_add_3d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_3d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_3d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_3d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %face) { +; GFX10PLUS-GISE-LABEL: atomic_add_cube: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_cube: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_cube: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_cube: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_1darray(<8 
x i32> inreg %rsrc, i32 %data, i32 %s, i32 %slice) { +; GFX10PLUS-GISE-LABEL: atomic_add_1darray: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_1darray: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_1darray: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_1darray: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice) { +; GFX10PLUS-GISE-LABEL: atomic_add_2darray: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_2darray: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_2darray: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_2darray: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define 
amdgpu_ps void @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %fragid) { +; GFX10PLUS-GISE-LABEL: atomic_add_2dmsaa: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_2dmsaa: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_2dmsaa: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_2dmsaa: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +; GFX10PLUS-GISE-LABEL: atomic_add_2darraymsaa: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_2darraymsaa: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_2darraymsaa: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3, v4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_2darraymsaa: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3, v4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY +; GFX12-NEXT: s_endpgm + %v = call i32 
@llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_1d_slc(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_add_1d_slc: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm slc +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_1d_slc: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm slc +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_1d_slc: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_1d_slc: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll index 3d1d6c87eb98d..0ba62e49cabc3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll @@ -41,15 +41,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_f16_1d_v2_noret(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: 
atomic_pk_add_f16_1d_v2_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -79,15 +77,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_f16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 x half> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v4_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v4_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -126,15 +122,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_bf16_1d_v2_noret(<8 x i32> inreg %rsrc, <2 x bfloat> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v2_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: 
atomic_pk_add_bf16_1d_v2_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -173,15 +167,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 x bfloat> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -192,15 +184,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_nt(<8 x i32> inreg %rsrc, <4 x bfloat> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4_nt: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog 
; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_nt: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/MC/AMDGPU/buffer-op-swz-operand.s b/llvm/test/MC/AMDGPU/buffer-op-swz-operand.s index 8bd91484d149c..4542027b0df90 100644 --- a/llvm/test/MC/AMDGPU/buffer-op-swz-operand.s +++ b/llvm/test/MC/AMDGPU/buffer-op-swz-operand.s @@ -2,7 +2,7 @@ // CHECK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1100" buffer_load_dwordx4 v[0:3], v0, s[0:3], 0, offen offset:4092 slc -// CHECK: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4092 slc ; // CHECK-NEXT: ; // CHECK-NEXT: ; @@ -11,7 +11,7 @@ buffer_load_dwordx4 v[0:3], v0, s[0:3], 0, offen offset:4092 slc // CHECK-NEXT: ; // CHECK-NEXT: ; > buffer_store_dword v0, v1, s[0:3], 0 offen slc -// CHECK: buffer_store_b32 v0, v1, s[0:3], 0 offen slc ; // CHECK-NEXT: ; // CHECK-NEXT: ; @@ -22,7 +22,7 @@ buffer_store_dword v0, v1, s[0:3], 0 offen slc ; tbuffer ops use autogenerate asm parsers tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_SINT] offen offset:4092 slc -// CHECK: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_SINT] offen offset:4092 slc ; // CHECK-NEXT: ; // CHECK-NEXT: ; @@ -32,7 +32,7 @@ tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_SINT] offen // CHECK-NEXT: ; // CHECK-NEXT: ; > tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] offen slc -// CHECK: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] offen slc ; // CHECK-NEXT: ; // CHECK-NEXT: ; From ccc748ebac7e1446b2eb65de0d815f54a7fdcad3 Mon Sep 17 00:00:00 2001 From: Yingwei 
Zheng Date: Wed, 29 Oct 2025 10:52:28 +0800 Subject: [PATCH 073/539] [SimplifyCFG] Use range check in simplifyBranchOnICmpChain if possible (#165105) In `simplifyBranchOnICmpChain`, if we can merge the comparisons into a range check, use a conditional branch instead. This change also breaks the cycle found in https://github.com/llvm/llvm-project/issues/165088. Closes https://github.com/llvm/llvm-project/issues/165088. Detailed description of the cycle: ``` define void @pr165088_cycle_1(i8 %x) { entry: %switch = icmp uge i8 %x, 2 %cond1 = icmp ugt i8 %x, 1 %or.cond = and i1 %switch, %cond1 br i1 %or.cond, label %block3, label %block2 block1: %cond2 = icmp ugt i8 %x, 1 br i1 %cond2, label %block3, label %block2 block2: br label %block3 block3: %cond3 = icmp eq i8 %x, 0 br i1 %cond3, label %exit, label %block1 exit: ret void } ``` `simplifyBranchOnICmpChain` folds the branch in `entry` to a switch. Then we get: ``` entry: switch i8 %x, label %block3 [ i8 1, label %block2 i8 0, label %block2 ] ... ``` `performValueComparisonIntoPredecessorFolding` redirects the default target `block3` into `block1` because `%x` is never zero. ``` entry: switch i8 %x, label %block1 [ i8 1, label %block2 i8 0, label %block2 ] ... ``` Then `turnSwitchRangeIntoICmp` will convert the switch back into a branch on icmp. ``` entry: %switch = icmp ult i8 %x, 2 br i1 %switch, label %block2, label %block1 ... ``` Since `block1` and `block2` share the same successor `block3`, `performBranchToCommonDestFolding` merges the conditions of `entry` and `block1`, resulting in the original pattern again. 
--- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 72 ++++--- llvm/test/Transforms/SimplifyCFG/pr165088.ll | 186 +++++++++++++++++++ 2 files changed, 232 insertions(+), 26 deletions(-) create mode 100644 llvm/test/Transforms/SimplifyCFG/pr165088.ll diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index c537be5cba37c..4fac5d36ddb3f 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -5228,32 +5228,52 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, CompVal, DL.getIntPtrType(CompVal->getType()), "magicptr"); } - // Create the new switch instruction now. - SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size()); - if (HasProfile) { - // We know the weight of the default case. We don't know the weight of the - // other cases, but rather than completely lose profiling info, we split - // the remaining probability equally over them. - SmallVector NewWeights(Values.size() + 1); - NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped if - // TrueWhenEqual. - for (auto &V : drop_begin(NewWeights)) - V = BranchWeights[0] / Values.size(); - setBranchWeights(*New, NewWeights, /*IsExpected=*/false); - } - - // Add all of the 'cases' to the switch instruction. - for (ConstantInt *Val : Values) - New->addCase(Val, EdgeBB); - - // We added edges from PI to the EdgeBB. As such, if there were any - // PHI nodes in EdgeBB, they need entries to be added corresponding to - // the number of edges added. - for (BasicBlock::iterator BBI = EdgeBB->begin(); isa(BBI); ++BBI) { - PHINode *PN = cast(BBI); - Value *InVal = PN->getIncomingValueForBlock(BB); - for (unsigned i = 0, e = Values.size() - 1; i != e; ++i) - PN->addIncoming(InVal, BB); + // Check if we can represent the values as a contiguous range. If so, we use a + // range check + conditional branch instead of a switch. 
+ if (Values.front()->getValue() - Values.back()->getValue() == + Values.size() - 1) { + ConstantRange RangeToCheck = ConstantRange::getNonEmpty( + Values.back()->getValue(), Values.front()->getValue() + 1); + APInt Offset, RHS; + ICmpInst::Predicate Pred; + RangeToCheck.getEquivalentICmp(Pred, RHS, Offset); + Value *X = CompVal; + if (!Offset.isZero()) + X = Builder.CreateAdd(X, ConstantInt::get(CompVal->getType(), Offset)); + Value *Cond = + Builder.CreateICmp(Pred, X, ConstantInt::get(CompVal->getType(), RHS)); + BranchInst *NewBI = Builder.CreateCondBr(Cond, EdgeBB, DefaultBB); + if (HasProfile) + setBranchWeights(*NewBI, BranchWeights, /*IsExpected=*/false); + // We don't need to update PHI nodes since we don't add any new edges. + } else { + // Create the new switch instruction now. + SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size()); + if (HasProfile) { + // We know the weight of the default case. We don't know the weight of the + // other cases, but rather than completely lose profiling info, we split + // the remaining probability equally over them. + SmallVector NewWeights(Values.size() + 1); + NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped + // if TrueWhenEqual. + for (auto &V : drop_begin(NewWeights)) + V = BranchWeights[0] / Values.size(); + setBranchWeights(*New, NewWeights, /*IsExpected=*/false); + } + + // Add all of the 'cases' to the switch instruction. + for (ConstantInt *Val : Values) + New->addCase(Val, EdgeBB); + + // We added edges from PI to the EdgeBB. As such, if there were any + // PHI nodes in EdgeBB, they need entries to be added corresponding to + // the number of edges added. + for (BasicBlock::iterator BBI = EdgeBB->begin(); isa(BBI); ++BBI) { + PHINode *PN = cast(BBI); + Value *InVal = PN->getIncomingValueForBlock(BB); + for (unsigned i = 0, e = Values.size() - 1; i != e; ++i) + PN->addIncoming(InVal, BB); + } } // Erase the old branch instruction. 
diff --git a/llvm/test/Transforms/SimplifyCFG/pr165088.ll b/llvm/test/Transforms/SimplifyCFG/pr165088.ll new file mode 100644 index 0000000000000..4514a1927b586 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/pr165088.ll @@ -0,0 +1,186 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes="simplifycfg" < %s | FileCheck %s + +; Avoid getting stuck in the cycle pr165088_cycle_[1-4]. + +define void @pr165088_cycle_1(i8 %x) { +; CHECK-LABEL: define void @pr165088_cycle_1( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i8 [[X]], 2 +; CHECK-NEXT: br i1 [[TMP0]], label %[[BLOCK2:.*]], label %[[BLOCK3:.*]] +; CHECK: [[BLOCK1:.*]]: +; CHECK-NEXT: [[COND2:%.*]] = icmp ugt i8 [[X]], 1 +; CHECK-NEXT: br i1 [[COND2]], label %[[BLOCK3]], label %[[BLOCK2]] +; CHECK: [[BLOCK2]]: +; CHECK-NEXT: br label %[[BLOCK3]] +; CHECK: [[BLOCK3]]: +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: br i1 [[COND3]], label %[[EXIT:.*]], label %[[BLOCK1]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %switch = icmp uge i8 %x, 2 + %cond1 = icmp ugt i8 %x, 1 + %or.cond = and i1 %switch, %cond1 + br i1 %or.cond, label %block3, label %block2 + +block1: + %cond2 = icmp ugt i8 %x, 1 + br i1 %cond2, label %block3, label %block2 + +block2: + br label %block3 + +block3: + %cond3 = icmp eq i8 %x, 0 + br i1 %cond3, label %exit, label %block1 + +exit: + ret void +} + +define void @pr165088_cycle_2(i8 %x) { +; CHECK-LABEL: define void @pr165088_cycle_2( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SWITCH:%.*]] = icmp ult i8 [[X]], 2 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[BLOCK2:.*]], label %[[BLOCK3:.*]] +; CHECK: [[BLOCK1:.*]]: +; CHECK-NEXT: [[COND2:%.*]] = icmp ugt i8 [[X]], 1 +; CHECK-NEXT: br i1 [[COND2]], label %[[BLOCK3]], label %[[BLOCK2]] +; CHECK: [[BLOCK2]]: +; CHECK-NEXT: br label %[[BLOCK3]] +; CHECK: [[BLOCK3]]: 
+; CHECK-NEXT: [[COND3:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: br i1 [[COND3]], label %[[EXIT:.*]], label %[[BLOCK1]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + switch i8 %x, label %block3 [ + i8 1, label %block2 + i8 0, label %block2 + ] + +block1: ; preds = %block3 + %cond2 = icmp ugt i8 %x, 1 + br i1 %cond2, label %block3, label %block2 + +block2: ; preds = %entry, %entry, %block1 + br label %block3 + +block3: ; preds = %entry, %block2, %block1 + %cond3 = icmp eq i8 %x, 0 + br i1 %cond3, label %exit, label %block1 + +exit: ; preds = %block3 + ret void +} + +define void @pr165088_cycle_3(i8 %x) { +; CHECK-LABEL: define void @pr165088_cycle_3( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[BLOCK3:.*]] +; CHECK: [[BLOCK3]]: +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: br i1 [[COND3]], label %[[EXIT:.*]], label %[[BLOCK3]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + switch i8 %x, label %block1 [ + i8 1, label %block2 + i8 0, label %block2 + ] + +block1: ; preds = %entry, %block3 + %cond2 = icmp ugt i8 %x, 1 + br i1 %cond2, label %block3, label %block2 + +block2: ; preds = %entry, %entry, %block1 + br label %block3 + +block3: ; preds = %block2, %block1 + %cond3 = icmp eq i8 %x, 0 + br i1 %cond3, label %exit, label %block1 + +exit: ; preds = %block3 + ret void +} + +define void @pr165088_cycle_4(i8 %x) { +; CHECK-LABEL: define void @pr165088_cycle_4( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i8 [[X]], 2 +; CHECK-NEXT: br i1 [[TMP0]], label %[[BLOCK2:.*]], label %[[BLOCK3:.*]] +; CHECK: [[BLOCK1:.*]]: +; CHECK-NEXT: [[COND2_OLD:%.*]] = icmp ugt i8 [[X]], 1 +; CHECK-NEXT: br i1 [[COND2_OLD]], label %[[BLOCK3]], label %[[BLOCK2]] +; CHECK: [[BLOCK2]]: +; CHECK-NEXT: br label %[[BLOCK3]] +; CHECK: [[BLOCK3]]: +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: br i1 [[COND3]], label %[[EXIT:.*]], label 
%[[BLOCK1]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %switch = icmp ult i8 %x, 2 + br i1 %switch, label %block2, label %block1 + +block1: ; preds = %entry, %block3 + %cond2 = icmp ugt i8 %x, 1 + br i1 %cond2, label %block3, label %block2 + +block2: ; preds = %entry, %block1 + br label %block3 + +block3: ; preds = %block2, %block1 + %cond3 = icmp eq i8 %x, 0 + br i1 %cond3, label %exit, label %block1 + +exit: ; preds = %block3 + ret void +} + +define void @pr165088_original(i8 %x) { +; CHECK-LABEL: define void @pr165088_original( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i8 [[X]], 2 +; CHECK-NEXT: br i1 [[TMP0]], label %[[BLOCK2:.*]], label %[[BLOCK3:.*]] +; CHECK: [[BLOCK1:.*]]: +; CHECK-NEXT: [[COND3_OLD_OLD:%.*]] = icmp ugt i8 [[X]], 1 +; CHECK-NEXT: br i1 [[COND3_OLD_OLD]], label %[[BLOCK3]], label %[[BLOCK2]] +; CHECK: [[BLOCK2]]: +; CHECK-NEXT: br label %[[BLOCK3]] +; CHECK: [[BLOCK3]]: +; CHECK-NEXT: [[COND4:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: br i1 [[COND4]], label %[[EXIT:.*]], label %[[BLOCK1]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %cond = icmp ne i8 %x, 0 + %cond3 = icmp ne i8 %x, 0 + %or.cond = and i1 %cond, %cond3 + br i1 %or.cond, label %block3, label %block2 + +block1: ; preds = %block3 + %cond3.old = icmp ugt i8 %x, 1 + br i1 %cond3.old, label %block3, label %block2 + +block2: ; preds = %block1, %entry + br label %block3 + +block3: ; preds = %block2, %block1, %entry + %cond4 = icmp eq i8 %x, 0 + br i1 %cond4, label %exit, label %block1 + +exit: ; preds = %block3 + ret void +} From 166807b854395dcb10dbc7f7461af0129c178521 Mon Sep 17 00:00:00 2001 From: owenca Date: Tue, 28 Oct 2025 20:16:24 -0700 Subject: [PATCH 074/539] [clang-format] Fix a bug in annotating class member names (#165351) For declarations like `Type* ::Class::member...`, `Class` was not annotated as `TT_StartOfName`as it should be. 
This prevented `member` from being updated to `TT_FunctionDeclarationName` if `member` was a function name. Fixes #164866 --- clang/lib/Format/TokenAnnotator.cpp | 5 ++++- clang/unittests/Format/TokenAnnotatorTest.cpp | 9 +++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 1d0dfd0b9c151..021d8c658eb11 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -2674,8 +2674,11 @@ class AnnotatingParser { } // *a or &a or &&a. - if (PreviousNotConst->is(TT_PointerOrReference)) + if (PreviousNotConst->is(TT_PointerOrReference) || + PreviousNotConst->endsSequence(tok::coloncolon, + TT_PointerOrReference)) { return true; + } // MyClass a; if (PreviousNotConst->isTypeName(LangOpts)) diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index ca99940890984..c046142c613b0 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -1119,6 +1119,11 @@ TEST_F(TokenAnnotatorTest, UnderstandsOverloadedOperators) { EXPECT_TOKEN(Tokens[8], tok::amp, TT_PointerOrReference); EXPECT_TOKEN(Tokens[12], tok::amp, TT_PointerOrReference); + Tokens = annotate("::foo::bar& ::foo::bar::operator=(::foo::bar& other);"); + ASSERT_EQ(Tokens.size(), 22u) << Tokens; + EXPECT_TOKEN(Tokens[6], tok::identifier, TT_FunctionDeclarationName); + EXPECT_TOKEN(Tokens[17], tok::amp, TT_PointerOrReference); + Tokens = annotate("SomeLoooooooooooooooooType::Awaitable\n" "SomeLoooooooooooooooooType::operator co_await();"); ASSERT_EQ(Tokens.size(), 11u) << Tokens; @@ -3484,6 +3489,10 @@ TEST_F(TokenAnnotatorTest, StartOfName) { ASSERT_EQ(Tokens.size(), 8u) << Tokens; EXPECT_TOKEN(Tokens[2], tok::identifier, TT_Unknown); // Not StartOfName + Tokens = annotate("int* ::foo::bar;"); + ASSERT_EQ(Tokens.size(), 8u) << Tokens; + EXPECT_TOKEN(Tokens[3], tok::identifier, 
TT_StartOfName); + auto Style = getLLVMStyle(); Style.StatementAttributeLikeMacros.push_back("emit"); Tokens = annotate("emit foo = 0;", Style); From 58f17eba8df8a1e92ee2c9449b86b326b9af66fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 28 Oct 2025 17:19:28 -1000 Subject: [PATCH 075/539] [flang][cuda] Support gpu.launch_func with async token in target rewrite pass (#165485) --- flang/lib/Optimizer/CodeGen/TargetRewrite.cpp | 9 +++++++-- flang/test/Fir/CUDA/cuda-target-rewrite.mlir | 20 +++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp index 0776346870c72..8ca2869993443 100644 --- a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp +++ b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp @@ -143,7 +143,8 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { llvm::SmallVector operandsTypes; for (auto arg : gpuLaunchFunc.getKernelOperands()) operandsTypes.push_back(arg.getType()); - auto fctTy = mlir::FunctionType::get(&context, operandsTypes, {}); + auto fctTy = mlir::FunctionType::get(&context, operandsTypes, + gpuLaunchFunc.getResultTypes()); if (!hasPortableSignature(fctTy, op)) convertCallOp(gpuLaunchFunc, fctTy); } else if (auto addr = mlir::dyn_cast(op)) { @@ -520,10 +521,14 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { llvm::SmallVector newCallResults; // TODO propagate/update call argument and result attributes. 
if constexpr (std::is_same_v, mlir::gpu::LaunchFuncOp>) { + mlir::Value asyncToken = callOp.getAsyncToken(); auto newCall = A::create(*rewriter, loc, callOp.getKernel(), callOp.getGridSizeOperandValues(), callOp.getBlockSizeOperandValues(), - callOp.getDynamicSharedMemorySize(), newOpers); + callOp.getDynamicSharedMemorySize(), newOpers, + asyncToken ? asyncToken.getType() : nullptr, + callOp.getAsyncDependencies(), + /*clusterSize=*/std::nullopt); if (callOp.getClusterSizeX()) newCall.getClusterSizeXMutable().assign(callOp.getClusterSizeX()); if (callOp.getClusterSizeY()) diff --git a/flang/test/Fir/CUDA/cuda-target-rewrite.mlir b/flang/test/Fir/CUDA/cuda-target-rewrite.mlir index 48fee10f3db97..5562e00085526 100644 --- a/flang/test/Fir/CUDA/cuda-target-rewrite.mlir +++ b/flang/test/Fir/CUDA/cuda-target-rewrite.mlir @@ -108,3 +108,23 @@ module attributes {gpu.container_module, fir.defaultkind = "a1c4d8i4l4r4", fir.k } } +// ----- + +module attributes {gpu.container_module, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { + gpu.module @testmod { + gpu.func @_QPtest(%arg0: complex) -> () kernel { + gpu.return + } + } + func.func @main(%arg0: complex) { + %0 = llvm.mlir.constant(0 : i64) : i64 + %1 = llvm.mlir.constant(0 : i32) : i32 + %2 = fir.alloca i64 + %3 = cuf.stream_cast %2 : !fir.ref + %4 = gpu.launch_func async [%3] @testmod::@_QPtest blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 dynamic_shared_memory_size %1 args(%arg0 : complex) {cuf.proc_attr = #cuf.cuda_proc} + return + } +} + +// CHECK-LABEL: func.func @main +// CHECK: %{{.*}} = gpu.launch_func async [%{{.*}}] @testmod::@_QPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64 dynamic_shared_memory_size %{{.*}} args(%{{.*}} : !fir.vector<2:f32>) {cuf.proc_attr = #cuf.cuda_proc} From 
adb46acdb7283372d00416f4018260ff93718ee3 Mon Sep 17 00:00:00 2001 From: Zhaoxin Yang Date: Wed, 29 Oct 2025 11:23:11 +0800 Subject: [PATCH 076/539] [LoongArch][NFC] Add tests for vector fminnum/fmaxnum (#162767) --- .../test/CodeGen/LoongArch/lasx/fp-max-min.ll | 160 ++++++++++++++++++ llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll | 112 ++++++++++++ 2 files changed, 272 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll create mode 100644 llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll diff --git a/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll new file mode 100644 index 0000000000000..48ec98c3a74bb --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll @@ -0,0 +1,160 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @minnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: minnum_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 5 +; CHECK-NEXT: xvpickve.w $xr3, $xr1, 5 +; CHECK-NEXT: fmin.s $fa2, $fa3, $fa2 +; CHECK-NEXT: xvpickve.w $xr3, $xr0, 4 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 4 +; CHECK-NEXT: fmin.s $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 6 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 6 +; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 7 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 7 +; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 48 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 1 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 1 +; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 +; CHECK-NEXT: xvpickve.w $xr4, $xr0, 0 +; CHECK-NEXT: xvpickve.w $xr5, $xr1, 0 
+; CHECK-NEXT: fmin.s $fa4, $fa5, $fa4 +; CHECK-NEXT: vextrins.w $vr4, $vr2, 16 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 2 +; CHECK-NEXT: xvpickve.w $xr5, $xr1, 2 +; CHECK-NEXT: fmin.s $fa2, $fa5, $fa2 +; CHECK-NEXT: vextrins.w $vr4, $vr2, 32 +; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 +; CHECK-NEXT: xvpickve.w $xr1, $xr1, 3 +; CHECK-NEXT: fmin.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.w $vr4, $vr0, 48 +; CHECK-NEXT: xvpermi.q $xr4, $xr3, 2 +; CHECK-NEXT: xvst $xr4, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %x + %v1 = load <8 x float>, ptr %y + %r = call <8 x float> @llvm.minnum.v8f32(<8 x float> %v0, <8 x float> %v1) + store <8 x float> %r, ptr %res + ret void +} + +define void @minnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: minnum_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 3 +; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3 +; CHECK-NEXT: fmin.d $fa2, $fa3, $fa2 +; CHECK-NEXT: xvpickve.d $xr3, $xr0, 2 +; CHECK-NEXT: xvpickve.d $xr4, $xr1, 2 +; CHECK-NEXT: fmin.d $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.d $vr3, $vr2, 16 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 1 +; CHECK-NEXT: xvpickve.d $xr4, $xr1, 1 +; CHECK-NEXT: fmin.d $fa2, $fa4, $fa2 +; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 +; CHECK-NEXT: xvpickve.d $xr1, $xr1, 0 +; CHECK-NEXT: fmin.d $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %x + %v1 = load <4 x double>, ptr %y + %r = call <4 x double> @llvm.minnum.v4f64(<4 x double> %v0, <4 x double> %v1) + store <4 x double> %r, ptr %res + ret void +} + +define void @maxnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: maxnum_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 5 +; CHECK-NEXT: xvpickve.w $xr3, $xr1, 
5 +; CHECK-NEXT: fmax.s $fa2, $fa3, $fa2 +; CHECK-NEXT: xvpickve.w $xr3, $xr0, 4 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 4 +; CHECK-NEXT: fmax.s $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 6 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 6 +; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 7 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 7 +; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 48 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 1 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 1 +; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 +; CHECK-NEXT: xvpickve.w $xr4, $xr0, 0 +; CHECK-NEXT: xvpickve.w $xr5, $xr1, 0 +; CHECK-NEXT: fmax.s $fa4, $fa5, $fa4 +; CHECK-NEXT: vextrins.w $vr4, $vr2, 16 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 2 +; CHECK-NEXT: xvpickve.w $xr5, $xr1, 2 +; CHECK-NEXT: fmax.s $fa2, $fa5, $fa2 +; CHECK-NEXT: vextrins.w $vr4, $vr2, 32 +; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 +; CHECK-NEXT: xvpickve.w $xr1, $xr1, 3 +; CHECK-NEXT: fmax.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.w $vr4, $vr0, 48 +; CHECK-NEXT: xvpermi.q $xr4, $xr3, 2 +; CHECK-NEXT: xvst $xr4, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %x + %v1 = load <8 x float>, ptr %y + %r = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %v0, <8 x float> %v1) + store <8 x float> %r, ptr %res + ret void +} + +define void @maxnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: maxnum_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 3 +; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3 +; CHECK-NEXT: fmax.d $fa2, $fa3, $fa2 +; CHECK-NEXT: xvpickve.d $xr3, $xr0, 2 +; CHECK-NEXT: xvpickve.d $xr4, $xr1, 2 +; CHECK-NEXT: fmax.d $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.d $vr3, $vr2, 16 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 1 +; CHECK-NEXT: xvpickve.d $xr4, $xr1, 1 +; CHECK-NEXT: fmax.d $fa2, $fa4, $fa2 +; 
CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 +; CHECK-NEXT: xvpickve.d $xr1, $xr1, 0 +; CHECK-NEXT: fmax.d $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %x + %v1 = load <4 x double>, ptr %y + %r = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %v0, <4 x double> %v1) + store <4 x double> %r, ptr %res + ret void +} + +declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) +declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) +declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) +declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll new file mode 100644 index 0000000000000..27ecb759c2ea3 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @minnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: minnum_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 1 +; CHECK-NEXT: vreplvei.w $vr3, $vr1, 1 +; CHECK-NEXT: fmin.s $fa2, $fa3, $fa2 +; CHECK-NEXT: vreplvei.w $vr3, $vr0, 0 +; CHECK-NEXT: vreplvei.w $vr4, $vr1, 0 +; CHECK-NEXT: fmin.s $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 2 +; CHECK-NEXT: vreplvei.w $vr4, $vr1, 2 +; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 3 +; CHECK-NEXT: fmin.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 
+; CHECK-NEXT: vst $vr3, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %x + %v1 = load <4 x float>, ptr %y + %r = call <4 x float> @llvm.minnum.v4f32(<4 x float> %v0, <4 x float> %v1) + store <4 x float> %r, ptr %res + ret void +} + +define void @minnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: minnum_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vreplvei.d $vr2, $vr0, 1 +; CHECK-NEXT: vreplvei.d $vr3, $vr1, 1 +; CHECK-NEXT: fmin.d $fa2, $fa3, $fa2 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: fmin.d $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %x + %v1 = load <2 x double>, ptr %y + %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> %v0, <2 x double> %v1) + store <2 x double> %r, ptr %res + ret void +} + +define void @maxnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: maxnum_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 1 +; CHECK-NEXT: vreplvei.w $vr3, $vr1, 1 +; CHECK-NEXT: fmax.s $fa2, $fa3, $fa2 +; CHECK-NEXT: vreplvei.w $vr3, $vr0, 0 +; CHECK-NEXT: vreplvei.w $vr4, $vr1, 0 +; CHECK-NEXT: fmax.s $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 2 +; CHECK-NEXT: vreplvei.w $vr4, $vr1, 2 +; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 3 +; CHECK-NEXT: fmax.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 +; CHECK-NEXT: vst $vr3, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %x + %v1 = load <4 x float>, ptr %y + %r = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %v0, <4 x float> %v1) + store <4 x float> %r, ptr %res + ret 
void +} + +define void @maxnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: maxnum_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vreplvei.d $vr2, $vr0, 1 +; CHECK-NEXT: vreplvei.d $vr3, $vr1, 1 +; CHECK-NEXT: fmax.d $fa2, $fa3, $fa2 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: fmax.d $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %x + %v1 = load <2 x double>, ptr %y + %r = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %v0, <2 x double> %v1) + store <2 x double> %r, ptr %res + ret void +} + +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) From 2144d093c4efbaa2e567bc94b00335bd6f581701 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 29 Oct 2025 06:16:59 +0100 Subject: [PATCH 077/539] [clang] Use a formatted_raw_ostream in TextDiagnostic (#164935) So we can use `getColumn()` to get the column without the bytes for color codes. 
Fixes #164933 --- clang/include/clang/Frontend/TextDiagnostic.h | 8 +++++--- clang/lib/Frontend/TextDiagnostic.cpp | 10 ++++++---- clang/test/Frontend/diag-wrap-colors.cpp | 6 ++++++ llvm/include/llvm/Support/FormattedStream.h | 3 ++- 4 files changed, 19 insertions(+), 8 deletions(-) create mode 100644 clang/test/Frontend/diag-wrap-colors.cpp diff --git a/clang/include/clang/Frontend/TextDiagnostic.h b/clang/include/clang/Frontend/TextDiagnostic.h index e2e88d4d648a2..10028186d27f3 100644 --- a/clang/include/clang/Frontend/TextDiagnostic.h +++ b/clang/include/clang/Frontend/TextDiagnostic.h @@ -16,10 +16,12 @@ #define LLVM_CLANG_FRONTEND_TEXTDIAGNOSTIC_H #include "clang/Frontend/DiagnosticRenderer.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/FormattedStream.h" namespace clang { +using llvm::formatted_raw_ostream; + /// Class to encapsulate the logic for formatting and printing a textual /// diagnostic message. /// @@ -33,7 +35,7 @@ namespace clang { /// DiagnosticClient is implemented through this class as is diagnostic /// printing coming out of libclang. class TextDiagnostic : public DiagnosticRenderer { - raw_ostream &OS; + formatted_raw_ostream OS; const Preprocessor *PP; public: @@ -47,7 +49,7 @@ class TextDiagnostic : public DiagnosticRenderer { unsigned End; enum llvm::raw_ostream::Colors Color; StyleRange(unsigned S, unsigned E, enum llvm::raw_ostream::Colors C) - : Start(S), End(E), Color(C){}; + : Start(S), End(E), Color(C) {}; }; /// Print the diagonstic level to a raw_ostream. 
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp index 58885712fbdcc..f5add2a941f72 100644 --- a/clang/lib/Frontend/TextDiagnostic.cpp +++ b/clang/lib/Frontend/TextDiagnostic.cpp @@ -17,7 +17,6 @@ #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Locale.h" -#include "llvm/Support/raw_ostream.h" #include #include @@ -662,7 +661,7 @@ void TextDiagnostic::emitDiagnosticMessage( FullSourceLoc Loc, PresumedLoc PLoc, DiagnosticsEngine::Level Level, StringRef Message, ArrayRef Ranges, DiagOrStoredDiag D) { - uint64_t StartOfLocationInfo = OS.tell(); + uint64_t StartOfLocationInfo = OS.getColumn(); // Emit the location of this particular diagnostic. if (Loc.isValid()) @@ -675,8 +674,11 @@ void TextDiagnostic::emitDiagnosticMessage( printDiagnosticLevel(OS, Level, DiagOpts.ShowColors); printDiagnosticMessage(OS, /*IsSupplemental*/ Level == DiagnosticsEngine::Note, - Message, OS.tell() - StartOfLocationInfo, + Message, OS.getColumn() - StartOfLocationInfo, DiagOpts.MessageLength, DiagOpts.ShowColors); + // We use a formatted ostream, which does its own buffering. Flush here + // so we keep the proper order of output. 
+ OS.flush(); } /*static*/ void @@ -1485,7 +1487,7 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine, if (CharStyle != Styles.end()) { if (!CurrentColor || (CurrentColor && *CurrentColor != CharStyle->Color)) { - OS.changeColor(CharStyle->Color, false); + OS.changeColor(CharStyle->Color); CurrentColor = CharStyle->Color; } } else if (CurrentColor) { diff --git a/clang/test/Frontend/diag-wrap-colors.cpp b/clang/test/Frontend/diag-wrap-colors.cpp new file mode 100644 index 0000000000000..e3dccb1bd2dee --- /dev/null +++ b/clang/test/Frontend/diag-wrap-colors.cpp @@ -0,0 +1,6 @@ +// RUN: not %clang_cc1 %s -fmessage-length=50 -fcolor-diagnostics -fno-show-source-location -o - 2>&1 | FileCheck %s + +struct F { + float a : 10; +}; +// CHECK: bit-field 'a' has non-integral type 'float' diff --git a/llvm/include/llvm/Support/FormattedStream.h b/llvm/include/llvm/Support/FormattedStream.h index 011a6aea238e3..402cd3e3235dc 100644 --- a/llvm/include/llvm/Support/FormattedStream.h +++ b/llvm/include/llvm/Support/FormattedStream.h @@ -180,7 +180,8 @@ class LLVM_ABI formatted_raw_ostream : public raw_ostream { return *this; } - raw_ostream &changeColor(enum Colors Color, bool Bold, bool BG) override { + raw_ostream &changeColor(enum Colors Color, bool Bold = false, + bool BG = false) override { if (colors_enabled()) { DisableScanScope S(this); raw_ostream::changeColor(Color, Bold, BG); From 5d122829d55f8ef73eb9784608541cb3006c9396 Mon Sep 17 00:00:00 2001 From: Pankaj Dwivedi Date: Wed, 29 Oct 2025 11:56:43 +0530 Subject: [PATCH 078/539] [AMDGPU] make AMDGPUUniformIntrinsicCombine a function pass (#165265) There has been an issue(using function analysis inside the module pass in OPM) integrating this pass into the LLC pipeline, which currently lacks NPM support. I tried finding a way to get the per-function analysis, but it seems that in OPM, we don't have that option. So the best approach would be to make it a function pass. 
Ref: https://github.com/llvm/llvm-project/pull/116953 --- llvm/lib/Target/AMDGPU/AMDGPU.h | 6 +- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 +- .../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 78 +++++++++++++------ .../amdgpu-simplify-uniform-waterfall.ll | 16 ++++ .../amdgpu-uniform-intrinsic-combine.ll | 4 + 6 files changed, 85 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index ce2b4a5f6f2e9..cd8b2495a4250 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -562,9 +562,13 @@ class AMDGPURewriteAGPRCopyMFMAPass void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &); extern char &AMDGPURewriteAGPRCopyMFMALegacyID; +void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &); +extern char &AMDGPUUniformIntrinsicCombineLegacyPassID; +FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass(); + struct AMDGPUUniformIntrinsicCombinePass : public PassInfoMixin { - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; namespace AMDGPU { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index a6074eaf78fd0..bf6f1a9dbf576 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -30,7 +30,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass( MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) -MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS @@ -69,6 +68,7 @@ 
FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes", AMDGPUUnifyDivergentExitNodesPass()) FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass()) FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast(this))) +FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef FUNCTION_PASS #ifndef FUNCTION_ANALYSIS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 6214f4db87e1e..75a94ac891819 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -619,6 +619,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR); initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR); initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR); + initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -887,9 +888,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (EarlyInlineAll && !EnableFunctionCalls) PM.addPass(AMDGPUAlwaysInlinePass()); - - if (EnableUniformIntrinsicCombine) - PM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerPeepholeEPCallback( @@ -900,6 +898,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { FPM.addPass(AMDGPUUseNativeCallsPass()); if (EnableLibCallSimplify) FPM.addPass(AMDGPUSimplifyLibCallsPass()); + + if (EnableUniformIntrinsicCombine) + FPM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerCGSCCOptimizerLateEPCallback( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp index 50c78d8c67251..65e6ed9d1d428 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -16,12 +16,6 @@ /// uniformity. 
And every instruction that's downstream and cares about dynamic /// uniformity must be convergent (and isel will introduce v_readfirstlane for /// them if their operands can't be proven statically uniform). -/// -/// This pass is implemented as a ModulePass because intrinsic declarations -/// exist at the module scope, allowing us to skip processing entirely if no -/// declarations are present and to traverse their user lists directly when -/// they are. A FunctionPass would instead require scanning every instruction -/// in every function to find relevant intrinsics, which is far less efficient. //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -97,14 +91,12 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, Tracker[NotOp] = true; // NOT preserves uniformity LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n'); ICmp->replaceAllUsesWith(NotOp); - ICmp->eraseFromParent(); Changed = true; } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) { // Case: (icmp ne %ballot, 0) -> %ballot_arg LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: " << *Src << '\n'); ICmp->replaceAllUsesWith(Src); - ICmp->eraseFromParent(); Changed = true; } } @@ -120,15 +112,17 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, return false; } -/// Iterates over intrinsic declarations in the module to optimize their uses. -static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { +/// Iterates over intrinsic calls in the Function to optimize. 
+static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) { bool IsChanged = false; ValueMap Tracker; - FunctionAnalysisManager &FAM = - AM.getResult(M).getManager(); - for (Function &F : M) { - switch (F.getIntrinsicID()) { + for (Instruction &I : make_early_inc_range(instructions(F))) { + auto *II = dyn_cast(&I); + if (!II) + continue; + + switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_permlane64: case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: @@ -137,23 +131,61 @@ static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { default: continue; } - - for (User *U : make_early_inc_range(F.users())) { - auto *II = cast(U); - Function *ParentF = II->getFunction(); - const auto &UI = FAM.getResult(*ParentF); - IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); - } + IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); } return IsChanged; } PreservedAnalyses -AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) { - if (!runUniformIntrinsicCombine(M, AM)) +AMDGPUUniformIntrinsicCombinePass::run(Function &F, + FunctionAnalysisManager &AM) { + const auto &UI = AM.getResult(F); + if (!runUniformIntrinsicCombine(F, UI)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserve(); return PA; } + +namespace { +class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass { +public: + static char ID; + AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) { + initializeAMDGPUUniformIntrinsicCombineLegacyPass( + *PassRegistry::getPassRegistry()); + } + +private: + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); + } +}; +} // namespace + +char AMDGPUUniformIntrinsicCombineLegacy::ID = 0; +char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID = + AMDGPUUniformIntrinsicCombineLegacy::ID; + +bool 
AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + const UniformityInfo &UI = + getAnalysis().getUniformityInfo(); + return runUniformIntrinsicCombine(F, UI); +} + +INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) + +FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() { + return new AMDGPUUniformIntrinsicCombineLegacy(); +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll index 6c4f504f3456c..33ce278028bba 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll @@ -23,7 +23,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1) ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -75,7 +77,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op(ptr addrs ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 
[[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 0, [[BALLOT]] ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -126,6 +130,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero(ptr addrspace(1) ; PASS-CHECK-NEXT: br label %[[WHILE:.*]] ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]]) +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i64 0, [[BALLOT]] ; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -175,6 +181,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap(ptr addrspac ; PASS-CHECK-NEXT: br label %[[WHILE:.*]] ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]]) +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -225,7 +233,9 @@ define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1) ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]] ; 
PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0 @@ -292,7 +302,9 @@ define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i3 ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]] @@ -359,7 +371,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(ptr addrspace ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i32 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -410,6 +424,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(ptr addrspace ; PASS-CHECK-NEXT: br label %[[WHILE:.*]] ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[DONE]]) +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i32 0, [[BALLOT]] ; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 diff --git 
a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll index aa11574517520..a3e42e564376c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll @@ -595,6 +595,8 @@ define amdgpu_kernel void @ballot_i32(i32 %v, ptr addrspace(1) %out) { ; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i32( ; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[C]]) +; PASS-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[BALLOT]], 0 ; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1 ; PASS-CHECK-NEXT: ret void ; @@ -623,6 +625,8 @@ define amdgpu_kernel void @ballot_i64(i32 %v, ptr addrspace(1) %out) { ; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i64( ; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[C]]) +; PASS-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1 ; PASS-CHECK-NEXT: ret void ; From e962c53a2367a4c3f9eea912ca45ee47c8caa051 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 29 Oct 2025 07:30:11 +0100 Subject: [PATCH 079/539] [clang][bytecode] Check builtin carryops for non-block out pointers (#165512) Fixes https://github.com/llvm/llvm-project/issues/165372 --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 2 +- clang/test/AST/ByteCode/builtin-functions.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index ab6b3ed1be0aa..b3ab82da5e01a 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ 
b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -859,7 +859,7 @@ static bool interp__builtin_carryop(InterpState &S, CodePtr OpPC, APSInt RHS = popToAPSInt(S.Stk, RHST); APSInt LHS = popToAPSInt(S.Stk, LHST); - if (CarryOutPtr.isDummy()) + if (CarryOutPtr.isDummy() || !CarryOutPtr.isBlockPointer()) return false; APSInt CarryOut; diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp index e9093b2f23f74..a90f636b5134b 100644 --- a/clang/test/AST/ByteCode/builtin-functions.cpp +++ b/clang/test/AST/ByteCode/builtin-functions.cpp @@ -1856,7 +1856,8 @@ namespace InitParam { #endif -namespace SAddOverflowInt { +namespace NonBlockPointerStore { int a; void foo(void) { a *= __builtin_sadd_overflow(1, 2, 0); } + void foo2(void) { a *= __builtin_addc(1, 2, 0, 0); } } From 4f784898082ed78725cd8edccff2b57646a20955 Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Wed, 29 Oct 2025 09:52:08 +0300 Subject: [PATCH 080/539] [clang-tidy] Fix param-pack fix-its for 'performance-unnecessary-value-param' check (#164130) Closes https://github.com/llvm/llvm-project/issues/154755. --- .../clang-tidy/utils/FixItHintUtils.cpp | 5 +++ clang-tools-extra/docs/ReleaseNotes.rst | 3 +- .../unnecessary-value-param-templates.cpp | 31 +++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp b/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp index 086c7f3a15d45..b30c83e3aeb35 100644 --- a/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp @@ -21,6 +21,11 @@ FixItHint changeVarDeclToReference(const VarDecl &Var, ASTContext &Context) { SourceLocation AmpLocation = Var.getLocation(); auto Token = utils::lexer::getPreviousToken( AmpLocation, Context.getSourceManager(), Context.getLangOpts()); + + // For parameter packs the '&' must go before the '...' 
token + if (Token.is(tok::ellipsis)) + return FixItHint::CreateInsertion(Token.getLocation(), "&"); + if (!Token.is(tok::unknown)) AmpLocation = Lexer::getLocForEndOfToken(Token.getLocation(), 0, Context.getSourceManager(), diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 915b79329dac4..835de7418bdd6 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -407,7 +407,8 @@ Changes in existing checks - Improved :doc:`performance-unnecessary-value-param ` by printing - the type of the diagnosed variable. + the type of the diagnosed variable and correctly generating fix-it hints for + parameter-pack arguments. - Improved :doc:`portability-template-virtual-member-function ` check to diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-templates.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-templates.cpp index 688c79bbaa9ac..61758c5dac071 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-templates.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-templates.cpp @@ -96,3 +96,34 @@ void lambdaNonConstAutoValue() { }; fn(ExpensiveToCopyType()); } + +template +void ParameterPack(Args... args) { + // CHECK-MESSAGES: [[@LINE-1]]:28: warning: the parameter 'args' of type 'ExpensiveToCopyType' + // CHECK-FIXES: void ParameterPack(const Args&... args) { +} + +template +void ParameterPackConst(Args const... args) { + // CHECK-MESSAGES: [[@LINE-1]]:39: warning: the const qualified parameter 'args' of type 'const ExpensiveToCopyType' + // CHECK-FIXES: void ParameterPackConst(Args const&... args) { +} + +template +void ParameterPackWithParams(const ExpensiveToCopyType E1, ExpensiveToCopyType E2, Args... 
args) { + // CHECK-MESSAGES: [[@LINE-1]]:56: warning: the const qualified parameter 'E1' + // CHECK-MESSAGES: [[@LINE-2]]:80: warning: the parameter 'E2' + // CHECK-MESSAGES: [[@LINE-3]]:92: warning: the parameter 'args' + // CHECK-FIXES: void ParameterPackWithParams(const ExpensiveToCopyType& E1, const ExpensiveToCopyType& E2, const Args&... args) { +} + +template +void PackWithNonExpensive(int x, Args... args) {} + +void instantiatedParameterPack() { + ExpensiveToCopyType E; + ParameterPack(E); + ParameterPackConst(E); + ParameterPackWithParams(E, E, E); + PackWithNonExpensive(5, 5); +} From 55f222d352c37389a66838bbb8417f58f445bcaf Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Wed, 29 Oct 2025 09:53:28 +0300 Subject: [PATCH 081/539] [clang-tidy] Emit warnings from user headers by default (#164165) Closes https://github.com/llvm/llvm-project/issues/158132. --- .../clang-tidy/ClangTidyOptions.cpp | 2 +- .../clang-tidy/tool/ClangTidyMain.cpp | 14 +++++----- clang-tools-extra/docs/ReleaseNotes.rst | 10 +++++++ clang-tools-extra/docs/clang-tidy/index.rst | 6 +++-- .../abseil/no-internal-dependencies.cpp | 2 +- .../checkers/abseil/no-namespace.cpp | 2 +- .../checkers/bugprone/reserved-identifier.cpp | 5 ++-- .../google/upgrade-googletest-case.cpp | 4 +-- .../checkers/modernize/replace-auto-ptr.cpp | 2 +- .../checkers/modernize/use-using.cpp | 2 +- .../readability/duplicate-include.cpp | 4 ++- .../readability/identifier-naming.cpp | 7 ++--- .../infrastructure/default-header-filter.cpp | 27 +++++++++++++++++++ .../clang-tidy/infrastructure/file-filter.cpp | 2 +- 14 files changed, 67 insertions(+), 22 deletions(-) create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/default-header-filter.cpp diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp index 21455db7c7e7b..c4b47a440e44b 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp +++ 
b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp @@ -247,7 +247,7 @@ ClangTidyOptions ClangTidyOptions::getDefaults() { Options.WarningsAsErrors = ""; Options.HeaderFileExtensions = {"", "h", "hh", "hpp", "hxx"}; Options.ImplementationFileExtensions = {"c", "cc", "cpp", "cxx"}; - Options.HeaderFilterRegex = ""; + Options.HeaderFilterRegex = ".*"; Options.ExcludeHeaderFilterRegex = ""; Options.SystemHeaders = false; Options.FormatStyle = "none"; diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp index 64157f530b8c0..1ae8756c339e7 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp @@ -93,7 +93,7 @@ Configuration files: WarningsAsErrors: '' HeaderFileExtensions: ['', 'h','hh','hpp','hxx'] ImplementationFileExtensions: ['c','cc','cpp','cxx'] - HeaderFilterRegex: '' + HeaderFilterRegex: '.*' FormatStyle: none InheritParentConfig: true User: user @@ -132,14 +132,16 @@ file, if any. static cl::opt HeaderFilter("header-filter", desc(R"( Regular expression matching the names of the -headers to output diagnostics from. Diagnostics +headers to output diagnostics from. The default +value is '.*', i.e. diagnostics from all non-system +headers are displayed by default. Diagnostics from the main file of each translation unit are always displayed. Can be used together with -line-filter. This option overrides the 'HeaderFilterRegex' option in .clang-tidy file, if any. )"), - cl::init(""), + cl::init(".*"), cl::cat(ClangTidyCategory)); static cl::opt ExcludeHeaderFilter("exclude-header-filter", @@ -379,9 +381,9 @@ static void printStats(const ClangTidyStats &Stats) { << " with check filters"; llvm::errs() << ").\n"; if (Stats.ErrorsIgnoredNonUserCode) - llvm::errs() << "Use -header-filter=.* to display errors from all " - "non-system headers. 
Use -system-headers to display " - "errors from system headers as well.\n"; + llvm::errs() << "Use -header-filter=.* or leave it as default to display " + "errors from all non-system headers. Use -system-headers " + "to display errors from system headers as well.\n"; } } diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 835de7418bdd6..8f4be0d1cb259 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -70,6 +70,11 @@ Potentially Breaking Changes :doc:`bugprone-signed-char-misuse ` +- :program:`clang-tidy` now displays warnings from all non-system headers by + default. Previously, users had to explicitly opt-in to header warnings using + `-header-filter='.*'`. To disable warnings from non-system, set `-header-filter` + to an empty string. + Improvements to clangd ---------------------- @@ -132,6 +137,11 @@ Improvements to clang-tidy when run over C files. If ``-std`` is not specified, it defaults to ``c99-or-later``. +- :program:`clang-tidy` now displays warnings from all non-system headers by + default. Previously, users had to explicitly opt-in to header warnings using + `-header-filter='.*'`. To disable warnings from non-system, set `-header-filter` + to an empty string. + - :program:`clang-tidy` no longer attempts to analyze code from system headers by default, greatly improving performance. This behavior is disabled if the `SystemHeaders` option is enabled. diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst index bd2c40e948f34..6ff82bf230f4b 100644 --- a/clang-tools-extra/docs/clang-tidy/index.rst +++ b/clang-tools-extra/docs/clang-tidy/index.rst @@ -215,7 +215,9 @@ An overview of all the command-line options: This option overrides the 'FormatStyle` option in .clang-tidy file, if any. --header-filter= - Regular expression matching the names of the - headers to output diagnostics from. 
Diagnostics + headers to output diagnostics from. The default + value is '.*', i.e. diagnostics from all non-system + headers are displayed by default. Diagnostics from the main file of each translation unit are always displayed. Can be used together with -line-filter. @@ -338,7 +340,7 @@ An overview of all the command-line options: WarningsAsErrors: '' HeaderFileExtensions: ['', 'h','hh','hpp','hxx'] ImplementationFileExtensions: ['c','cc','cpp','cxx'] - HeaderFilterRegex: '' + HeaderFilterRegex: '.*' FormatStyle: none InheritParentConfig: true User: user diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/no-internal-dependencies.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil/no-internal-dependencies.cpp index 2949d7fdd0274..f6eb7c5e25949 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/abseil/no-internal-dependencies.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/no-internal-dependencies.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s abseil-no-internal-dependencies %t, -- -- -I %S/Inputs +// RUN: %check_clang_tidy %s abseil-no-internal-dependencies %t, -- -header-filter='' -- -I %S/Inputs // RUN: clang-tidy -checks='-*, abseil-no-internal-dependencies' -header-filter='.*' %s -- -I %S/Inputs 2>&1 | FileCheck %s #include "absl/strings/internal-file.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/no-namespace.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil/no-namespace.cpp index 78821c373f5c4..c8a5752ed86a6 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/abseil/no-namespace.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/no-namespace.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s abseil-no-namespace %t -- -- -I %S/Inputs +// RUN: %check_clang_tidy %s abseil-no-namespace %t -- -header-filter='' -- -I %S/Inputs // RUN: clang-tidy -checks='-*, abseil-no-namespace' -header-filter='.*' %s -- -I %S/Inputs 2>&1 | FileCheck %s /// Warning will not be triggered on internal Abseil 
code that is included. diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/reserved-identifier.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/reserved-identifier.cpp index 0f36efe656bf9..b17e8903c41c2 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/reserved-identifier.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/reserved-identifier.cpp @@ -1,8 +1,9 @@ -// RUN: %check_clang_tidy %s bugprone-reserved-identifier %t -- -- \ +// RUN: %check_clang_tidy %s bugprone-reserved-identifier %t -- \ +// RUN: -header-filter='' -- \ // RUN: -I%S/Inputs/reserved-identifier \ // RUN: -isystem %S/Inputs/reserved-identifier/system -// no warnings expected without -header-filter= +// no warnings expected with -header-filter='' #include "user-header.h" #include diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp b/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp index edb11b9863532..5b30541a96a42 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp @@ -1,5 +1,5 @@ -// RUN: %check_clang_tidy %s google-upgrade-googletest-case %t -- -- -I%S/Inputs -// RUN: %check_clang_tidy -check-suffix=NOSUITE %s google-upgrade-googletest-case %t -- -- -DNOSUITE -I%S/Inputs/gtest/nosuite +// RUN: %check_clang_tidy %s google-upgrade-googletest-case %t -- -- -isystem%S/Inputs +// RUN: %check_clang_tidy -check-suffix=NOSUITE %s google-upgrade-googletest-case %t -- -- -DNOSUITE -isystem%S/Inputs/gtest/nosuite #include "gtest/gtest.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp index 2281c1acad94f..371f3ddf6d650 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp +++ 
b/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-replace-auto-ptr %t -- -- -I %S/Inputs/replace-auto-ptr +// RUN: %check_clang_tidy %s modernize-replace-auto-ptr %t -- -- -isystem %S/Inputs/replace-auto-ptr // CHECK-FIXES: #include diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp index 8288f39126a11..5b8eca2825645 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-use-using %t -- -- -fno-delayed-template-parsing -I %S/Inputs/use-using/ +// RUN: %check_clang_tidy %s modernize-use-using %t -- -- -fno-delayed-template-parsing -isystem %S/Inputs/use-using/ typedef int Type; // CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' [modernize-use-using] diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/duplicate-include.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/duplicate-include.cpp index 223f07724c5d0..c452f69fad07d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/duplicate-include.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/duplicate-include.cpp @@ -1,4 +1,6 @@ -// RUN: %check_clang_tidy %s readability-duplicate-include %t -- -- -isystem %S/Inputs/duplicate-include/system -I %S/Inputs/duplicate-include +// RUN: %check_clang_tidy %s readability-duplicate-include %t -- \ +// RUN: -header-filter='' \ +// RUN: -- -isystem %S/Inputs/duplicate-include/system -I %S/Inputs/duplicate-include int a; #include diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming.cpp index 91807337176d9..1d06df3bbfaf2 100644 --- 
a/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming.cpp @@ -86,7 +86,9 @@ // RUN: readability-identifier-naming.LocalPointerPrefix: 'l_', \ // RUN: readability-identifier-naming.LocalConstantPointerCase: CamelCase, \ // RUN: readability-identifier-naming.LocalConstantPointerPrefix: 'lc_', \ -// RUN: }}' -- -fno-delayed-template-parsing -Dbad_macro \ +// RUN: }}' \ +// RUN: -header-filter='' \ +// RUN: -- -fno-delayed-template-parsing -Dbad_macro \ // RUN: -I%S/Inputs/identifier-naming \ // RUN: -isystem %S/Inputs/identifier-naming/system @@ -95,8 +97,7 @@ #include #include #include "user-header.h" -// NO warnings or fixes expected from declarations within header files without -// the -header-filter= option +// NO warnings or fixes expected from declarations with the -header-filter='' option namespace FOO_NS { // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: invalid case style for namespace 'FOO_NS' [readability-identifier-naming] diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/default-header-filter.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/default-header-filter.cpp new file mode 100644 index 0000000000000..489b302ac0512 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/default-header-filter.cpp @@ -0,0 +1,27 @@ + +// RUN: clang-tidy -checks='-*,google-explicit-constructor' --config='{}' %s -- -I %S/Inputs/file-filter -isystem %S/Inputs/file-filter/system 2>&1 | FileCheck --check-prefix=CHECK-DEFAULT %s +// RUN: clang-tidy -checks='-*,google-explicit-constructor' --config='{}' -header-filter='' %s -- -I %S/Inputs/file-filter -isystem %S/Inputs/file-filter/system 2>&1 | FileCheck --check-prefix=CHECK-EMPTY %s +// RUN: clang-tidy -checks='-*,google-explicit-constructor' --config='{}' -header-filter='.*' %s -- -I %S/Inputs/file-filter -isystem %S/Inputs/file-filter/system 2>&1 | FileCheck --check-prefix=CHECK-EXPLICIT %s 
+// RUN: clang-tidy -checks='-*,google-explicit-constructor' --config='{}' %s -- -I %S/Inputs/file-filter -isystem %S/Inputs/file-filter/system 2>&1 | FileCheck --check-prefix=CHECK-NO-SYSTEM %s +// RUN: clang-tidy -checks='-*,google-explicit-constructor' --config='{}' -system-headers %s -- -I %S/Inputs/file-filter -isystem %S/Inputs/file-filter/system 2>&1 | FileCheck --check-prefix=CHECK-WITH-SYSTEM %s + +#include "header1.h" +// CHECK-DEFAULT: header1.h:1:12: warning: single-argument constructors must be marked explicit +// CHECK-EMPTY-NOT: header1.h:1:12: warning: +// CHECK-EXPLICIT: header1.h:1:12: warning: single-argument constructors must be marked explicit +// CHECK-NO-SYSTEM: header1.h:1:12: warning: single-argument constructors must be marked explicit +// CHECK-WITH-SYSTEM-DAG: header1.h:1:12: warning: single-argument constructors must be marked explicit + +#include +// CHECK-DEFAULT-NOT: system-header.h:1:12: warning: +// CHECK-EMPTY-NOT: system-header.h:1:12: warning: +// CHECK-EXPLICIT-NOT: system-header.h:1:12: warning: +// CHECK-NO-SYSTEM-NOT: system-header.h:1:12: warning: +// CHECK-WITH-SYSTEM-DAG: system-header.h:1:12: warning: single-argument constructors must be marked explicit + +class A { A(int); }; +// CHECK-DEFAULT: :[[@LINE-1]]:11: warning: single-argument constructors must be marked explicit +// CHECK-EMPTY: :[[@LINE-2]]:11: warning: single-argument constructors must be marked explicit +// CHECK-EXPLICIT: :[[@LINE-3]]:11: warning: single-argument constructors must be marked explicit +// CHECK-NO-SYSTEM: :[[@LINE-4]]:11: warning: single-argument constructors must be marked explicit +// CHECK-WITH-SYSTEM: :[[@LINE-5]]:11: warning: single-argument constructors must be marked explicit diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/file-filter.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/file-filter.cpp index d9ec1049963b0..485e9fb1f0cb7 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/file-filter.cpp +++ 
b/clang-tools-extra/test/clang-tidy/infrastructure/file-filter.cpp @@ -66,7 +66,7 @@ class A { A(int); }; // CHECK4-NOT: warning: // CHECK4-QUIET-NOT: warning: -// CHECK: Use -header-filter=.* to display errors from all non-system headers. +// CHECK: Use -header-filter=.* or leave it as default to display errors from all non-system headers. // CHECK-QUIET-NOT: Suppressed // CHECK2-QUIET-NOT: Suppressed // CHECK3: Use -header-filter=.* {{.*}} From 69b5ab86b0e8eae921bcd0f39cfc03ac3258671b Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 29 Oct 2025 00:31:56 -0700 Subject: [PATCH 082/539] [BOLT][NFC] Drop unused profile staleness stats (#165489) Equal number of blocks in a function/instructions in a block between stale profile and the binary isn't used in the matching. Remove these stats to declutter the output. Test Plan: NFC --- bolt/include/bolt/Core/BinaryContext.h | 5 ----- bolt/lib/Passes/BinaryPasses.cpp | 17 ----------------- bolt/lib/Profile/YAMLProfileReader.cpp | 3 --- 3 files changed, 25 deletions(-) diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 8960b1984745f..5cbc28fb38a33 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -781,11 +781,6 @@ class BinaryContext { uint64_t PseudoProbeLooseMatchedSampleCount{0}; /// the count of call matched samples uint64_t CallMatchedSampleCount{0}; - /// the number of stale functions that have matching number of blocks in - /// the profile - uint64_t NumStaleFuncsWithEqualBlockCount{0}; - /// the number of blocks that have matching size but a differing hash - uint64_t NumStaleBlocksWithEqualIcount{0}; } Stats; // Original binary execution count stats. 
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index e1a1856b506cf..1d187de11c35e 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1508,12 +1508,6 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { if (NumAllStaleFunctions) { const float PctStale = NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f; - const float PctStaleFuncsWithEqualBlockCount = - (float)BC.Stats.NumStaleFuncsWithEqualBlockCount / - NumAllStaleFunctions * 100.0f; - const float PctStaleBlocksWithEqualIcount = - (float)BC.Stats.NumStaleBlocksWithEqualIcount / - BC.Stats.NumStaleBlocks * 100.0f; auto printErrorOrWarning = [&]() { if (PctStale > opts::StaleThreshold) BC.errs() << "BOLT-ERROR: "; @@ -1536,17 +1530,6 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { << "%) belong to functions with invalid" " (possibly stale) profile.\n"; } - BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount - << " stale function" - << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s") - << format(" (%.1f%% of all stale)", - PctStaleFuncsWithEqualBlockCount) - << " have matching block count.\n"; - BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount - << " stale block" - << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? 
"" : "s") - << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount) - << " have matching icount.\n"; if (PctStale > opts::StaleThreshold) { return createFatalBOLTError( Twine("BOLT-ERROR: stale functions exceed specified threshold of ") + diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index 086e47b661e10..f0f87f9baec38 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -350,9 +350,6 @@ bool YAMLProfileReader::parseFunctionProfile( << MismatchedCalls << " calls, and " << MismatchedEdges << " edges in profile did not match function " << BF << '\n'; - if (YamlBF.NumBasicBlocks != BF.size()) - ++BC.Stats.NumStaleFuncsWithEqualBlockCount; - if (!opts::InferStaleProfile) return false; ArrayRef ProbeMatchSpecs; From d6a5c67848d0635ba7377890aef640020efe1dfe Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 29 Oct 2025 07:47:41 +0000 Subject: [PATCH 083/539] [AArch64][GlobalISel] Add a constant funnel shift post-legalizer combine. (#151912) We want to be able to produce extr instructions post-legalization. They are legal for scalars, acting as a funnel shift with a constant shift amount. Unfortunately I'm not sure if there is a way currently to represent that in the legalization rules, but it might be useful for several operations - to be able to treat and test operands with constant operands as legal or not. This adds a change to the existing matchOrShiftToFunnelShift so that AArch64 can generate such instructions post-legalization providing that the operation is scalar and the shift amount is constant. 
--- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 +- .../include/llvm/Target/GlobalISel/Combine.td | 10 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 9 +- llvm/lib/Target/AArch64/AArch64Combine.td | 5 +- .../GlobalISel/split-wide-shifts-multiway.ll | 888 +++++++----------- llvm/test/CodeGen/AArch64/adc.ll | 6 +- llvm/test/CodeGen/AArch64/fsh.ll | 473 +++++----- llvm/test/CodeGen/AArch64/funnel-shift.ll | 55 +- llvm/test/CodeGen/AArch64/rem-by-const.ll | 173 ++-- 9 files changed, 703 insertions(+), 919 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index b0601eb72ba3f..36cb90b1bc134 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -640,7 +640,8 @@ class CombinerHelper { /// This variant does not erase \p MI after calling the build function. void applyBuildFnNoErase(MachineInstr &MI, BuildFnTy &MatchInfo) const; - bool matchOrShiftToFunnelShift(MachineInstr &MI, BuildFnTy &MatchInfo) const; + bool matchOrShiftToFunnelShift(MachineInstr &MI, bool AllowScalarConstants, + BuildFnTy &MatchInfo) const; bool matchFunnelShiftToRotate(MachineInstr &MI) const; void applyFunnelShiftToRotate(MachineInstr &MI) const; bool matchRotateOutOfRange(MachineInstr &MI) const; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 47d5d68174b38..119695e53c3cb 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1013,10 +1013,18 @@ def extract_vec_elt_combines : GICombineGroup<[ def funnel_shift_from_or_shift : GICombineRule< (defs root:$root, build_fn_matchinfo:$info), (match (wip_match_opcode G_OR):$root, - [{ return Helper.matchOrShiftToFunnelShift(*${root}, ${info}); }]), + [{ return Helper.matchOrShiftToFunnelShift(*${root}, false, ${info}); }]), (apply [{ Helper.applyBuildFn(*${root}, 
${info}); }]) >; +def funnel_shift_from_or_shift_constants_are_legal : GICombineRule< + (defs root:$root, build_fn_matchinfo:$info), + (match (wip_match_opcode G_OR):$root, + [{ return Helper.matchOrShiftToFunnelShift(*${root}, true, ${info}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${info}); }]) +>; + + def funnel_shift_to_rotate : GICombineRule< (defs root:$root), (match (wip_match_opcode G_FSHL, G_FSHR):$root, diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 1f104784a97ec..9ace7d65413ad 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -4425,6 +4425,7 @@ void CombinerHelper::applyBuildFnNoErase( } bool CombinerHelper::matchOrShiftToFunnelShift(MachineInstr &MI, + bool AllowScalarConstants, BuildFnTy &MatchInfo) const { assert(MI.getOpcode() == TargetOpcode::G_OR); @@ -4444,31 +4445,29 @@ bool CombinerHelper::matchOrShiftToFunnelShift(MachineInstr &MI, // Given constants C0 and C1 such that C0 + C1 is bit-width: // (or (shl x, C0), (lshr y, C1)) -> (fshl x, y, C0) or (fshr x, y, C1) - int64_t CstShlAmt, CstLShrAmt; + int64_t CstShlAmt = 0, CstLShrAmt; if (mi_match(ShlAmt, MRI, m_ICstOrSplat(CstShlAmt)) && mi_match(LShrAmt, MRI, m_ICstOrSplat(CstLShrAmt)) && CstShlAmt + CstLShrAmt == BitWidth) { FshOpc = TargetOpcode::G_FSHR; Amt = LShrAmt; - } else if (mi_match(LShrAmt, MRI, m_GSub(m_SpecificICstOrSplat(BitWidth), m_Reg(Amt))) && ShlAmt == Amt) { // (or (shl x, amt), (lshr y, (sub bw, amt))) -> (fshl x, y, amt) FshOpc = TargetOpcode::G_FSHL; - } else if (mi_match(ShlAmt, MRI, m_GSub(m_SpecificICstOrSplat(BitWidth), m_Reg(Amt))) && LShrAmt == Amt) { // (or (shl x, (sub bw, amt)), (lshr y, amt)) -> (fshr x, y, amt) FshOpc = TargetOpcode::G_FSHR; - } else { return false; } LLT AmtTy = MRI.getType(Amt); - if (!isLegalOrBeforeLegalizer({FshOpc, {Ty, AmtTy}})) + if (!isLegalOrBeforeLegalizer({FshOpc, {Ty, AmtTy}}) && + 
(!AllowScalarConstants || CstShlAmt == 0 || !Ty.isScalar())) return false; MatchInfo = [=](MachineIRBuilder &B) { diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index b3ec65cab51fa..278314792bfb9 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -366,6 +366,7 @@ def AArch64PostLegalizerCombiner select_to_minmax, or_to_bsp, combine_concat_vector, commute_constant_to_rhs, extract_vec_elt_combines, push_freeze_to_prevent_poison_from_propagating, - combine_mul_cmlt, combine_use_vector_truncate, - extmultomull, truncsat_combines, lshr_of_trunc_of_lshr]> { + combine_mul_cmlt, combine_use_vector_truncate, + extmultomull, truncsat_combines, lshr_of_trunc_of_lshr, + funnel_shift_from_or_shift_constants_are_legal]> { } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll index 41f7ab89094ad..480fcbd6a9788 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll @@ -4992,28 +4992,21 @@ define void @test_shl_i512_const_32(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_32: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #32 -; GISEL-NEXT: lsr x13, x9, #32 -; GISEL-NEXT: lsl x8, x8, #32 -; GISEL-NEXT: orr x9, x10, x9, lsl #32 -; GISEL-NEXT: lsr x10, x11, #32 -; GISEL-NEXT: orr x11, x13, x11, lsl #32 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #32 -; GISEL-NEXT: orr x10, x10, x12, lsl #32 -; GISEL-NEXT: lsr x12, x14, #32 -; GISEL-NEXT: lsr x9, x15, #32 -; GISEL-NEXT: orr x8, x8, x14, lsl #32 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #32 -; GISEL-NEXT: lsr x12, x13, #32 
-; GISEL-NEXT: orr x9, x9, x13, lsl #32 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #32 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #32 +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #32 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #32 +; GISEL-NEXT: extr x10, x15, x14, #32 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #32 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5044,30 +5037,22 @@ define void @test_lshr_i512_const_32(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_32: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x13, x9, #32 -; GISEL-NEXT: lsl x15, x10, #32 -; GISEL-NEXT: orr x11, x12, x11, lsr #32 -; GISEL-NEXT: orr x8, x13, x8, lsr #32 -; GISEL-NEXT: lsl x13, x14, #32 -; GISEL-NEXT: orr x9, x15, x9, lsr #32 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #32 -; GISEL-NEXT: lsl x8, x16, #32 -; GISEL-NEXT: lsl x11, x12, #32 -; GISEL-NEXT: lsl x13, x15, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #32 -; GISEL-NEXT: lsr x10, x16, #32 -; GISEL-NEXT: orr x11, x11, x14, lsr #32 -; GISEL-NEXT: orr x9, x13, x12, lsr #32 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: ldp x14, x15, [x1, #48] 
+; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #32 +; GISEL-NEXT: extr x9, x13, x12, #32 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #32 +; GISEL-NEXT: extr x8, x15, x14, #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #32 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5098,32 +5083,24 @@ define void @test_ashr_i512_const_32(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_32: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x15, x9, #32 -; GISEL-NEXT: lsl x16, x10, #32 -; GISEL-NEXT: orr x11, x12, x11, lsr #32 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x8, x15, x8, lsr #32 -; GISEL-NEXT: lsl x15, x13, #32 -; GISEL-NEXT: orr x9, x16, x9, lsr #32 -; GISEL-NEXT: asr x16, x17, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x14, #32 -; GISEL-NEXT: orr x10, x15, x10, lsr #32 -; GISEL-NEXT: lsl x15, x12, #32 -; GISEL-NEXT: orr x8, x11, x13, lsr #32 -; GISEL-NEXT: lsl x11, x17, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x15, x14, lsr #32 -; GISEL-NEXT: lsl x13, x16, #32 -; GISEL-NEXT: orr x10, x11, x12, lsr #32 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: orr x8, x13, x17, asr #32 -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: asr x8, x13, #63 +; GISEL-NEXT: extr x11, x14, x11, #32 +; GISEL-NEXT: extr x9, x15, x14, #32 +; GISEL-NEXT: 
lsl x8, x8, #32 +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: extr x10, x12, x15, #32 +; GISEL-NEXT: extr x11, x13, x12, #32 +; GISEL-NEXT: orr x8, x8, x13, asr #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x11, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5252,23 +5229,17 @@ define void @test_shl_i512_const_96(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #32 -; GISEL-NEXT: lsr x16, x9, #32 -; GISEL-NEXT: lsl x8, x8, #32 -; GISEL-NEXT: orr x9, x14, x9, lsl #32 -; GISEL-NEXT: lsr x14, x10, #32 -; GISEL-NEXT: orr x10, x16, x10, lsl #32 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #32 -; GISEL-NEXT: orr x11, x14, x11, lsl #32 -; GISEL-NEXT: lsr x14, x12, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #32 -; GISEL-NEXT: orr x8, x8, x12, lsl #32 -; GISEL-NEXT: orr x10, x14, x13, lsl #32 -; GISEL-NEXT: orr x9, x9, x15, lsl #32 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #32 +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #32 +; GISEL-NEXT: extr x9, x13, x12, #32 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #32 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5297,27 +5268,21 @@ define void @test_lshr_i512_const_96(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_96: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x13, x9, #32 -; GISEL-NEXT: orr 
x10, x12, x10, lsr #32 -; GISEL-NEXT: lsl x12, x11, #32 -; GISEL-NEXT: orr x8, x13, x8, lsr #32 -; GISEL-NEXT: lsl x13, x14, #32 -; GISEL-NEXT: orr x9, x12, x9, lsr #32 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #32 -; GISEL-NEXT: orr x11, x13, x11, lsr #32 -; GISEL-NEXT: lsl x12, x16, #32 -; GISEL-NEXT: orr x8, x10, x14, lsr #32 -; GISEL-NEXT: lsr x10, x16, #32 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #32 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #32 +; GISEL-NEXT: extr x9, x13, x12, #32 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #32 +; GISEL-NEXT: lsr x8, x14, #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5347,29 +5312,23 @@ define void @test_ashr_i512_const_96(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_96: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x11, [x1, #8] -; GISEL-NEXT: ldp x10, x13, [x1, #32] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x14, x9, #32 -; GISEL-NEXT: lsl x15, x10, #32 -; GISEL-NEXT: orr x11, x12, x11, lsr #32 -; GISEL-NEXT: ldp x12, x16, [x1, #48] -; GISEL-NEXT: orr x8, x14, x8, lsr #32 -; GISEL-NEXT: lsl x14, x13, #32 -; GISEL-NEXT: orr x9, x15, x9, lsr #32 -; GISEL-NEXT: asr x15, x16, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x12, #32 -; GISEL-NEXT: orr x10, x14, x10, lsr #32 -; GISEL-NEXT: lsl x14, x16, #32 -; GISEL-NEXT: orr x8, x11, x13, lsr #32 +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x13, [x1, 
#40] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x14, x12, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: asr x15, x12, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x13, x11, #32 +; GISEL-NEXT: extr x9, x14, x13, #32 ; GISEL-NEXT: lsl x11, x15, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x14, x12, lsr #32 -; GISEL-NEXT: orr x10, x11, x16, asr #32 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x12, x14, #32 +; GISEL-NEXT: orr x8, x11, x12, asr #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, x15, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5404,28 +5363,21 @@ define void @test_shl_i512_const_1(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_1: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #63 -; GISEL-NEXT: lsr x13, x9, #63 -; GISEL-NEXT: lsl x8, x8, #1 -; GISEL-NEXT: orr x9, x10, x9, lsl #1 -; GISEL-NEXT: lsr x10, x11, #63 -; GISEL-NEXT: orr x11, x13, x11, lsl #1 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #63 -; GISEL-NEXT: orr x10, x10, x12, lsl #1 -; GISEL-NEXT: lsr x12, x14, #63 -; GISEL-NEXT: lsr x9, x15, #63 -; GISEL-NEXT: orr x8, x8, x14, lsl #1 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #1 -; GISEL-NEXT: lsr x12, x13, #63 -; GISEL-NEXT: orr x9, x9, x13, lsl #1 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #1 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #1 +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; 
GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #63 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #63 +; GISEL-NEXT: extr x10, x15, x14, #63 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #63 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5457,30 +5409,22 @@ define void @test_lshr_i512_const_1(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_1: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x13, x9, #63 -; GISEL-NEXT: lsl x15, x10, #63 -; GISEL-NEXT: orr x11, x12, x11, lsr #1 -; GISEL-NEXT: orr x8, x13, x8, lsr #1 -; GISEL-NEXT: lsl x13, x14, #63 -; GISEL-NEXT: orr x9, x15, x9, lsr #1 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #1 -; GISEL-NEXT: lsl x8, x16, #63 -; GISEL-NEXT: lsl x11, x12, #63 -; GISEL-NEXT: lsl x13, x15, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #1 -; GISEL-NEXT: lsr x10, x16, #1 -; GISEL-NEXT: orr x11, x11, x14, lsr #1 -; GISEL-NEXT: orr x9, x13, x12, lsr #1 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #1 +; GISEL-NEXT: extr x9, x13, x12, #1 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #1 +; GISEL-NEXT: extr x8, x15, x14, #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr 
x9, x15, #1 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5512,32 +5456,24 @@ define void @test_ashr_i512_const_1(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_1: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x15, x9, #63 -; GISEL-NEXT: lsl x16, x10, #63 -; GISEL-NEXT: orr x11, x12, x11, lsr #1 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x8, x15, x8, lsr #1 -; GISEL-NEXT: lsl x15, x13, #63 -; GISEL-NEXT: orr x9, x16, x9, lsr #1 -; GISEL-NEXT: asr x16, x17, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x14, #63 -; GISEL-NEXT: orr x10, x15, x10, lsr #1 -; GISEL-NEXT: lsl x15, x12, #63 -; GISEL-NEXT: orr x8, x11, x13, lsr #1 -; GISEL-NEXT: lsl x11, x17, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x15, x14, lsr #1 -; GISEL-NEXT: lsl x13, x16, #63 -; GISEL-NEXT: orr x10, x11, x12, lsr #1 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: orr x8, x13, x17, asr #1 -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: asr x8, x13, #63 +; GISEL-NEXT: extr x11, x14, x11, #1 +; GISEL-NEXT: extr x9, x15, x14, #1 +; GISEL-NEXT: lsl x8, x8, #63 +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: extr x10, x12, x15, #1 +; GISEL-NEXT: extr x11, x13, x12, #1 +; GISEL-NEXT: orr x8, x8, x13, asr #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x11, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5571,28 +5507,21 @@ define void 
@test_shl_i512_const_15(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_15: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #49 -; GISEL-NEXT: lsr x13, x9, #49 -; GISEL-NEXT: lsl x8, x8, #15 -; GISEL-NEXT: orr x9, x10, x9, lsl #15 -; GISEL-NEXT: lsr x10, x11, #49 -; GISEL-NEXT: orr x11, x13, x11, lsl #15 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #49 -; GISEL-NEXT: orr x10, x10, x12, lsl #15 -; GISEL-NEXT: lsr x12, x14, #49 -; GISEL-NEXT: lsr x9, x15, #49 -; GISEL-NEXT: orr x8, x8, x14, lsl #15 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #15 -; GISEL-NEXT: lsr x12, x13, #49 -; GISEL-NEXT: orr x9, x9, x13, lsl #15 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #15 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #15 +; GISEL-NEXT: extr x8, x9, x8, #49 +; GISEL-NEXT: extr x9, x10, x9, #49 +; GISEL-NEXT: extr x10, x11, x10, #49 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #49 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #49 +; GISEL-NEXT: extr x10, x15, x14, #49 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #49 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5624,30 +5553,22 @@ define void @test_lshr_i512_const_15(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_15: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #49 -; GISEL-NEXT: lsl x13, x9, #49 -; GISEL-NEXT: lsl x15, x10, #49 -; GISEL-NEXT: orr 
x11, x12, x11, lsr #15 -; GISEL-NEXT: orr x8, x13, x8, lsr #15 -; GISEL-NEXT: lsl x13, x14, #49 -; GISEL-NEXT: orr x9, x15, x9, lsr #15 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #15 -; GISEL-NEXT: lsl x8, x16, #49 -; GISEL-NEXT: lsl x11, x12, #49 -; GISEL-NEXT: lsl x13, x15, #49 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #15 -; GISEL-NEXT: lsr x10, x16, #15 -; GISEL-NEXT: orr x11, x11, x14, lsr #15 -; GISEL-NEXT: orr x9, x13, x12, lsr #15 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #15 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #15 +; GISEL-NEXT: extr x10, x11, x10, #15 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #15 +; GISEL-NEXT: extr x9, x13, x12, #15 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #15 +; GISEL-NEXT: extr x8, x15, x14, #15 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #15 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5679,32 +5600,24 @@ define void @test_ashr_i512_const_15(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_15: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #49 -; GISEL-NEXT: lsl x15, x9, #49 -; GISEL-NEXT: lsl x16, x10, #49 -; GISEL-NEXT: orr x11, x12, x11, lsr #15 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x8, x15, x8, lsr #15 -; GISEL-NEXT: lsl x15, x13, #49 -; GISEL-NEXT: orr x9, x16, x9, lsr #15 -; GISEL-NEXT: asr x16, x17, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x14, #49 -; GISEL-NEXT: orr x10, x15, x10, 
lsr #15 -; GISEL-NEXT: lsl x15, x12, #49 -; GISEL-NEXT: orr x8, x11, x13, lsr #15 -; GISEL-NEXT: lsl x11, x17, #49 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x15, x14, lsr #15 -; GISEL-NEXT: lsl x13, x16, #49 -; GISEL-NEXT: orr x10, x11, x12, lsr #15 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: orr x8, x13, x17, asr #15 -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #15 +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: extr x9, x10, x9, #15 +; GISEL-NEXT: extr x10, x11, x10, #15 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: asr x8, x13, #63 +; GISEL-NEXT: extr x11, x14, x11, #15 +; GISEL-NEXT: extr x9, x15, x14, #15 +; GISEL-NEXT: lsl x8, x8, #49 +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: extr x10, x12, x15, #15 +; GISEL-NEXT: extr x11, x13, x12, #15 +; GISEL-NEXT: orr x8, x8, x13, asr #15 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x11, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5738,28 +5651,21 @@ define void @test_shl_i512_const_63(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_63: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #1 -; GISEL-NEXT: lsr x13, x9, #1 -; GISEL-NEXT: lsl x8, x8, #63 -; GISEL-NEXT: orr x9, x10, x9, lsl #63 -; GISEL-NEXT: lsr x10, x11, #1 -; GISEL-NEXT: orr x11, x13, x11, lsl #63 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #1 -; GISEL-NEXT: orr x10, x10, x12, lsl #63 -; GISEL-NEXT: lsr x12, x14, #1 -; GISEL-NEXT: lsr x9, x15, #1 -; GISEL-NEXT: orr x8, x8, x14, lsl #63 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #63 -; GISEL-NEXT: lsr x12, x13, #1 -; GISEL-NEXT: orr x9, x9, x13, lsl #63 -; 
GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #63 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #63 +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #1 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #1 +; GISEL-NEXT: extr x10, x15, x14, #1 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #1 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5791,30 +5697,22 @@ define void @test_lshr_i512_const_63(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_63: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x13, x9, #1 -; GISEL-NEXT: lsl x15, x10, #1 -; GISEL-NEXT: orr x11, x12, x11, lsr #63 -; GISEL-NEXT: orr x8, x13, x8, lsr #63 -; GISEL-NEXT: lsl x13, x14, #1 -; GISEL-NEXT: orr x9, x15, x9, lsr #63 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #63 -; GISEL-NEXT: lsl x8, x16, #1 -; GISEL-NEXT: lsl x11, x12, #1 -; GISEL-NEXT: lsl x13, x15, #1 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #63 -; GISEL-NEXT: lsr x10, x16, #63 -; GISEL-NEXT: orr x11, x11, x14, lsr #63 -; GISEL-NEXT: orr x9, x13, x12, lsr #63 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr 
x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #63 +; GISEL-NEXT: extr x8, x15, x14, #63 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #63 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5846,30 +5744,22 @@ define void @test_ashr_i512_const_63(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_63: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x10, [x1] -; GISEL-NEXT: ldp x11, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x15, x9, #1 -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x16, x11, #1 -; GISEL-NEXT: orr x8, x15, x8, lsr #63 -; GISEL-NEXT: lsl x15, x13, #1 -; GISEL-NEXT: orr x10, x12, x10, lsr #63 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x9, x16, x9, lsr #63 -; GISEL-NEXT: orr x11, x15, x11, lsr #63 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x8, x17, #1 -; GISEL-NEXT: lsl x16, x14, #1 -; GISEL-NEXT: lsl x10, x12, #1 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: asr x9, x17, #63 -; GISEL-NEXT: orr x8, x8, x12, lsr #63 -; GISEL-NEXT: orr x13, x16, x13, lsr #63 -; GISEL-NEXT: orr x10, x10, x14, lsr #63 -; GISEL-NEXT: orr x9, x9, x9, lsl #1 -; GISEL-NEXT: stp x13, x10, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: extr x11, x14, x13, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: asr x10, x15, #63 +; GISEL-NEXT: extr x8, x15, x14, #63 +; GISEL-NEXT: stp x9, x11, [x0, #32] +; 
GISEL-NEXT: orr x9, x10, x10, lsl #1 ; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: @@ -5906,23 +5796,17 @@ define void @test_shl_i512_const_65(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #63 -; GISEL-NEXT: lsr x16, x9, #63 -; GISEL-NEXT: lsl x8, x8, #1 -; GISEL-NEXT: orr x9, x14, x9, lsl #1 -; GISEL-NEXT: lsr x14, x10, #63 -; GISEL-NEXT: orr x10, x16, x10, lsl #1 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #63 -; GISEL-NEXT: orr x11, x14, x11, lsl #1 -; GISEL-NEXT: lsr x14, x12, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #63 -; GISEL-NEXT: orr x8, x8, x12, lsl #1 -; GISEL-NEXT: orr x10, x14, x13, lsl #1 -; GISEL-NEXT: orr x9, x9, x15, lsl #1 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #1 +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #63 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5953,27 +5837,21 @@ define void @test_lshr_i512_const_65(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_65: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x13, x9, #63 -; GISEL-NEXT: orr x10, x12, x10, lsr #1 -; GISEL-NEXT: lsl x12, x11, #63 -; GISEL-NEXT: orr x8, x13, x8, lsr #1 -; GISEL-NEXT: lsl x13, x14, #63 -; GISEL-NEXT: orr x9, x12, x9, lsr #1 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #63 
-; GISEL-NEXT: orr x11, x13, x11, lsr #1 -; GISEL-NEXT: lsl x12, x16, #63 -; GISEL-NEXT: orr x8, x10, x14, lsr #1 -; GISEL-NEXT: lsr x10, x16, #1 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #1 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #1 +; GISEL-NEXT: extr x9, x13, x12, #1 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #1 +; GISEL-NEXT: lsr x8, x14, #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6005,29 +5883,23 @@ define void @test_ashr_i512_const_65(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_65: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x11, [x1, #8] -; GISEL-NEXT: ldp x10, x13, [x1, #32] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x14, x9, #63 -; GISEL-NEXT: lsl x15, x10, #63 -; GISEL-NEXT: orr x11, x12, x11, lsr #1 -; GISEL-NEXT: ldp x12, x16, [x1, #48] -; GISEL-NEXT: orr x8, x14, x8, lsr #1 -; GISEL-NEXT: lsl x14, x13, #63 -; GISEL-NEXT: orr x9, x15, x9, lsr #1 -; GISEL-NEXT: asr x15, x16, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x12, #63 -; GISEL-NEXT: orr x10, x14, x10, lsr #1 -; GISEL-NEXT: lsl x14, x16, #63 -; GISEL-NEXT: orr x8, x11, x13, lsr #1 +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x13, [x1, #40] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x14, x12, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: asr x15, x12, #63 +; GISEL-NEXT: stp x8, x9, 
[x0] +; GISEL-NEXT: extr x8, x13, x11, #1 +; GISEL-NEXT: extr x9, x14, x13, #1 ; GISEL-NEXT: lsl x11, x15, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x14, x12, lsr #1 -; GISEL-NEXT: orr x10, x11, x16, asr #1 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x12, x14, #1 +; GISEL-NEXT: orr x8, x11, x12, asr #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, x15, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6062,23 +5934,17 @@ define void @test_shl_i512_const_100(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #28 -; GISEL-NEXT: lsr x16, x9, #28 -; GISEL-NEXT: lsl x8, x8, #36 -; GISEL-NEXT: orr x9, x14, x9, lsl #36 -; GISEL-NEXT: lsr x14, x10, #28 -; GISEL-NEXT: orr x10, x16, x10, lsl #36 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #28 -; GISEL-NEXT: orr x11, x14, x11, lsl #36 -; GISEL-NEXT: lsr x14, x12, #28 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #28 -; GISEL-NEXT: orr x8, x8, x12, lsl #36 -; GISEL-NEXT: orr x10, x14, x13, lsl #36 -; GISEL-NEXT: orr x9, x9, x15, lsl #36 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #36 +; GISEL-NEXT: extr x8, x9, x8, #28 +; GISEL-NEXT: extr x9, x10, x9, #28 +; GISEL-NEXT: extr x10, x11, x10, #28 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #28 +; GISEL-NEXT: extr x9, x13, x12, #28 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #28 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6109,27 +5975,21 @@ define void @test_lshr_i512_const_100(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_100: ; GISEL: 
; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #28 -; GISEL-NEXT: lsl x13, x9, #28 -; GISEL-NEXT: orr x10, x12, x10, lsr #36 -; GISEL-NEXT: lsl x12, x11, #28 -; GISEL-NEXT: orr x8, x13, x8, lsr #36 -; GISEL-NEXT: lsl x13, x14, #28 -; GISEL-NEXT: orr x9, x12, x9, lsr #36 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #28 -; GISEL-NEXT: orr x11, x13, x11, lsr #36 -; GISEL-NEXT: lsl x12, x16, #28 -; GISEL-NEXT: orr x8, x10, x14, lsr #36 -; GISEL-NEXT: lsr x10, x16, #36 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #36 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #36 +; GISEL-NEXT: extr x9, x10, x9, #36 +; GISEL-NEXT: extr x10, x11, x10, #36 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #36 +; GISEL-NEXT: extr x9, x13, x12, #36 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #36 +; GISEL-NEXT: lsr x8, x14, #36 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6161,29 +6021,23 @@ define void @test_ashr_i512_const_100(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_100: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x11, [x1, #8] -; GISEL-NEXT: ldp x10, x13, [x1, #32] -; GISEL-NEXT: lsl x12, x8, #28 -; GISEL-NEXT: lsl x14, x9, #28 -; GISEL-NEXT: lsl x15, x10, #28 -; GISEL-NEXT: orr x11, x12, x11, lsr #36 -; GISEL-NEXT: ldp x12, x16, [x1, #48] -; GISEL-NEXT: orr x8, x14, x8, lsr #36 -; GISEL-NEXT: lsl x14, x13, #28 -; GISEL-NEXT: orr x9, x15, x9, lsr #36 -; GISEL-NEXT: asr x15, x16, 
#63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x12, #28 -; GISEL-NEXT: orr x10, x14, x10, lsr #36 -; GISEL-NEXT: lsl x14, x16, #28 -; GISEL-NEXT: orr x8, x11, x13, lsr #36 +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x13, [x1, #40] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x14, x12, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #36 +; GISEL-NEXT: extr x9, x10, x9, #36 +; GISEL-NEXT: extr x10, x11, x10, #36 +; GISEL-NEXT: asr x15, x12, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x13, x11, #36 +; GISEL-NEXT: extr x9, x14, x13, #36 ; GISEL-NEXT: lsl x11, x15, #28 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x14, x12, lsr #36 -; GISEL-NEXT: orr x10, x11, x16, asr #36 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x12, x14, #36 +; GISEL-NEXT: orr x8, x11, x12, asr #36 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, x15, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6219,23 +6073,17 @@ define void @test_shl_i512_const_127(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #1 -; GISEL-NEXT: lsr x16, x9, #1 -; GISEL-NEXT: lsl x8, x8, #63 -; GISEL-NEXT: orr x9, x14, x9, lsl #63 -; GISEL-NEXT: lsr x14, x10, #1 -; GISEL-NEXT: orr x10, x16, x10, lsl #63 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #1 -; GISEL-NEXT: orr x11, x14, x11, lsl #63 -; GISEL-NEXT: lsr x14, x12, #1 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #1 -; GISEL-NEXT: orr x8, x8, x12, lsl #63 -; GISEL-NEXT: orr x10, x14, x13, lsl #63 -; GISEL-NEXT: orr x9, x9, x15, lsl #63 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #63 +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; 
GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #1 +; GISEL-NEXT: extr x9, x13, x12, #1 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #1 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6266,27 +6114,21 @@ define void @test_lshr_i512_const_127(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_127: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x13, x9, #1 -; GISEL-NEXT: orr x10, x12, x10, lsr #63 -; GISEL-NEXT: lsl x12, x11, #1 -; GISEL-NEXT: orr x8, x13, x8, lsr #63 -; GISEL-NEXT: lsl x13, x14, #1 -; GISEL-NEXT: orr x9, x12, x9, lsr #63 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #1 -; GISEL-NEXT: orr x11, x13, x11, lsr #63 -; GISEL-NEXT: lsl x12, x16, #1 -; GISEL-NEXT: orr x8, x10, x14, lsr #63 -; GISEL-NEXT: lsr x10, x16, #63 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #63 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #63 +; GISEL-NEXT: lsr x8, x14, #63 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6317,28 +6159,22 @@ define void @test_ashr_i512_const_127(ptr %result, ptr %input) { 
; ; GISEL-LABEL: test_ashr_i512_const_127: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x13, x9, #1 -; GISEL-NEXT: orr x10, x12, x10, lsr #63 -; GISEL-NEXT: lsl x12, x11, #1 -; GISEL-NEXT: orr x8, x13, x8, lsr #63 -; GISEL-NEXT: lsl x13, x14, #1 -; GISEL-NEXT: orr x9, x12, x9, lsr #63 -; GISEL-NEXT: lsl x12, x15, #1 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x16, #1 -; GISEL-NEXT: orr x11, x13, x11, lsr #63 -; GISEL-NEXT: asr x8, x16, #63 -; GISEL-NEXT: orr x12, x12, x14, lsr #63 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x10, x15, lsr #63 -; GISEL-NEXT: orr x10, x8, x8, lsl #1 -; GISEL-NEXT: stp x12, x9, [x0, #32] -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: asr x9, x14, #63 +; GISEL-NEXT: extr x11, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #63 +; GISEL-NEXT: orr x8, x9, x9, lsl #1 +; GISEL-NEXT: stp x11, x10, [x0, #32] +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 diff --git a/llvm/test/CodeGen/AArch64/adc.ll b/llvm/test/CodeGen/AArch64/adc.ll index 12e8bf26c9eac..03f3cf192102d 100644 --- a/llvm/test/CodeGen/AArch64/adc.ll +++ b/llvm/test/CodeGen/AArch64/adc.ll @@ -71,9 +71,8 @@ define i128 @test_shifted(i128 %a, i128 %b) { ; ; CHECK-GI-LABEL: test_shifted: ; CHECK-GI: ; %bb.0: -; CHECK-GI-NEXT: lsr x8, x2, #19 +; CHECK-GI-NEXT: extr x8, x3, x2, #19 ; CHECK-GI-NEXT: adds x0, x0, x2, lsl #45 -; CHECK-GI-NEXT: orr x8, x8, 
x3, lsl #45 ; CHECK-GI-NEXT: adc x1, x1, x8 ; CHECK-GI-NEXT: ret %rhs = shl i128 %b, 45 @@ -108,8 +107,7 @@ define i128 @test_extended(i128 %a, i16 %b) { ; CHECK-GI-NEXT: sxth x8, w2 ; CHECK-GI-NEXT: adds x0, x0, w2, sxth #3 ; CHECK-GI-NEXT: asr x9, x8, #63 -; CHECK-GI-NEXT: lsr x8, x8, #61 -; CHECK-GI-NEXT: orr x8, x8, x9, lsl #3 +; CHECK-GI-NEXT: extr x8, x9, x8, #61 ; CHECK-GI-NEXT: adc x1, x1, x8 ; CHECK-GI-NEXT: ret %ext = sext i16 %b to i128 diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll index 765f6b77b41a9..7f07ef476b8aa 100644 --- a/llvm/test/CodeGen/AArch64/fsh.ll +++ b/llvm/test/CodeGen/AArch64/fsh.ll @@ -510,41 +510,40 @@ define i128 @fshl_i128(i128 %a, i128 %b, i128 %c) { ; ; CHECK-GI-LABEL: fshl_i128: ; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #64 // =0x40 ; CHECK-GI-NEXT: and x9, x4, #0x7f -; CHECK-GI-NEXT: mov w10, #64 // =0x40 -; CHECK-GI-NEXT: lsl x14, x3, #63 -; CHECK-GI-NEXT: sub x12, x10, x9 +; CHECK-GI-NEXT: mov w10, #127 // =0x7f +; CHECK-GI-NEXT: sub x12, x8, x9 ; CHECK-GI-NEXT: lsl x13, x1, x9 -; CHECK-GI-NEXT: mov w8, #127 // =0x7f +; CHECK-GI-NEXT: bic x10, x10, x4 ; CHECK-GI-NEXT: lsr x12, x0, x12 -; CHECK-GI-NEXT: bic x8, x8, x4 -; CHECK-GI-NEXT: sub x15, x9, #64 +; CHECK-GI-NEXT: sub x14, x9, #64 +; CHECK-GI-NEXT: lsl x15, x0, x9 +; CHECK-GI-NEXT: extr x16, x3, x2, #1 ; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: lsl x9, x0, x9 -; CHECK-GI-NEXT: lsl x15, x0, x15 -; CHECK-GI-NEXT: orr x12, x12, x13 -; CHECK-GI-NEXT: orr x13, x14, x2, lsr #1 -; CHECK-GI-NEXT: lsr x14, x3, #1 -; CHECK-GI-NEXT: sub x10, x10, x8 -; CHECK-GI-NEXT: sub x16, x8, #64 -; CHECK-GI-NEXT: csel x9, x9, xzr, lo -; CHECK-GI-NEXT: lsr x17, x13, x8 -; CHECK-GI-NEXT: lsl x10, x14, x10 -; CHECK-GI-NEXT: csel x12, x12, x15, lo +; CHECK-GI-NEXT: sub x8, x8, x10 +; CHECK-GI-NEXT: orr x9, x12, x13 +; CHECK-GI-NEXT: lsr x12, x3, #1 +; CHECK-GI-NEXT: lsl x13, x0, x14 +; CHECK-GI-NEXT: csel x14, x15, xzr, lo +; CHECK-GI-NEXT: sub 
x15, x10, #64 +; CHECK-GI-NEXT: lsr x17, x16, x10 +; CHECK-GI-NEXT: lsl x8, x12, x8 +; CHECK-GI-NEXT: csel x9, x9, x13, lo ; CHECK-GI-NEXT: tst x4, #0x7f -; CHECK-GI-NEXT: lsr x15, x14, x16 +; CHECK-GI-NEXT: lsr x13, x12, x15 ; CHECK-GI-NEXT: mvn x11, x4 -; CHECK-GI-NEXT: csel x12, x1, x12, eq -; CHECK-GI-NEXT: orr x10, x17, x10 -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: lsr x14, x14, x8 -; CHECK-GI-NEXT: csel x10, x10, x15, lo +; CHECK-GI-NEXT: csel x9, x1, x9, eq +; CHECK-GI-NEXT: orr x8, x17, x8 +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: lsr x12, x12, x10 +; CHECK-GI-NEXT: csel x8, x8, x13, lo ; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: csel x10, x13, x10, eq -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: csel x8, x14, xzr, lo -; CHECK-GI-NEXT: orr x0, x9, x10 -; CHECK-GI-NEXT: orr x1, x12, x8 +; CHECK-GI-NEXT: csel x8, x16, x8, eq +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: csel x10, x12, xzr, lo +; CHECK-GI-NEXT: orr x0, x14, x8 +; CHECK-GI-NEXT: orr x1, x9, x10 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 %c) @@ -571,41 +570,40 @@ define i128 @fshr_i128(i128 %a, i128 %b, i128 %c) { ; ; CHECK-GI-LABEL: fshr_i128: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x0, #63 -; CHECK-GI-NEXT: mov w9, #127 // =0x7f -; CHECK-GI-NEXT: mov w10, #64 // =0x40 -; CHECK-GI-NEXT: bic x9, x9, x4 -; CHECK-GI-NEXT: lsl x11, x0, #1 -; CHECK-GI-NEXT: and x12, x4, #0x7f -; CHECK-GI-NEXT: orr x8, x8, x1, lsl #1 -; CHECK-GI-NEXT: sub x14, x10, x9 -; CHECK-GI-NEXT: sub x17, x9, #64 -; CHECK-GI-NEXT: lsl x15, x11, x9 -; CHECK-GI-NEXT: lsr x14, x11, x14 -; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: lsl x16, x8, x9 -; CHECK-GI-NEXT: sub x9, x10, x12 -; CHECK-GI-NEXT: lsl x10, x11, x17 -; CHECK-GI-NEXT: mvn x13, x4 -; CHECK-GI-NEXT: csel x11, x15, xzr, lo -; CHECK-GI-NEXT: sub x15, x12, #64 -; CHECK-GI-NEXT: orr x14, x14, x16 -; CHECK-GI-NEXT: lsr x16, x2, x12 -; CHECK-GI-NEXT: lsl x9, x3, x9 -; CHECK-GI-NEXT: csel x10, 
x14, x10, lo -; CHECK-GI-NEXT: tst x13, #0x7f -; CHECK-GI-NEXT: lsr x13, x3, x15 -; CHECK-GI-NEXT: csel x8, x8, x10, eq -; CHECK-GI-NEXT: orr x9, x16, x9 -; CHECK-GI-NEXT: cmp x12, #64 -; CHECK-GI-NEXT: lsr x10, x3, x12 -; CHECK-GI-NEXT: csel x9, x9, x13, lo +; CHECK-GI-NEXT: mov w8, #127 // =0x7f +; CHECK-GI-NEXT: lsl x9, x0, #1 +; CHECK-GI-NEXT: extr x10, x1, x0, #63 +; CHECK-GI-NEXT: bic x8, x8, x4 +; CHECK-GI-NEXT: mov w11, #64 // =0x40 +; CHECK-GI-NEXT: and x14, x4, #0x7f +; CHECK-GI-NEXT: sub x12, x11, x8 +; CHECK-GI-NEXT: lsl x13, x10, x8 +; CHECK-GI-NEXT: lsl x16, x9, x8 +; CHECK-GI-NEXT: lsr x12, x9, x12 +; CHECK-GI-NEXT: sub x17, x8, #64 +; CHECK-GI-NEXT: cmp x8, #64 +; CHECK-GI-NEXT: lsl x8, x9, x17 +; CHECK-GI-NEXT: sub x11, x11, x14 +; CHECK-GI-NEXT: mvn x15, x4 +; CHECK-GI-NEXT: orr x12, x12, x13 +; CHECK-GI-NEXT: csel x9, x16, xzr, lo +; CHECK-GI-NEXT: sub x13, x14, #64 +; CHECK-GI-NEXT: lsr x16, x2, x14 +; CHECK-GI-NEXT: lsl x11, x3, x11 +; CHECK-GI-NEXT: csel x8, x12, x8, lo +; CHECK-GI-NEXT: tst x15, #0x7f +; CHECK-GI-NEXT: lsr x12, x3, x13 +; CHECK-GI-NEXT: csel x8, x10, x8, eq +; CHECK-GI-NEXT: orr x10, x16, x11 +; CHECK-GI-NEXT: cmp x14, #64 +; CHECK-GI-NEXT: lsr x11, x3, x14 +; CHECK-GI-NEXT: csel x10, x10, x12, lo ; CHECK-GI-NEXT: tst x4, #0x7f -; CHECK-GI-NEXT: csel x9, x2, x9, eq -; CHECK-GI-NEXT: cmp x12, #64 -; CHECK-GI-NEXT: csel x10, x10, xzr, lo -; CHECK-GI-NEXT: orr x0, x11, x9 -; CHECK-GI-NEXT: orr x1, x8, x10 +; CHECK-GI-NEXT: csel x10, x2, x10, eq +; CHECK-GI-NEXT: cmp x14, #64 +; CHECK-GI-NEXT: csel x11, x11, xzr, lo +; CHECK-GI-NEXT: orr x0, x9, x10 +; CHECK-GI-NEXT: orr x1, x8, x11 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 %c) @@ -720,10 +718,9 @@ define i128 @rotl_i128_c(i128 %a) { ; ; CHECK-GI-LABEL: rotl_i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x0, #61 -; CHECK-GI-NEXT: lsr x9, x1, #61 -; CHECK-GI-NEXT: orr x1, x8, x1, lsl #3 -; CHECK-GI-NEXT: orr x0, x9, x0, lsl #3 
+; CHECK-GI-NEXT: extr x8, x1, x0, #61 +; CHECK-GI-NEXT: extr x0, x0, x1, #61 +; CHECK-GI-NEXT: mov x1, x8 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshl(i128 %a, i128 %a, i128 3) @@ -731,20 +728,12 @@ entry: } define i128 @rotr_i128_c(i128 %a) { -; CHECK-SD-LABEL: rotr_i128_c: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: extr x8, x1, x0, #3 -; CHECK-SD-NEXT: extr x1, x0, x1, #3 -; CHECK-SD-NEXT: mov x0, x8 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: rotr_i128_c: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x1, #61 -; CHECK-GI-NEXT: lsl x9, x0, #61 -; CHECK-GI-NEXT: orr x0, x8, x0, lsr #3 -; CHECK-GI-NEXT: orr x1, x9, x1, lsr #3 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: rotr_i128_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: extr x8, x1, x0, #3 +; CHECK-NEXT: extr x1, x0, x1, #3 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret entry: %d = call i128 @llvm.fshr(i128 %a, i128 %a, i128 3) ret i128 %d @@ -868,10 +857,8 @@ define i128 @fshl_i128_c(i128 %a, i128 %b) { ; ; CHECK-GI-LABEL: fshl_i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x0, #61 -; CHECK-GI-NEXT: lsr x9, x3, #61 -; CHECK-GI-NEXT: orr x1, x8, x1, lsl #3 -; CHECK-GI-NEXT: orr x0, x9, x0, lsl #3 +; CHECK-GI-NEXT: extr x1, x1, x0, #61 +; CHECK-GI-NEXT: extr x0, x0, x3, #61 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 3) @@ -879,21 +866,12 @@ entry: } define i128 @fshr_i128_c(i128 %a, i128 %b) { -; CHECK-SD-LABEL: fshr_i128_c: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: extr x8, x3, x2, #3 -; CHECK-SD-NEXT: extr x1, x0, x3, #3 -; CHECK-SD-NEXT: mov x0, x8 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fshr_i128_c: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x3, #61 -; CHECK-GI-NEXT: lsr x9, x3, #3 -; CHECK-GI-NEXT: orr x8, x8, x2, lsr #3 -; CHECK-GI-NEXT: orr x1, x9, x0, lsl #61 -; CHECK-GI-NEXT: mov x0, x8 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fshr_i128_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: extr x8, x3, x2, 
#3 +; CHECK-NEXT: extr x1, x0, x3, #3 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret entry: %d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 3) ret i128 %d @@ -3013,75 +2991,73 @@ define <2 x i128> @fshl_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) { ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: .cfi_offset w19, -16 ; CHECK-GI-NEXT: ldr x11, [sp, #16] -; CHECK-GI-NEXT: mov w10, #64 // =0x40 +; CHECK-GI-NEXT: mov w9, #64 // =0x40 ; CHECK-GI-NEXT: ldr x12, [sp, #32] ; CHECK-GI-NEXT: mov w13, #127 // =0x7f -; CHECK-GI-NEXT: and x9, x11, #0x7f +; CHECK-GI-NEXT: and x8, x11, #0x7f ; CHECK-GI-NEXT: and x14, x12, #0x7f -; CHECK-GI-NEXT: mvn x15, x11 -; CHECK-GI-NEXT: sub x8, x10, x9 -; CHECK-GI-NEXT: sub x16, x9, #64 -; CHECK-GI-NEXT: lsl x19, x1, x9 -; CHECK-GI-NEXT: lsr x18, x0, x8 -; CHECK-GI-NEXT: lsl x17, x0, x9 -; CHECK-GI-NEXT: lsl x16, x0, x16 -; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: bic x0, x13, x11 -; CHECK-GI-NEXT: mvn x8, x12 -; CHECK-GI-NEXT: orr x18, x18, x19 -; CHECK-GI-NEXT: csel x9, x17, xzr, lo +; CHECK-GI-NEXT: mvn x18, x11 +; CHECK-GI-NEXT: sub x10, x9, x8 +; CHECK-GI-NEXT: sub x15, x8, #64 +; CHECK-GI-NEXT: lsl x17, x1, x8 +; CHECK-GI-NEXT: lsr x16, x0, x10 +; CHECK-GI-NEXT: lsl x15, x0, x15 +; CHECK-GI-NEXT: cmp x8, #64 +; CHECK-GI-NEXT: lsl x19, x0, x8 +; CHECK-GI-NEXT: lsl x0, x3, x14 +; CHECK-GI-NEXT: mvn x10, x12 +; CHECK-GI-NEXT: orr x16, x16, x17 ; CHECK-GI-NEXT: sub x17, x14, #64 -; CHECK-GI-NEXT: csel x16, x18, x16, lo +; CHECK-GI-NEXT: csel x15, x16, x15, lo +; CHECK-GI-NEXT: sub x16, x9, x14 +; CHECK-GI-NEXT: csel x8, x19, xzr, lo +; CHECK-GI-NEXT: lsr x16, x2, x16 ; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: sub x11, x10, x14 -; CHECK-GI-NEXT: lsr x11, x2, x11 -; CHECK-GI-NEXT: lsl x18, x3, x14 -; CHECK-GI-NEXT: csel x16, x1, x16, eq -; CHECK-GI-NEXT: lsl x1, x2, x14 +; CHECK-GI-NEXT: lsl x19, x2, x14 ; CHECK-GI-NEXT: lsl x17, x2, x17 +; CHECK-GI-NEXT: csel x15, x1, x15, eq ; CHECK-GI-NEXT: cmp x14, #64 -; 
CHECK-GI-NEXT: lsl x14, x5, #63 -; CHECK-GI-NEXT: orr x11, x11, x18 -; CHECK-GI-NEXT: bic x13, x13, x12 -; CHECK-GI-NEXT: csel x18, x1, xzr, lo -; CHECK-GI-NEXT: csel x11, x11, x17, lo +; CHECK-GI-NEXT: orr x16, x16, x0 +; CHECK-GI-NEXT: bic x11, x13, x11 +; CHECK-GI-NEXT: csel x14, x19, xzr, lo +; CHECK-GI-NEXT: csel x16, x16, x17, lo ; CHECK-GI-NEXT: tst x12, #0x7f -; CHECK-GI-NEXT: lsr x12, x5, #1 -; CHECK-GI-NEXT: orr x14, x14, x4, lsr #1 -; CHECK-GI-NEXT: lsl x17, x7, #63 -; CHECK-GI-NEXT: sub x1, x10, x0 -; CHECK-GI-NEXT: csel x11, x3, x11, eq -; CHECK-GI-NEXT: sub x2, x0, #64 -; CHECK-GI-NEXT: lsr x3, x14, x0 -; CHECK-GI-NEXT: lsl x1, x12, x1 -; CHECK-GI-NEXT: lsr x4, x7, #1 -; CHECK-GI-NEXT: orr x17, x17, x6, lsr #1 -; CHECK-GI-NEXT: lsr x2, x12, x2 -; CHECK-GI-NEXT: cmp x0, #64 -; CHECK-GI-NEXT: orr x1, x3, x1 -; CHECK-GI-NEXT: sub x10, x10, x13 -; CHECK-GI-NEXT: lsr x12, x12, x0 -; CHECK-GI-NEXT: csel x1, x1, x2, lo -; CHECK-GI-NEXT: tst x15, #0x7f -; CHECK-GI-NEXT: sub x15, x13, #64 -; CHECK-GI-NEXT: lsr x2, x17, x13 -; CHECK-GI-NEXT: lsl x10, x4, x10 -; CHECK-GI-NEXT: csel x14, x14, x1, eq -; CHECK-GI-NEXT: cmp x0, #64 -; CHECK-GI-NEXT: lsr x15, x4, x15 -; CHECK-GI-NEXT: lsr x0, x4, x13 -; CHECK-GI-NEXT: csel x12, x12, xzr, lo -; CHECK-GI-NEXT: orr x10, x2, x10 -; CHECK-GI-NEXT: cmp x13, #64 -; CHECK-GI-NEXT: csel x10, x10, x15, lo -; CHECK-GI-NEXT: tst x8, #0x7f -; CHECK-GI-NEXT: orr x1, x16, x12 -; CHECK-GI-NEXT: csel x8, x17, x10, eq -; CHECK-GI-NEXT: cmp x13, #64 -; CHECK-GI-NEXT: csel x10, x0, xzr, lo -; CHECK-GI-NEXT: orr x0, x9, x14 -; CHECK-GI-NEXT: orr x2, x18, x8 -; CHECK-GI-NEXT: orr x3, x11, x10 +; CHECK-GI-NEXT: lsr x17, x5, #1 +; CHECK-GI-NEXT: extr x0, x5, x4, #1 +; CHECK-GI-NEXT: bic x12, x13, x12 +; CHECK-GI-NEXT: csel x13, x3, x16, eq +; CHECK-GI-NEXT: sub x16, x9, x11 +; CHECK-GI-NEXT: sub x1, x11, #64 +; CHECK-GI-NEXT: lsr x3, x7, #1 +; CHECK-GI-NEXT: lsr x2, x0, x11 +; CHECK-GI-NEXT: lsl x16, x17, x16 +; CHECK-GI-NEXT: extr x4, x7, 
x6, #1 +; CHECK-GI-NEXT: lsr x1, x17, x1 +; CHECK-GI-NEXT: cmp x11, #64 +; CHECK-GI-NEXT: sub x9, x9, x12 +; CHECK-GI-NEXT: orr x16, x2, x16 +; CHECK-GI-NEXT: lsr x17, x17, x11 +; CHECK-GI-NEXT: lsl x9, x3, x9 +; CHECK-GI-NEXT: csel x16, x16, x1, lo +; CHECK-GI-NEXT: tst x18, #0x7f +; CHECK-GI-NEXT: sub x18, x12, #64 +; CHECK-GI-NEXT: lsr x1, x4, x12 +; CHECK-GI-NEXT: csel x16, x0, x16, eq +; CHECK-GI-NEXT: cmp x11, #64 +; CHECK-GI-NEXT: lsr x11, x3, x18 +; CHECK-GI-NEXT: csel x17, x17, xzr, lo +; CHECK-GI-NEXT: cmp x12, #64 +; CHECK-GI-NEXT: orr x9, x1, x9 +; CHECK-GI-NEXT: lsr x18, x3, x12 +; CHECK-GI-NEXT: orr x0, x8, x16 +; CHECK-GI-NEXT: csel x9, x9, x11, lo +; CHECK-GI-NEXT: tst x10, #0x7f +; CHECK-GI-NEXT: orr x1, x15, x17 +; CHECK-GI-NEXT: csel x9, x4, x9, eq +; CHECK-GI-NEXT: cmp x12, #64 +; CHECK-GI-NEXT: csel x10, x18, xzr, lo +; CHECK-GI-NEXT: orr x2, x14, x9 +; CHECK-GI-NEXT: orr x3, x13, x10 ; CHECK-GI-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload ; CHECK-GI-NEXT: ret entry: @@ -3125,75 +3101,73 @@ define <2 x i128> @fshr_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) { ; CHECK-GI-LABEL: fshr_v2i128: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr x9, [sp] -; CHECK-GI-NEXT: lsl x12, x1, #1 -; CHECK-GI-NEXT: mov w11, #127 // =0x7f -; CHECK-GI-NEXT: mov w14, #64 // =0x40 -; CHECK-GI-NEXT: lsl x15, x0, #1 +; CHECK-GI-NEXT: mov w10, #127 // =0x7f +; CHECK-GI-NEXT: mov w12, #64 // =0x40 +; CHECK-GI-NEXT: lsl x13, x0, #1 +; CHECK-GI-NEXT: extr x14, x1, x0, #63 ; CHECK-GI-NEXT: ldr x8, [sp, #16] -; CHECK-GI-NEXT: bic x13, x11, x9 -; CHECK-GI-NEXT: orr x12, x12, x0, lsr #63 -; CHECK-GI-NEXT: lsl x1, x3, #1 -; CHECK-GI-NEXT: sub x17, x14, x13 -; CHECK-GI-NEXT: sub x18, x13, #64 -; CHECK-GI-NEXT: lsl x3, x15, x13 -; CHECK-GI-NEXT: lsr x17, x15, x17 -; CHECK-GI-NEXT: lsl x0, x12, x13 -; CHECK-GI-NEXT: lsl x15, x15, x18 -; CHECK-GI-NEXT: bic x11, x11, x8 +; CHECK-GI-NEXT: bic x11, x10, x9 +; CHECK-GI-NEXT: mvn x16, x9 +; CHECK-GI-NEXT: and x15, x9, 
#0x7f +; CHECK-GI-NEXT: sub x17, x12, x11 +; CHECK-GI-NEXT: sub x18, x11, #64 +; CHECK-GI-NEXT: lsl x0, x14, x11 +; CHECK-GI-NEXT: lsr x17, x13, x17 +; CHECK-GI-NEXT: lsl x1, x13, x11 +; CHECK-GI-NEXT: lsl x13, x13, x18 +; CHECK-GI-NEXT: bic x10, x10, x8 ; CHECK-GI-NEXT: lsl x18, x2, #1 -; CHECK-GI-NEXT: cmp x13, #64 +; CHECK-GI-NEXT: cmp x11, #64 ; CHECK-GI-NEXT: orr x17, x17, x0 -; CHECK-GI-NEXT: orr x13, x1, x2, lsr #63 -; CHECK-GI-NEXT: mvn x16, x9 -; CHECK-GI-NEXT: csel x15, x17, x15, lo -; CHECK-GI-NEXT: sub x17, x14, x11 -; CHECK-GI-NEXT: csel x0, x3, xzr, lo +; CHECK-GI-NEXT: extr x11, x3, x2, #63 +; CHECK-GI-NEXT: csel x0, x1, xzr, lo +; CHECK-GI-NEXT: csel x13, x17, x13, lo +; CHECK-GI-NEXT: sub x17, x12, x10 ; CHECK-GI-NEXT: tst x16, #0x7f -; CHECK-GI-NEXT: sub x16, x11, #64 +; CHECK-GI-NEXT: sub x16, x10, #64 ; CHECK-GI-NEXT: lsr x17, x18, x17 -; CHECK-GI-NEXT: lsl x2, x13, x11 -; CHECK-GI-NEXT: lsl x1, x18, x11 -; CHECK-GI-NEXT: csel x12, x12, x15, eq -; CHECK-GI-NEXT: lsl x15, x18, x16 -; CHECK-GI-NEXT: and x10, x9, #0x7f -; CHECK-GI-NEXT: cmp x11, #64 -; CHECK-GI-NEXT: mvn x11, x8 +; CHECK-GI-NEXT: lsl x2, x11, x10 +; CHECK-GI-NEXT: lsl x1, x18, x10 +; CHECK-GI-NEXT: csel x13, x14, x13, eq +; CHECK-GI-NEXT: lsl x14, x18, x16 +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: mvn x10, x8 ; CHECK-GI-NEXT: orr x16, x17, x2 ; CHECK-GI-NEXT: csel x17, x1, xzr, lo -; CHECK-GI-NEXT: csel x15, x16, x15, lo -; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: sub x11, x14, x10 -; CHECK-GI-NEXT: sub x16, x10, #64 -; CHECK-GI-NEXT: lsr x18, x4, x10 -; CHECK-GI-NEXT: lsl x11, x5, x11 -; CHECK-GI-NEXT: csel x13, x13, x15, eq -; CHECK-GI-NEXT: lsr x15, x5, x16 +; CHECK-GI-NEXT: csel x14, x16, x14, lo +; CHECK-GI-NEXT: tst x10, #0x7f +; CHECK-GI-NEXT: sub x10, x12, x15 +; CHECK-GI-NEXT: sub x16, x15, #64 +; CHECK-GI-NEXT: lsr x18, x4, x15 +; CHECK-GI-NEXT: lsl x10, x5, x10 +; CHECK-GI-NEXT: csel x11, x11, x14, eq +; CHECK-GI-NEXT: lsr x14, x5, x16 ; CHECK-GI-NEXT: and 
x1, x8, #0x7f -; CHECK-GI-NEXT: orr x11, x18, x11 -; CHECK-GI-NEXT: cmp x10, #64 -; CHECK-GI-NEXT: lsr x16, x5, x10 -; CHECK-GI-NEXT: csel x11, x11, x15, lo +; CHECK-GI-NEXT: cmp x15, #64 +; CHECK-GI-NEXT: lsr x16, x5, x15 +; CHECK-GI-NEXT: orr x10, x18, x10 +; CHECK-GI-NEXT: csel x10, x10, x14, lo ; CHECK-GI-NEXT: tst x9, #0x7f -; CHECK-GI-NEXT: sub x9, x14, x1 -; CHECK-GI-NEXT: sub x14, x1, #64 -; CHECK-GI-NEXT: lsr x15, x6, x1 +; CHECK-GI-NEXT: sub x9, x12, x1 +; CHECK-GI-NEXT: sub x12, x1, #64 +; CHECK-GI-NEXT: lsr x14, x6, x1 ; CHECK-GI-NEXT: lsl x9, x7, x9 -; CHECK-GI-NEXT: csel x11, x4, x11, eq -; CHECK-GI-NEXT: cmp x10, #64 -; CHECK-GI-NEXT: lsr x10, x7, x14 -; CHECK-GI-NEXT: csel x14, x16, xzr, lo -; CHECK-GI-NEXT: orr x9, x15, x9 +; CHECK-GI-NEXT: csel x10, x4, x10, eq +; CHECK-GI-NEXT: cmp x15, #64 +; CHECK-GI-NEXT: lsr x12, x7, x12 +; CHECK-GI-NEXT: csel x15, x16, xzr, lo +; CHECK-GI-NEXT: orr x9, x14, x9 ; CHECK-GI-NEXT: cmp x1, #64 -; CHECK-GI-NEXT: lsr x15, x7, x1 -; CHECK-GI-NEXT: csel x9, x9, x10, lo +; CHECK-GI-NEXT: lsr x14, x7, x1 +; CHECK-GI-NEXT: csel x9, x9, x12, lo ; CHECK-GI-NEXT: tst x8, #0x7f ; CHECK-GI-NEXT: csel x8, x6, x9, eq ; CHECK-GI-NEXT: cmp x1, #64 -; CHECK-GI-NEXT: orr x0, x0, x11 -; CHECK-GI-NEXT: csel x9, x15, xzr, lo -; CHECK-GI-NEXT: orr x1, x12, x14 +; CHECK-GI-NEXT: orr x0, x0, x10 +; CHECK-GI-NEXT: csel x9, x14, xzr, lo +; CHECK-GI-NEXT: orr x1, x13, x15 ; CHECK-GI-NEXT: orr x2, x17, x8 -; CHECK-GI-NEXT: orr x3, x13, x9 +; CHECK-GI-NEXT: orr x3, x11, x9 ; CHECK-GI-NEXT: ret entry: %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) @@ -3863,15 +3837,12 @@ define <2 x i128> @rotl_v2i128_c(<2 x i128> %a) { ; ; CHECK-GI-LABEL: rotl_v2i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x1, #61 -; CHECK-GI-NEXT: lsl x9, x1, #3 -; CHECK-GI-NEXT: lsl x10, x3, #3 -; CHECK-GI-NEXT: lsr x11, x3, #61 -; CHECK-GI-NEXT: orr x8, x8, x0, lsl #3 -; CHECK-GI-NEXT: orr x1, x9, x0, lsr #61 -; 
CHECK-GI-NEXT: orr x3, x10, x2, lsr #61 -; CHECK-GI-NEXT: orr x2, x11, x2, lsl #3 +; CHECK-GI-NEXT: extr x8, x0, x1, #61 +; CHECK-GI-NEXT: extr x9, x3, x2, #61 +; CHECK-GI-NEXT: extr x1, x1, x0, #61 +; CHECK-GI-NEXT: extr x2, x2, x3, #61 ; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: mov x3, x9 ; CHECK-GI-NEXT: ret entry: %d = call <2 x i128> @llvm.fshl(<2 x i128> %a, <2 x i128> %a, <2 x i128> ) @@ -3891,14 +3862,12 @@ define <2 x i128> @rotr_v2i128_c(<2 x i128> %a) { ; ; CHECK-GI-LABEL: rotr_v2i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x1, #61 -; CHECK-GI-NEXT: lsl x9, x3, #61 -; CHECK-GI-NEXT: lsl x10, x0, #61 -; CHECK-GI-NEXT: lsl x11, x2, #61 -; CHECK-GI-NEXT: orr x0, x8, x0, lsr #3 -; CHECK-GI-NEXT: orr x2, x9, x2, lsr #3 -; CHECK-GI-NEXT: orr x1, x10, x1, lsr #3 -; CHECK-GI-NEXT: orr x3, x11, x3, lsr #3 +; CHECK-GI-NEXT: extr x8, x1, x0, #3 +; CHECK-GI-NEXT: extr x9, x3, x2, #3 +; CHECK-GI-NEXT: extr x1, x0, x1, #3 +; CHECK-GI-NEXT: extr x3, x2, x3, #3 +; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: mov x2, x9 ; CHECK-GI-NEXT: ret entry: %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %a, <2 x i128> ) @@ -4464,14 +4433,10 @@ define <2 x i128> @fshl_v2i128_c(<2 x i128> %a, <2 x i128> %b) { ; ; CHECK-GI-LABEL: fshl_v2i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x5, #61 -; CHECK-GI-NEXT: lsl x9, x1, #3 -; CHECK-GI-NEXT: lsl x10, x3, #3 -; CHECK-GI-NEXT: lsr x11, x7, #61 -; CHECK-GI-NEXT: orr x8, x8, x0, lsl #3 -; CHECK-GI-NEXT: orr x1, x9, x0, lsr #61 -; CHECK-GI-NEXT: orr x3, x10, x2, lsr #61 -; CHECK-GI-NEXT: orr x2, x11, x2, lsl #3 +; CHECK-GI-NEXT: extr x8, x0, x5, #61 +; CHECK-GI-NEXT: extr x1, x1, x0, #61 +; CHECK-GI-NEXT: extr x3, x3, x2, #61 +; CHECK-GI-NEXT: extr x2, x2, x7, #61 ; CHECK-GI-NEXT: mov x0, x8 ; CHECK-GI-NEXT: ret entry: @@ -4480,29 +4445,15 @@ entry: } define <2 x i128> @fshr_v2i128_c(<2 x i128> %a, <2 x i128> %b) { -; CHECK-SD-LABEL: fshr_v2i128_c: -; CHECK-SD: // %bb.0: // %entry -; 
CHECK-SD-NEXT: extr x8, x5, x4, #3 -; CHECK-SD-NEXT: extr x9, x7, x6, #3 -; CHECK-SD-NEXT: extr x1, x0, x5, #3 -; CHECK-SD-NEXT: extr x3, x2, x7, #3 -; CHECK-SD-NEXT: mov x0, x8 -; CHECK-SD-NEXT: mov x2, x9 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fshr_v2i128_c: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x5, #61 -; CHECK-GI-NEXT: lsl x9, x7, #61 -; CHECK-GI-NEXT: lsr x10, x5, #3 -; CHECK-GI-NEXT: lsr x11, x7, #3 -; CHECK-GI-NEXT: orr x8, x8, x4, lsr #3 -; CHECK-GI-NEXT: orr x9, x9, x6, lsr #3 -; CHECK-GI-NEXT: orr x1, x10, x0, lsl #61 -; CHECK-GI-NEXT: orr x3, x11, x2, lsl #61 -; CHECK-GI-NEXT: mov x0, x8 -; CHECK-GI-NEXT: mov x2, x9 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fshr_v2i128_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: extr x8, x5, x4, #3 +; CHECK-NEXT: extr x9, x7, x6, #3 +; CHECK-NEXT: extr x1, x0, x5, #3 +; CHECK-NEXT: extr x3, x2, x7, #3 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: mov x2, x9 +; CHECK-NEXT: ret entry: %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) ret <2 x i128> %d diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll index f9fd2ad1b5b6c..90fb10258dffb 100644 --- a/llvm/test/CodeGen/AArch64/funnel-shift.ll +++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll @@ -85,41 +85,40 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind { ; ; CHECK-GI-LABEL: fshl_i128: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #64 // =0x40 ; CHECK-GI-NEXT: and x9, x4, #0x7f -; CHECK-GI-NEXT: mov w10, #64 // =0x40 -; CHECK-GI-NEXT: lsl x14, x3, #63 -; CHECK-GI-NEXT: sub x12, x10, x9 +; CHECK-GI-NEXT: mov w10, #127 // =0x7f +; CHECK-GI-NEXT: sub x12, x8, x9 ; CHECK-GI-NEXT: lsl x13, x1, x9 -; CHECK-GI-NEXT: mov w8, #127 // =0x7f +; CHECK-GI-NEXT: bic x10, x10, x4 ; CHECK-GI-NEXT: lsr x12, x0, x12 -; CHECK-GI-NEXT: bic x8, x8, x4 -; CHECK-GI-NEXT: sub x15, x9, #64 +; CHECK-GI-NEXT: sub x14, x9, #64 +; CHECK-GI-NEXT: lsl x15, x0, x9 +; CHECK-GI-NEXT: extr x16, x3, x2, 
#1 ; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: lsl x9, x0, x9 -; CHECK-GI-NEXT: lsl x15, x0, x15 -; CHECK-GI-NEXT: orr x12, x12, x13 -; CHECK-GI-NEXT: orr x13, x14, x2, lsr #1 -; CHECK-GI-NEXT: lsr x14, x3, #1 -; CHECK-GI-NEXT: sub x10, x10, x8 -; CHECK-GI-NEXT: sub x16, x8, #64 -; CHECK-GI-NEXT: csel x9, x9, xzr, lo -; CHECK-GI-NEXT: lsr x17, x13, x8 -; CHECK-GI-NEXT: lsl x10, x14, x10 -; CHECK-GI-NEXT: csel x12, x12, x15, lo +; CHECK-GI-NEXT: sub x8, x8, x10 +; CHECK-GI-NEXT: orr x9, x12, x13 +; CHECK-GI-NEXT: lsr x12, x3, #1 +; CHECK-GI-NEXT: lsl x13, x0, x14 +; CHECK-GI-NEXT: csel x14, x15, xzr, lo +; CHECK-GI-NEXT: sub x15, x10, #64 +; CHECK-GI-NEXT: lsr x17, x16, x10 +; CHECK-GI-NEXT: lsl x8, x12, x8 +; CHECK-GI-NEXT: csel x9, x9, x13, lo ; CHECK-GI-NEXT: tst x4, #0x7f -; CHECK-GI-NEXT: lsr x15, x14, x16 +; CHECK-GI-NEXT: lsr x13, x12, x15 ; CHECK-GI-NEXT: mvn x11, x4 -; CHECK-GI-NEXT: csel x12, x1, x12, eq -; CHECK-GI-NEXT: orr x10, x17, x10 -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: lsr x14, x14, x8 -; CHECK-GI-NEXT: csel x10, x10, x15, lo +; CHECK-GI-NEXT: csel x9, x1, x9, eq +; CHECK-GI-NEXT: orr x8, x17, x8 +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: lsr x12, x12, x10 +; CHECK-GI-NEXT: csel x8, x8, x13, lo ; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: csel x10, x13, x10, eq -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: csel x8, x14, xzr, lo -; CHECK-GI-NEXT: orr x0, x9, x10 -; CHECK-GI-NEXT: orr x1, x12, x8 +; CHECK-GI-NEXT: csel x8, x16, x8, eq +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: csel x10, x12, xzr, lo +; CHECK-GI-NEXT: orr x0, x14, x8 +; CHECK-GI-NEXT: orr x1, x9, x10 ; CHECK-GI-NEXT: ret %f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z) ret i128 %f diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index 1cb92e46cbcd1..87b11086e28d5 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -559,20 +559,18 @@ define 
i128 @ui128_7(i128 %a, i128 %b) { ; CHECK-GI-NEXT: add x8, x8, x10 ; CHECK-GI-NEXT: subs x10, x0, x9 ; CHECK-GI-NEXT: sbc x11, x1, x8 -; CHECK-GI-NEXT: lsl x12, x11, #63 +; CHECK-GI-NEXT: extr x10, x11, x10, #1 ; CHECK-GI-NEXT: lsr x11, x11, #1 -; CHECK-GI-NEXT: orr x10, x12, x10, lsr #1 ; CHECK-GI-NEXT: adds x9, x10, x9 +; CHECK-GI-NEXT: mov w10, #7 // =0x7 ; CHECK-GI-NEXT: adc x8, x11, x8 -; CHECK-GI-NEXT: lsl x10, x8, #62 +; CHECK-GI-NEXT: extr x9, x8, x9, #2 ; CHECK-GI-NEXT: lsr x8, x8, #2 -; CHECK-GI-NEXT: orr x9, x10, x9, lsr #2 -; CHECK-GI-NEXT: mov w10, #7 // =0x7 -; CHECK-GI-NEXT: lsl x12, x8, #3 ; CHECK-GI-NEXT: umulh x10, x9, x10 ; CHECK-GI-NEXT: lsl x11, x9, #3 -; CHECK-GI-NEXT: sub x8, x12, x8 +; CHECK-GI-NEXT: lsl x12, x8, #3 ; CHECK-GI-NEXT: sub x9, x11, x9 +; CHECK-GI-NEXT: sub x8, x12, x8 ; CHECK-GI-NEXT: subs x0, x0, x9 ; CHECK-GI-NEXT: add x8, x8, x10 ; CHECK-GI-NEXT: sbc x1, x1, x8 @@ -640,10 +638,9 @@ define i128 @ui128_100(i128 %a, i128 %b) { ; CHECK-GI-NEXT: add x10, x11, x12 ; CHECK-GI-NEXT: add x8, x8, x14 ; CHECK-GI-NEXT: add x8, x8, x10 -; CHECK-GI-NEXT: lsl x10, x8, #60 -; CHECK-GI-NEXT: lsr x8, x8, #4 -; CHECK-GI-NEXT: orr x9, x10, x9, lsr #4 ; CHECK-GI-NEXT: mov w10, #100 // =0x64 +; CHECK-GI-NEXT: extr x9, x8, x9, #4 +; CHECK-GI-NEXT: lsr x8, x8, #4 ; CHECK-GI-NEXT: umulh x11, x9, x10 ; CHECK-GI-NEXT: mul x9, x9, x10 ; CHECK-GI-NEXT: madd x8, x8, x10, x11 @@ -3317,36 +3314,32 @@ define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) { ; CHECK-GI-NEXT: sbc x14, x1, x12 ; CHECK-GI-NEXT: add x8, x8, x13 ; CHECK-GI-NEXT: subs x13, x2, x10 -; CHECK-GI-NEXT: lsl x15, x14, #63 -; CHECK-GI-NEXT: sbc x16, x3, x8 +; CHECK-GI-NEXT: extr x9, x14, x9, #1 +; CHECK-GI-NEXT: sbc x15, x3, x8 ; CHECK-GI-NEXT: lsr x14, x14, #1 -; CHECK-GI-NEXT: orr x9, x15, x9, lsr #1 -; CHECK-GI-NEXT: lsl x15, x16, #63 -; CHECK-GI-NEXT: orr x13, x15, x13, lsr #1 +; CHECK-GI-NEXT: extr x13, x15, x13, #1 ; CHECK-GI-NEXT: adds x9, x9, x11 -; CHECK-GI-NEXT: lsr x11, 
x16, #1 +; CHECK-GI-NEXT: lsr x11, x15, #1 ; CHECK-GI-NEXT: adc x12, x14, x12 ; CHECK-GI-NEXT: adds x10, x13, x10 -; CHECK-GI-NEXT: lsl x13, x12, #62 -; CHECK-GI-NEXT: lsr x12, x12, #2 -; CHECK-GI-NEXT: adc x8, x11, x8 -; CHECK-GI-NEXT: lsl x11, x8, #62 -; CHECK-GI-NEXT: orr x9, x13, x9, lsr #2 +; CHECK-GI-NEXT: extr x9, x12, x9, #2 ; CHECK-GI-NEXT: mov w13, #7 // =0x7 +; CHECK-GI-NEXT: adc x8, x11, x8 +; CHECK-GI-NEXT: lsr x11, x12, #2 +; CHECK-GI-NEXT: extr x10, x8, x10, #2 +; CHECK-GI-NEXT: umulh x12, x9, x13 ; CHECK-GI-NEXT: lsr x8, x8, #2 -; CHECK-GI-NEXT: lsl x14, x12, #3 -; CHECK-GI-NEXT: orr x10, x11, x10, lsr #2 -; CHECK-GI-NEXT: umulh x11, x9, x13 +; CHECK-GI-NEXT: lsl x14, x11, #3 ; CHECK-GI-NEXT: lsl x15, x9, #3 -; CHECK-GI-NEXT: sub x12, x14, x12 -; CHECK-GI-NEXT: lsl x16, x8, #3 ; CHECK-GI-NEXT: umulh x13, x10, x13 +; CHECK-GI-NEXT: lsl x16, x8, #3 +; CHECK-GI-NEXT: sub x11, x14, x11 ; CHECK-GI-NEXT: lsl x14, x10, #3 ; CHECK-GI-NEXT: sub x9, x15, x9 ; CHECK-GI-NEXT: sub x8, x16, x8 ; CHECK-GI-NEXT: subs x0, x0, x9 +; CHECK-GI-NEXT: add x11, x11, x12 ; CHECK-GI-NEXT: sub x10, x14, x10 -; CHECK-GI-NEXT: add x11, x12, x11 ; CHECK-GI-NEXT: sbc x1, x1, x11 ; CHECK-GI-NEXT: subs x2, x2, x10 ; CHECK-GI-NEXT: add x8, x8, x13 @@ -3394,9 +3387,10 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov x10, #23593 // =0x5c29 ; CHECK-GI-NEXT: mov x8, #62914 // =0xf5c2 -; CHECK-GI-NEXT: sub x18, x0, x0 +; CHECK-GI-NEXT: and x5, xzr, #0x1 ; CHECK-GI-NEXT: movk x10, #49807, lsl #16 ; CHECK-GI-NEXT: movk x8, #23592, lsl #16 +; CHECK-GI-NEXT: umulh x18, x0, xzr ; CHECK-GI-NEXT: movk x10, #10485, lsl #32 ; CHECK-GI-NEXT: movk x8, #49807, lsl #32 ; CHECK-GI-NEXT: movk x10, #36700, lsl #48 @@ -3409,84 +3403,81 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) { ; CHECK-GI-NEXT: umulh x15, x1, x10 ; CHECK-GI-NEXT: cset w12, hs ; CHECK-GI-NEXT: cmn x11, x13 -; CHECK-GI-NEXT: and x11, x12, #0x1 -; 
CHECK-GI-NEXT: umulh x16, x0, x8 -; CHECK-GI-NEXT: cset w12, hs +; CHECK-GI-NEXT: sub x13, x0, x0 ; CHECK-GI-NEXT: and x12, x12, #0x1 -; CHECK-GI-NEXT: add x14, x14, x18 -; CHECK-GI-NEXT: add x11, x11, x12 -; CHECK-GI-NEXT: and x12, xzr, #0x1 +; CHECK-GI-NEXT: umulh x16, x0, x8 +; CHECK-GI-NEXT: cset w11, hs +; CHECK-GI-NEXT: add x13, x14, x13 +; CHECK-GI-NEXT: and x11, x11, #0x1 +; CHECK-GI-NEXT: and x14, xzr, #0x1 ; CHECK-GI-NEXT: umulh x9, xzr, x10 -; CHECK-GI-NEXT: adds x14, x14, x15 -; CHECK-GI-NEXT: and x15, xzr, #0x1 +; CHECK-GI-NEXT: add x11, x12, x11 +; CHECK-GI-NEXT: add x12, x5, x14 +; CHECK-GI-NEXT: adds x13, x13, x15 ; CHECK-GI-NEXT: umulh x17, x1, x8 -; CHECK-GI-NEXT: cset w4, hs -; CHECK-GI-NEXT: add x15, x12, x15 -; CHECK-GI-NEXT: adds x12, x14, x16 -; CHECK-GI-NEXT: and x4, x4, #0x1 -; CHECK-GI-NEXT: mul x18, x3, x10 ; CHECK-GI-NEXT: cset w14, hs -; CHECK-GI-NEXT: adds x12, x12, x11 -; CHECK-GI-NEXT: add x11, x15, x4 ; CHECK-GI-NEXT: and x14, x14, #0x1 -; CHECK-GI-NEXT: cset w15, hs -; CHECK-GI-NEXT: mul x5, x2, x8 -; CHECK-GI-NEXT: add x11, x11, x14 -; CHECK-GI-NEXT: and x14, x15, #0x1 -; CHECK-GI-NEXT: add x17, x9, x17 -; CHECK-GI-NEXT: add x14, x11, x14 -; CHECK-GI-NEXT: mov w11, #100 // =0x64 -; CHECK-GI-NEXT: umulh x13, x0, xzr -; CHECK-GI-NEXT: umulh x16, x2, x10 -; CHECK-GI-NEXT: adds x18, x18, x5 -; CHECK-GI-NEXT: mul x15, x3, x8 -; CHECK-GI-NEXT: add x13, x17, x13 -; CHECK-GI-NEXT: cset w17, hs -; CHECK-GI-NEXT: umulh x10, x3, x10 -; CHECK-GI-NEXT: add x13, x13, x14 -; CHECK-GI-NEXT: and x17, x17, #0x1 -; CHECK-GI-NEXT: cmn x18, x16 -; CHECK-GI-NEXT: sub x18, x2, x2 -; CHECK-GI-NEXT: umulh x16, x2, x8 +; CHECK-GI-NEXT: adds x13, x13, x16 +; CHECK-GI-NEXT: mul x4, x3, x10 +; CHECK-GI-NEXT: add x12, x12, x14 ; CHECK-GI-NEXT: cset w14, hs -; CHECK-GI-NEXT: and x14, x14, #0x1 -; CHECK-GI-NEXT: add x15, x15, x18 +; CHECK-GI-NEXT: adds x11, x13, x11 +; CHECK-GI-NEXT: and x13, x14, #0x1 +; CHECK-GI-NEXT: mul x15, x2, x8 +; CHECK-GI-NEXT: cset 
w14, hs +; CHECK-GI-NEXT: add x12, x12, x13 +; CHECK-GI-NEXT: and x13, x14, #0x1 +; CHECK-GI-NEXT: add x14, x9, x17 +; CHECK-GI-NEXT: sub x17, x2, x2 +; CHECK-GI-NEXT: umulh x16, x2, x10 +; CHECK-GI-NEXT: add x12, x12, x13 +; CHECK-GI-NEXT: add x13, x14, x18 +; CHECK-GI-NEXT: add x12, x13, x12 ; CHECK-GI-NEXT: and x18, xzr, #0x1 -; CHECK-GI-NEXT: add x14, x17, x14 +; CHECK-GI-NEXT: mul x5, x3, x8 +; CHECK-GI-NEXT: extr x11, x12, x11, #4 +; CHECK-GI-NEXT: adds x13, x4, x15 +; CHECK-GI-NEXT: umulh x14, x3, x10 +; CHECK-GI-NEXT: cset w15, hs +; CHECK-GI-NEXT: mov w10, #100 // =0x64 +; CHECK-GI-NEXT: cmn x13, x16 +; CHECK-GI-NEXT: and x15, x15, #0x1 +; CHECK-GI-NEXT: umulh x13, x2, x8 +; CHECK-GI-NEXT: cset w16, hs +; CHECK-GI-NEXT: add x17, x5, x17 +; CHECK-GI-NEXT: and x16, x16, #0x1 ; CHECK-GI-NEXT: umulh x8, x3, x8 +; CHECK-GI-NEXT: add x15, x15, x16 +; CHECK-GI-NEXT: adds x14, x17, x14 ; CHECK-GI-NEXT: and x17, xzr, #0x1 -; CHECK-GI-NEXT: adds x10, x15, x10 -; CHECK-GI-NEXT: add x15, x17, x18 +; CHECK-GI-NEXT: add x16, x18, x17 ; CHECK-GI-NEXT: cset w17, hs -; CHECK-GI-NEXT: umulh x18, x2, xzr +; CHECK-GI-NEXT: adds x13, x14, x13 +; CHECK-GI-NEXT: umulh x14, x2, xzr ; CHECK-GI-NEXT: and x17, x17, #0x1 -; CHECK-GI-NEXT: adds x10, x10, x16 -; CHECK-GI-NEXT: lsl x16, x13, #60 -; CHECK-GI-NEXT: add x15, x15, x17 -; CHECK-GI-NEXT: cset w17, hs -; CHECK-GI-NEXT: adds x10, x10, x14 -; CHECK-GI-NEXT: and x14, x17, #0x1 +; CHECK-GI-NEXT: cset w18, hs +; CHECK-GI-NEXT: adds x13, x13, x15 +; CHECK-GI-NEXT: add x15, x16, x17 +; CHECK-GI-NEXT: and x16, x18, #0x1 ; CHECK-GI-NEXT: cset w17, hs ; CHECK-GI-NEXT: add x8, x9, x8 -; CHECK-GI-NEXT: add x14, x15, x14 -; CHECK-GI-NEXT: and x15, x17, #0x1 -; CHECK-GI-NEXT: orr x12, x16, x12, lsr #4 -; CHECK-GI-NEXT: add x9, x14, x15 -; CHECK-GI-NEXT: add x8, x8, x18 -; CHECK-GI-NEXT: add x8, x8, x9 -; CHECK-GI-NEXT: lsr x9, x13, #4 -; CHECK-GI-NEXT: umulh x14, x12, x11 -; CHECK-GI-NEXT: lsl x13, x8, #60 +; CHECK-GI-NEXT: add x15, x15, 
x16 +; CHECK-GI-NEXT: and x16, x17, #0x1 +; CHECK-GI-NEXT: lsr x9, x12, #4 +; CHECK-GI-NEXT: add x15, x15, x16 +; CHECK-GI-NEXT: umulh x17, x11, x10 +; CHECK-GI-NEXT: add x8, x8, x14 +; CHECK-GI-NEXT: add x8, x8, x15 +; CHECK-GI-NEXT: mul x11, x11, x10 +; CHECK-GI-NEXT: extr x12, x8, x13, #4 ; CHECK-GI-NEXT: lsr x8, x8, #4 -; CHECK-GI-NEXT: mul x12, x12, x11 -; CHECK-GI-NEXT: orr x10, x13, x10, lsr #4 -; CHECK-GI-NEXT: madd x9, x9, x11, x14 -; CHECK-GI-NEXT: umulh x13, x10, x11 -; CHECK-GI-NEXT: subs x0, x0, x12 -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: madd x9, x9, x10, x17 +; CHECK-GI-NEXT: umulh x13, x12, x10 +; CHECK-GI-NEXT: subs x0, x0, x11 +; CHECK-GI-NEXT: mul x12, x12, x10 ; CHECK-GI-NEXT: sbc x1, x1, x9 -; CHECK-GI-NEXT: madd x8, x8, x11, x13 -; CHECK-GI-NEXT: subs x2, x2, x10 +; CHECK-GI-NEXT: madd x8, x8, x10, x13 +; CHECK-GI-NEXT: subs x2, x2, x12 ; CHECK-GI-NEXT: sbc x3, x3, x8 ; CHECK-GI-NEXT: ret entry: From ab5828cdce97fc8a13fd919bf41cc891bb0c244a Mon Sep 17 00:00:00 2001 From: Jack Styles Date: Wed, 29 Oct 2025 08:20:11 +0000 Subject: [PATCH 084/539] [libunwind][PAuthLR] Remove PC offset when using FEAT_PAuthLR (#164224) When originally introduced to libunwind as part of #112171, FEAT_PAuthLR had its Call Frame Instruction's (CFI's) in a different location to other Signing Authentication methods. To incorporate this in libunwind, a 4 byte offset was introduced to work with this. However, this design was reversed in #121551 so the CFI's are emitted in the same location as other methods. When making this change, the offset in libunwind was not removed, so libunwind's PC value would be incorrect. As the 4 byte offset is no longer needed, that adjustment can be removed. results->ptrAuthDiversifier will still be set. 
--- libunwind/src/DwarfParser.hpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/libunwind/src/DwarfParser.hpp b/libunwind/src/DwarfParser.hpp index dbd7d65c354aa..2b04ae2831f9a 100644 --- a/libunwind/src/DwarfParser.hpp +++ b/libunwind/src/DwarfParser.hpp @@ -842,12 +842,10 @@ bool CFI_Parser::parseFDEInstructions(A &addressSpace, results->savedRegisters[UNW_AARCH64_RA_SIGN_STATE].value ^ 0x3; results->setRegisterValue(UNW_AARCH64_RA_SIGN_STATE, value, initialState); - // When calculating the value of the PC, it is assumed that the CFI - // instruction is placed before the signing instruction, however it is - // placed after. Because of this, we need to take into account the CFI - // instruction is one instruction call later than expected, and reduce - // the PC value by 4 bytes to compensate. - results->ptrAuthDiversifier = fdeInfo.pcStart + codeOffset - 0x4; + // When using Feat_PAuthLR, the PC value needs to be captured so that + // during unwinding, the correct PC value is used for re-authentication. + // It is assumed that the CFI is placed before the signing instruction. + results->ptrAuthDiversifier = fdeInfo.pcStart + codeOffset; _LIBUNWIND_TRACE_DWARF( "DW_CFA_AARCH64_negate_ra_state_with_pc(pc=0x%" PRIx64 ")\n", static_cast(results->ptrAuthDiversifier)); From 7e1f460541f58661ca6ba7ce663606c287926874 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 29 Oct 2025 09:03:34 +0000 Subject: [PATCH 085/539] [llvm][DebugInfo][test] Add LLVM tests for Objective-C property debug-info (#165373) The IR->DWARF pipeline was not properly tested before. This patch adds a test to generate DWARF for various `DIObjCProperty` constructions. This caught a couple of bugs: 1. The `DW_AT_APPLE_property_getter` and `DW_AT_APPLE_property_setter` properties were emitted the wrong way around. 2. The `DW_TAG_member` ivars were not linking back to the property that they back. These will be fixed in follow-up patches. 
--- llvm/test/DebugInfo/Generic/objc-property.ll | 89 ++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 llvm/test/DebugInfo/Generic/objc-property.ll diff --git a/llvm/test/DebugInfo/Generic/objc-property.ll b/llvm/test/DebugInfo/Generic/objc-property.ll new file mode 100644 index 0000000000000..6dd0e01017780 --- /dev/null +++ b/llvm/test/DebugInfo/Generic/objc-property.ll @@ -0,0 +1,89 @@ +; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump --debug-info - | FileCheck %s + +; CHECK: DW_TAG_structure_type +; CHECK: DW_AT_name ("Foo") +; +; CHECK: DW_TAG_APPLE_property +; CHECK: DW_AT_APPLE_property_name ("autoSynthProp") +; CHECK: DW_AT_APPLE_property_attribute +; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, +; CHECK-SAME: DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained +; +; CHECK: DW_TAG_APPLE_property +; CHECK: DW_AT_APPLE_property_name ("synthProp") +; CHECK: DW_AT_APPLE_property_attribute +; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, +; CHECK-SAME: DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained +; +; FIXME: this should have a DW_AT_APPLE_property_getter tag +; CHECK: DW_TAG_APPLE_property +; CHECK: DW_AT_APPLE_property_name ("customGetterProp") +; CHECK: DW_AT_APPLE_property_setter ("customGetter") +; CHECK: DW_AT_APPLE_property_attribute +; CHECK-SAME: DW_APPLE_PROPERTY_getter, DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, +; CHECK-SAME: DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained +; +; FIXME: this should have a DW_AT_APPLE_property_setter tag +; CHECK: DW_TAG_APPLE_property +; CHECK: DW_AT_APPLE_property_name ("customSetterProp") +; CHECK: DW_AT_APPLE_property_getter ("customSetter:") +; CHECK: DW_AT_APPLE_property_attribute +; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, +; CHECK-SAME: DW_APPLE_PROPERTY_setter, DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained +; +; FIXME: the 
DW_AT_APPLE_property_(getter|setter) values are inverted +; CHECK: DW_TAG_APPLE_property +; CHECK: DW_AT_APPLE_property_name ("customAccessorsProp") +; CHECK: DW_AT_APPLE_property_getter ("customSetter:") +; CHECK: DW_AT_APPLE_property_setter ("customGetter") +; CHECK: DW_AT_APPLE_property_attribute +; CHECK-SAME: DW_APPLE_PROPERTY_getter, DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, +; CHECK-SAME: DW_APPLE_PROPERTY_setter, DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained +; +; FIXME: missing link between DW_TAG_member and the associated DW_TAG_APPLE_property +; CHECK: DW_TAG_member +; CHECK-NOT: DW_AT_APPLE_property +; CHECK: DW_TAG_member +; CHECK-NOT: DW_AT_APPLE_property +; CHECK: DW_TAG_member +; CHECK-NOT: DW_AT_APPLE_property +; CHECK: DW_TAG_member +; CHECK-NOT: DW_AT_APPLE_property + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = distinct !DICompileUnit(language: DW_LANG_ObjC, file: !3, producer: "hand written", isOptimized: false, runtimeVersion: 2, emissionKind: FullDebug, retainedTypes: !4, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: Apple) +!3 = !DIFile(filename: "main.m", directory: "/tmp") +!4 = !{!5} +!5 = !DICompositeType(tag: DW_TAG_structure_type, name: "Foo", scope: !3, file: !3, line: 1, size: 128, flags: DIFlagObjcClassComplete, elements: !6, runtimeLang: DW_LANG_ObjC) +!6 = !{!7, !9, !10, !11, !12, !13, !14, !15, !16, !17, !24, !27, !28, !29, !30, !31, !32} +!7 = !DIObjCProperty(name: "autoSynthProp", file: !3, line: 5, attributes: 2316, type: !8) +!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!9 = !DIObjCProperty(name: "synthProp", file: !3, line: 6, attributes: 2316, type: !8) +!10 = !DIObjCProperty(name: "customGetterProp", file: !3, line: 7, getter: "customGetter", attributes: 2318, type: !8) +!11 = !DIObjCProperty(name: "customSetterProp", file: !3, line: 8, setter: 
"customSetter:", attributes: 2444, type: !8) +!12 = !DIObjCProperty(name: "customAccessorsProp", file: !3, line: 9, setter: "customSetter:", getter: "customGetter", attributes: 2446, type: !8) +!13 = !DIDerivedType(tag: DW_TAG_member, name: "someBackingIvar", scope: !3, file: !3, line: 2, baseType: !8, size: 32, flags: DIFlagProtected, extraData: !9) +!14 = !DIDerivedType(tag: DW_TAG_member, name: "_autoSynthProp", scope: !3, file: !3, line: 5, baseType: !8, size: 32, flags: DIFlagPrivate, extraData: !7) +!15 = !DIDerivedType(tag: DW_TAG_member, name: "_customGetterProp", scope: !3, file: !3, line: 7, baseType: !8, size: 32, flags: DIFlagPrivate, extraData: !10) +!16 = !DIDerivedType(tag: DW_TAG_member, name: "_customSetterProp", scope: !3, file: !3, line: 8, baseType: !8, size: 32, flags: DIFlagPrivate, extraData: !11) +!17 = !DISubprogram(name: "-[Foo customGetter]", scope: !5, file: !3, line: 19, type: !18, scopeLine: 19, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit) +!18 = !DISubroutineType(types: !19) +!19 = !{!8, !20, !21} +!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer) +!21 = !DIDerivedType(tag: DW_TAG_typedef, name: "SEL", file: !3, baseType: !22, flags: DIFlagArtificial) +!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 64) +!23 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_selector", file: !3, flags: DIFlagFwdDecl) +!24 = !DISubprogram(name: "-[Foo customSetter:]", scope: !5, file: !3, line: 23, type: !25, scopeLine: 23, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit) +!25 = !DISubroutineType(types: !26) +!26 = !{null, !20, !21, !8} +!27 = !DISubprogram(name: "-[Foo synthProp]", scope: !5, file: !3, line: 17, type: !18, scopeLine: 17, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit) +!28 = !DISubprogram(name: "-[Foo setSynthProp:]", scope: !5, file: !3, line: 17, type: !25, scopeLine: 17, flags: DIFlagArtificial | 
DIFlagPrototyped, spFlags: DISPFlagLocalToUnit) +!29 = !DISubprogram(name: "-[Foo autoSynthProp]", scope: !5, file: !3, line: 5, type: !18, scopeLine: 5, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit) +!30 = !DISubprogram(name: "-[Foo setAutoSynthProp:]", scope: !5, file: !3, line: 5, type: !25, scopeLine: 5, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit) +!31 = !DISubprogram(name: "-[Foo setCustomGetterProp:]", scope: !5, file: !3, line: 7, type: !25, scopeLine: 7, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit) +!32 = !DISubprogram(name: "-[Foo customSetterProp]", scope: !5, file: !3, line: 8, type: !18, scopeLine: 8, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit) + From ea30df1f73a316adda374d5cbcf803879635c158 Mon Sep 17 00:00:00 2001 From: Sergei Druzhkov Date: Wed, 29 Oct 2025 12:04:22 +0300 Subject: [PATCH 086/539] [lldb] Fix StdUnorderedMapSynthProvider for GCC (#164251) This patch adds small workaround for [issue](https://github.com/llvm/llvm-project/issues/152504). It looks like code compiled with gcc has lack of some important debug information (e.g. DW_TAG_template_type_parameter for allocator). Example code: ```cpp #include int main() { std::unordered_map map = { {1, 2} }; return 0; } ``` Output from `llvm-dwarfdump` for code compiled by GCC or Clang. I used system GCC (13.3.0) and Clang (18.1.3) on Ubuntu 24.04 (WSL). 
GCC: ``` 0x00001fcd: DW_TAG_class_type DW_AT_name ("allocator >") DW_AT_byte_size (0x01) DW_AT_decl_file ("/usr/include/c++/13/bits/allocator.h") DW_AT_decl_line (130) DW_AT_decl_column (11) DW_AT_sibling (0x0000207c) 0x00001fda: DW_TAG_inheritance DW_AT_type (0x00001d0a "std::__new_allocator >") DW_AT_data_member_location (0) DW_AT_accessibility (DW_ACCESS_public) 0x00001fe0: DW_TAG_subprogram DW_AT_external (true) DW_AT_name ("allocator") DW_AT_decl_file ("/usr/include/c++/13/bits/allocator.h") DW_AT_decl_line (163) DW_AT_decl_column (7) DW_AT_linkage_name ("_ZNSaISt4pairIKiiEEC4Ev") DW_AT_accessibility (DW_ACCESS_public) DW_AT_declaration (true) DW_AT_object_pointer (0x00001ff4) DW_AT_sibling (0x00001ffa) 0x00001ff4: DW_TAG_formal_parameter DW_AT_type (0x00004eb9 "std::allocator > *") DW_AT_artificial (true) 0x00001ff9: NULL 0x00001ffa: DW_TAG_subprogram DW_AT_external (true) DW_AT_name ("allocator") DW_AT_decl_file ("/usr/include/c++/13/bits/allocator.h") DW_AT_decl_line (167) DW_AT_decl_column (7) DW_AT_linkage_name ("_ZNSaISt4pairIKiiEEC4ERKS2_") DW_AT_accessibility (DW_ACCESS_public) DW_AT_declaration (true) DW_AT_object_pointer (0x0000200e) DW_AT_sibling (0x00002019) 0x0000200e: DW_TAG_formal_parameter DW_AT_type (0x00004eb9 "std::allocator > *") DW_AT_artificial (true) 0x00002013: DW_TAG_formal_parameter DW_AT_type (0x00004ec3 "const std::allocator > &") 0x00002018: NULL 0x00002019: DW_TAG_subprogram DW_AT_external (true) DW_AT_name ("operator=") DW_AT_decl_file ("/usr/include/c++/13/bits/allocator.h") DW_AT_decl_line (172) DW_AT_decl_column (18) DW_AT_linkage_name ("_ZNSaISt4pairIKiiEEaSERKS2_") DW_AT_type (0x00004ec8 "std::allocator > &") DW_AT_accessibility (DW_ACCESS_public) DW_AT_declaration (true) DW_AT_defaulted (DW_DEFAULTED_in_class) DW_AT_object_pointer (0x00002031) DW_AT_sibling (0x0000203c) 0x00002031: DW_TAG_formal_parameter DW_AT_type (0x00004eb9 "std::allocator > *") DW_AT_artificial (true) 0x00002036: DW_TAG_formal_parameter DW_AT_type 
(0x00004ec3 "const std::allocator > &") 0x0000203b: NULL 0x0000203c: DW_TAG_subprogram DW_AT_external (true) DW_AT_name ("~allocator") DW_AT_decl_file ("/usr/include/c++/13/bits/allocator.h") DW_AT_decl_line (184) DW_AT_decl_column (7) DW_AT_linkage_name ("_ZNSaISt4pairIKiiEED4Ev") DW_AT_accessibility (DW_ACCESS_public) DW_AT_declaration (true) DW_AT_object_pointer (0x00002050) DW_AT_sibling (0x0000205b) 0x00002050: DW_TAG_formal_parameter DW_AT_type (0x00004eb9 "std::allocator > *") DW_AT_artificial (true) 0x00002055: DW_TAG_formal_parameter DW_AT_type (0x00004cab "int") DW_AT_artificial (true) 0x0000205a: NULL ``` Clang: ``` 0x00001a6e: DW_TAG_class_type DW_AT_calling_convention (DW_CC_pass_by_reference) DW_AT_name ("allocator >") DW_AT_byte_size (0x01) DW_AT_decl_file ("/usr/bin/../lib/gcc/x86_64-linux-gnu/13/../../../../include/c++/13/bits/allocator.h") DW_AT_decl_line (130) 0x00001a74: DW_TAG_template_type_parameter DW_AT_type (0x00000dec "std::pair") DW_AT_name ("_Tp") 0x00001a7a: DW_TAG_inheritance DW_AT_type (0x00001ad5 "std::__allocator_base >") DW_AT_data_member_location (0x00) DW_AT_accessibility (DW_ACCESS_public) 0x00001a81: DW_TAG_subprogram DW_AT_name ("allocator") DW_AT_decl_file ("/usr/bin/../lib/gcc/x86_64-linux-gnu/13/../../../../include/c++/13/bits/allocator.h") DW_AT_decl_line (163) DW_AT_declaration (true) DW_AT_external (true) DW_AT_accessibility (DW_ACCESS_public) 0x00001a86: DW_TAG_formal_parameter DW_AT_type (0x00002dd1 "std::allocator > *") DW_AT_artificial (true) 0x00001a8b: NULL 0x00001a8c: DW_TAG_subprogram DW_AT_name ("allocator") DW_AT_decl_file ("/usr/bin/../lib/gcc/x86_64-linux-gnu/13/../../../../include/c++/13/bits/allocator.h") DW_AT_decl_line (167) DW_AT_declaration (true) DW_AT_external (true) DW_AT_accessibility (DW_ACCESS_public) 0x00001a91: DW_TAG_formal_parameter DW_AT_type (0x00002dd1 "std::allocator > *") DW_AT_artificial (true) 0x00001a96: DW_TAG_formal_parameter DW_AT_type (0x00002dd6 "const std::allocator > &") 
0x00001a9b: NULL 0x00001a9c: DW_TAG_subprogram DW_AT_linkage_name ("_ZNSaISt4pairIKiiEEaSERKS2_") DW_AT_name ("operator=") DW_AT_decl_file ("/usr/bin/../lib/gcc/x86_64-linux-gnu/13/../../../../include/c++/13/bits/allocator.h") DW_AT_decl_line (172) DW_AT_type (0x00002de0 "std::allocator > &") DW_AT_declaration (true) DW_AT_external (true) DW_AT_accessibility (DW_ACCESS_public) 0x00001aa6: DW_TAG_formal_parameter DW_AT_type (0x00002dd1 "std::allocator > *") DW_AT_artificial (true) 0x00001aab: DW_TAG_formal_parameter DW_AT_type (0x00002dd6 "const std::allocator > &") 0x00001ab0: NULL 0x00001ab1: DW_TAG_subprogram DW_AT_name ("~allocator") DW_AT_decl_file ("/usr/bin/../lib/gcc/x86_64-linux-gnu/13/../../../../include/c++/13/bits/allocator.h") DW_AT_decl_line (184) DW_AT_declaration (true) DW_AT_external (true) DW_AT_accessibility (DW_ACCESS_public) 0x00001ab6: DW_TAG_formal_parameter DW_AT_type (0x00002dd1 "std::allocator > *") DW_AT_artificial (true) 0x00001abb: NULL ``` I propose to add fallback implementation based on type of `_M_h`. --- lldb/examples/synthetic/gnu_libstdcpp.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lldb/examples/synthetic/gnu_libstdcpp.py b/lldb/examples/synthetic/gnu_libstdcpp.py index f42a009c21f48..8a41ddff9b679 100644 --- a/lldb/examples/synthetic/gnu_libstdcpp.py +++ b/lldb/examples/synthetic/gnu_libstdcpp.py @@ -63,11 +63,8 @@ def __init__(self, valobj, dict): self.count = None def extract_type(self): - type = self.valobj.GetType() - # The last template argument is the allocator type. 
- template_arg_num = type.GetNumberOfTemplateArguments() - 1 - allocator_type = type.GetTemplateArgumentType(template_arg_num) - data_type = allocator_type.GetTemplateArgumentType(0) + head_type = self.head.GetType().GetCanonicalType() + data_type = head_type.GetTemplateArgumentType(1) return data_type def update(self): From 8a9f5bdbbac5cbc119747aed39b55a61eaf5ea3b Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Wed, 29 Oct 2025 12:16:30 +0300 Subject: [PATCH 087/539] [ASTMatchers][Docs] Regenerate MatchersReference via dump_ast_matchers.py (#165448) It appears that we forgot to update user-facing docs for God know how long. --- clang/docs/LibASTMatchersReference.html | 594 +++++++++++++----------- 1 file changed, 313 insertions(+), 281 deletions(-) diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html index 9b30057b5257f..5b2a96d00d592 100644 --- a/clang/docs/LibASTMatchersReference.html +++ b/clang/docs/LibASTMatchersReference.html @@ -1028,6 +1028,15 @@

Node Matchers

+Matcher<Decl>requiresExprBodyDeclMatcher<RequiresExprBodyDecl>... +
Matches concept requirement body declaration.
+
+Example matches '{ *p; }'
+  template<typename T>
+  concept dereferencable = requires(T p) { *p; }
+
+ + Matcher<Decl>staticAssertDeclMatcher<StaticAssertDecl>...
Matches a C++ static_assert declaration.
 
@@ -1190,6 +1199,17 @@ 

Node Matchers

matches using enum X::x
+Matcher<Decl>usingShadowDeclMatcher<UsingShadowDecl>... +
Matches shadow declarations introduced into a scope by a
+       (resolved) using declaration.
+
+Given
+  namespace n { int f; }
+  namespace declToImport { using n::f; }
+usingShadowDecl()
+  matches f 
+ + Matcher<Decl>valueDeclMatcher<ValueDecl>...
Matches any value declaration.
 
@@ -1210,6 +1230,15 @@ 

Node Matchers

+Matcher<Expr>requiresExprMatcher<RequiresExpr>... +
Matches concept requirement.
+
+Example matches 'requires(T p) { *p; }'
+  template<typename T>
+  concept dereferencable = requires(T p) { *p; }
+
+ + Matcher<LambdaCapture>lambdaCaptureMatcher<LambdaCapture>...
Matches lambda captures.
 
@@ -1679,6 +1708,19 @@ 

Node Matchers

+Matcher<Stmt>cxxNamedCastExprMatcher<CXXNamedCastExpr>... +
Matches any named cast expression.
+
+Example: Matches all four of the casts in
+  struct S { virtual void f(); };
+  S* p = nullptr;
+  S* ptr1 = static_cast<S*>(p);
+  S* ptr2 = reinterpret_cast<S*>(p);
+  S* ptr3 = dynamic_cast<S*>(p);
+  S* ptr4 = const_cast<S*>(p);
+
+ + Matcher<Stmt>cxxNewExprMatcher<CXXNewExpr>...
Matches new expressions.
 
@@ -2168,7 +2210,7 @@ 

Node Matchers

-Matcher<Stmt>ompExecutableDirectiveMatcher<OMPExecutableDirective>... +Matcher<Stmt>ompExecutableDirectiveMatcher<OMPExecutableDirective>...
Matches any ``#pragma omp`` executable directive.
 
 Given
@@ -2393,17 +2435,6 @@ 

Node Matchers

-Matcher<TypeLoc>elaboratedTypeLocMatcher<ElaboratedTypeLoc>... -
Matches C or C++ elaborated `TypeLoc`s.
-
-Given
-  struct s {};
-  struct s ss;
-elaboratedTypeLoc()
-  matches the `TypeLoc` of the variable declaration of `ss`.
-
- - Matcher<TypeLoc>pointerTypeLocMatcher<PointerTypeLoc>...
Matches pointer `TypeLoc`s.
 
@@ -2474,7 +2505,7 @@ 

Node Matchers

-Matcher<Type>autoTypeMatcher<AutoType>... +Matcher<Type>autoTypeMatcher<AutoType>...
Matches types nodes representing C++11 auto types.
 
 Given:
@@ -2544,7 +2575,7 @@ 

Node Matchers

-Matcher<Type>decltypeTypeMatcher<DecltypeType>... +Matcher<Type>decltypeTypeMatcher<DecltypeType>...
Matches types nodes representing C++11 decltype(<expr>) types.
 
 Given:
@@ -2556,7 +2587,7 @@ 

Node Matchers

-Matcher<Type>deducedTemplateSpecializationTypeMatcher<DeducedTemplateSpecializationType>... +Matcher<Type>deducedTemplateSpecializationTypeMatcher<DeducedTemplateSpecializationType>...
Matches C++17 deduced template specialization types, e.g. deduced class
 template types.
 
@@ -2570,7 +2601,7 @@ 

Node Matchers

-Matcher<Type>dependentNameTypeMatcher<DependentNameType>... +Matcher<Type>dependentNameTypeMatcher<DependentNameType>...
Matches a dependent name type
 
 Example matches T::type
@@ -2607,38 +2638,7 @@ 

Node Matchers

-Matcher<Type>dependentTemplateSpecializationTypeMatcher<DependentTemplateSpecializationType>... -
Matches a dependent template specialization type
-
-Example matches A<T>::template B<T>
-  template<typename T> struct A;
-  template<typename T> struct declToImport {
-    typename A<T>::template B<T> a;
-  };
-
- - -Matcher<Type>elaboratedTypeMatcher<ElaboratedType>... -
Matches types specified with an elaborated type keyword or with a
-qualified name.
-
-Given
-  namespace N {
-    namespace M {
-      class D {};
-    }
-  }
-  class C {};
-
-  class C c;
-  N::M::D d;
-
-elaboratedType() matches the type of the variable declarations of both
-c and d.
-
- - -Matcher<Type>enumTypeMatcher<EnumType>... +Matcher<Type>enumTypeMatcher<EnumType>...
Matches enum types.
 
 Given
@@ -2688,7 +2688,7 @@ 

Node Matchers

-Matcher<Type>injectedClassNameTypeMatcher<InjectedClassNameType>... +Matcher<Type>injectedClassNameTypeMatcher<InjectedClassNameType>...
Matches injected class name types.
 
 Example matches S s, but not S<T> s.
@@ -2800,7 +2800,7 @@ 

Node Matchers

-Matcher<Type>recordTypeMatcher<RecordType>... +Matcher<Type>recordTypeMatcher<RecordType>...
Matches record types (e.g. structs, classes).
 
 Given
@@ -2831,7 +2831,7 @@ 

Node Matchers

-Matcher<Type>substTemplateTypeParmTypeMatcher<SubstTemplateTypeParmType>... +Matcher<Type>substTemplateTypeParmTypeMatcher<SubstTemplateTypeParmType>...
Matches types that represent the result of substituting a type for a
 template type parameter.
 
@@ -2845,7 +2845,7 @@ 

Node Matchers

-Matcher<Type>tagTypeMatcher<TagType>... +Matcher<Type>tagTypeMatcher<TagType>...
Matches tag types (record and enum types).
 
 Given
@@ -2860,7 +2860,7 @@ 

Node Matchers

-Matcher<Type>templateSpecializationTypeMatcher<TemplateSpecializationType>... +Matcher<Type>templateSpecializationTypeMatcher<TemplateSpecializationType>...
Matches template specialization types.
 
 Given
@@ -2875,7 +2875,7 @@ 

Node Matchers

-Matcher<Type>templateTypeParmTypeMatcher<TemplateTypeParmType>... +Matcher<Type>templateTypeParmTypeMatcher<TemplateTypeParmType>...
Matches template type parameter types.
 
 Example matches T, but not int.
@@ -2899,7 +2899,7 @@ 

Node Matchers

-Matcher<Type>unaryTransformTypeMatcher<UnaryTransformType>... +Matcher<Type>unaryTransformTypeMatcher<UnaryTransformType>...
Matches types nodes representing unary type transformations.
 
 Given:
@@ -3077,8 +3077,8 @@ 

Narrowing Matchers

Matcher<CXXBaseSpecifier>isPrivate -
Matches private C++ declarations and C++ base specifers that specify private
-inheritance.
+
Matches private C++ declarations and C++ base specifiers that specify
+private inheritance.
 
 Examples:
   class C {
@@ -3094,7 +3094,7 @@ 

Narrowing Matchers

Matcher<CXXBaseSpecifier>isProtected -
Matches protected C++ declarations and C++ base specifers that specify
+
Matches protected C++ declarations and C++ base specifiers that specify
 protected inheritance.
 
 Examples:
@@ -3110,7 +3110,7 @@ 

Narrowing Matchers

Matcher<CXXBaseSpecifier>isPublic -
Matches public C++ declarations and C++ base specifers that specify public
+
Matches public C++ declarations and C++ base specifiers that specify public
 inheritance.
 
 Examples:
@@ -3127,7 +3127,7 @@ 

Narrowing Matchers

Matcher<CXXBaseSpecifier>isVirtual -
Matches declarations of virtual methods and C++ base specifers that specify
+
Matches declarations of virtual methods and C++ base specifiers that specify
 virtual inheritance.
 
 Example:
@@ -3709,7 +3709,7 @@ 

Narrowing Matchers

Matcher<CXXMethodDecl>isVirtual -
Matches declarations of virtual methods and C++ base specifers that specify
+
Matches declarations of virtual methods and C++ base specifiers that specify
 virtual inheritance.
 
 Example:
@@ -4161,6 +4161,12 @@ 

Narrowing Matchers

+Matcher<Decl>declaresSameEntityAsBoundNodestd::string ID +
Matches a declaration if it declares the same entity as the node previously
+bound to ID.
+
+ + Matcher<Decl>equalsBoundNodestd::string ID
Matches if a node equals a previously bound node.
 
@@ -4322,8 +4328,8 @@ 

Narrowing Matchers

Matcher<Decl>isPrivate -
Matches private C++ declarations and C++ base specifers that specify private
-inheritance.
+
Matches private C++ declarations and C++ base specifiers that specify
+private inheritance.
 
 Examples:
   class C {
@@ -4339,7 +4345,7 @@ 

Narrowing Matchers

Matcher<Decl>isProtected -
Matches protected C++ declarations and C++ base specifers that specify
+
Matches protected C++ declarations and C++ base specifiers that specify
 protected inheritance.
 
 Examples:
@@ -4355,7 +4361,7 @@ 

Narrowing Matchers

Matcher<Decl>isPublic -
Matches public C++ declarations and C++ base specifers that specify public
+
Matches public C++ declarations and C++ base specifiers that specify public
 inheritance.
 
 Examples:
@@ -4371,7 +4377,7 @@ 

Narrowing Matchers

-Matcher<DependentNameType>hasDependentNamestd::string N +Matcher<DependentNameType>hasDependentNamestd::string N
Matches the dependent name of a DependentScopeDeclRefExpr or
 DependentNameType
 
@@ -5046,7 +5052,7 @@ 

Narrowing Matchers

int z; Example matches f() because it has external formal linkage despite being -unique to the translation unit as though it has internal likage +unique to the translation unit as though it has internal linkage (matcher = functionDecl(hasExternalFormalLinkage())) namespace { @@ -5182,7 +5188,7 @@

Narrowing Matchers

-Matcher<OMPExecutableDirective>isAllowedToContainClauseKindOpenMPClauseKind CKind +Matcher<OMPExecutableDirective>isAllowedToContainClauseKindOpenMPClauseKind CKind
Matches if the OpenMP directive is allowed to contain the specified OpenMP
 clause kind.
 
@@ -5192,7 +5198,7 @@ 

Narrowing Matchers

#pragma omp parallel for #pragma omp for -`ompExecutableDirective(isAllowedToContainClause(OMPC_default))`` matches +``ompExecutableDirective(isAllowedToContainClause(OMPC_default))`` matches ``omp parallel`` and ``omp parallel for``. If the matcher is use from clang-query, ``OpenMPClauseKind`` parameter @@ -5201,7 +5207,7 @@

Narrowing Matchers

-Matcher<OMPExecutableDirective>isStandaloneDirective +Matcher<OMPExecutableDirective>isStandaloneDirective
Matches standalone OpenMP directives,
 i.e., directives that can't have a structured block.
 
@@ -5545,10 +5551,10 @@ 

Narrowing Matchers

Given void a(int); - void b(long); + void b(unsigned long); void c(double); functionDecl(hasAnyParameter(hasType(isInteger()))) -matches "a(int)", "b(long)", but not "c(double)". +matches "a(int)", "b(unsigned long)", but not "c(double)".
@@ -5781,7 +5787,7 @@

Narrowing Matchers

Matches a TemplateArgument of integral type with a given value.
 
 Note that 'Value' is a string as the template argument's value is
-an arbitrary precision integer. 'Value' must be euqal to the canonical
+an arbitrary precision integer. 'Value' must be equal to the canonical
 representation of that integral value in base 10.
 
 Given
@@ -5806,7 +5812,7 @@ 

Narrowing Matchers

-Matcher<TemplateSpecializationType>templateArgumentCountIsunsigned N +Matcher<TemplateSpecializationType>templateArgumentCountIsunsigned N
Matches if the number of template arguments equals N.
 
 Given
@@ -6571,8 +6577,8 @@ 

AST Traversal Matchers

Matcher<AbstractConditionalOperator>hasConditionMatcher<Expr> InnerMatcher -
Matches the condition expression of an if statement, for loop,
-switch statement or conditional operator.
+
Matches the condition expression of an if statement, for loop, while loop,
+do-while loop, switch statement or conditional operator.
 
 Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
   if (true) {}
@@ -6600,8 +6606,8 @@ 

AST Traversal Matchers

-Matcher<AddrLabelExpr>hasDeclarationMatcher<Decl> InnerMatcher -
Matches a node if the declaration associated with that node
+Matcher<AddrLabelExpr>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -6626,11 +6632,11 @@ 

AST Traversal Matchers

Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>, Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>, - Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, - Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, - Matcher<TagType>, Matcher<TemplateSpecializationType>, - Matcher<TemplateTypeParmType>, Matcher<TypedefType>, - Matcher<UnresolvedUsingType> + Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, + Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, + Matcher<TagType>, Matcher<TemplateSpecializationType>, + Matcher<TemplateTypeParmType>, Matcher<TypedefType>, + Matcher<UnresolvedUsingType>, Matcher<UsingType>
@@ -6701,7 +6707,7 @@

AST Traversal Matchers

-Matcher<AutoType>hasDeducedTypeMatcher<Type> +Matcher<AutoType>hasDeducedTypeMatcher<Type>
Matches AutoType nodes where the deduced type is a specific type.
 
 Note: There is no TypeLoc for the deduced type and thus no
@@ -6713,7 +6719,7 @@ 

AST Traversal Matchers

autoType(hasDeducedType(isInteger())) matches "auto a" -Usable as: Matcher<AutoType> +Usable as: Matcher<AutoType>
@@ -7026,8 +7032,8 @@

AST Traversal Matchers

-Matcher<CXXConstructExpr>hasDeclarationMatcher<Decl> InnerMatcher -
Matches a node if the declaration associated with that node
+Matcher<CXXConstructExpr>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -7052,11 +7058,11 @@ 

AST Traversal Matchers

Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>, Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>, - Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, - Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, - Matcher<TagType>, Matcher<TemplateSpecializationType>, - Matcher<TemplateTypeParmType>, Matcher<TypedefType>, - Matcher<UnresolvedUsingType> + Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, + Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, + Matcher<TagType>, Matcher<TemplateSpecializationType>, + Matcher<TemplateTypeParmType>, Matcher<TypedefType>, + Matcher<UnresolvedUsingType>, Matcher<UsingType>
@@ -7489,8 +7495,8 @@

AST Traversal Matchers

-Matcher<CXXNewExpr>hasDeclarationMatcher<Decl> InnerMatcher -
Matches a node if the declaration associated with that node
+Matcher<CXXNewExpr>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -7515,11 +7521,11 @@ 

AST Traversal Matchers

Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>, Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>, - Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, - Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, - Matcher<TagType>, Matcher<TemplateSpecializationType>, - Matcher<TemplateTypeParmType>, Matcher<TypedefType>, - Matcher<UnresolvedUsingType> + Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, + Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, + Matcher<TagType>, Matcher<TemplateSpecializationType>, + Matcher<TemplateTypeParmType>, Matcher<TypedefType>, + Matcher<UnresolvedUsingType>, Matcher<UsingType>
@@ -7952,8 +7958,8 @@

AST Traversal Matchers

-Matcher<CallExpr>hasDeclarationMatcher<Decl> InnerMatcher -
Matches a node if the declaration associated with that node
+Matcher<CallExpr>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -7978,11 +7984,11 @@ 

AST Traversal Matchers

Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>, Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>, - Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, - Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, - Matcher<TagType>, Matcher<TemplateSpecializationType>, - Matcher<TemplateTypeParmType>, Matcher<TypedefType>, - Matcher<UnresolvedUsingType> + Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, + Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, + Matcher<TagType>, Matcher<TemplateSpecializationType>, + Matcher<TemplateTypeParmType>, Matcher<TypedefType>, + Matcher<UnresolvedUsingType>, Matcher<UsingType>
@@ -8204,7 +8210,7 @@

AST Traversal Matchers

Matcher<DecayedType>hasDecayedTypeMatcher<QualType> InnerType -
Matches the decayed type, whoes decayed type matches InnerMatcher
+
Matches the decayed type, whose decayed type matches InnerMatcher
 
@@ -8223,8 +8229,8 @@

AST Traversal Matchers

-Matcher<DeclRefExpr>hasDeclarationMatcher<Decl> InnerMatcher -
Matches a node if the declaration associated with that node
+Matcher<DeclRefExpr>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -8249,11 +8255,11 @@ 

AST Traversal Matchers

Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>, Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>, - Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, - Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, - Matcher<TagType>, Matcher<TemplateSpecializationType>, - Matcher<TemplateTypeParmType>, Matcher<TypedefType>, - Matcher<UnresolvedUsingType> + Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, + Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, + Matcher<TagType>, Matcher<TemplateSpecializationType>, + Matcher<TemplateTypeParmType>, Matcher<TypedefType>, + Matcher<UnresolvedUsingType>, Matcher<UsingType>
@@ -8373,24 +8379,11 @@

AST Traversal Matchers

} } -cxxRcordDecl(hasDeclContext(namedDecl(hasName("M")))) matches the +cxxRecordDecl(hasDeclContext(namedDecl(hasName("M")))) matches the declaration of class D.
-Matcher<DecltypeType>hasUnderlyingTypeMatcher<Type> -
Matches DecltypeType or UsingType nodes to find the underlying type.
-
-Given
-  decltype(1) a = 1;
-  decltype(2.0) b = 2.0;
-decltypeType(hasUnderlyingType(isInteger()))
-  matches the type of "a"
-
-Usable as: Matcher<DecltypeType>, Matcher<UsingType>
-
- - Matcher<DecompositionDecl>hasAnyBindingMatcher<BindingDecl> InnerMatcher
Matches any binding of a DecompositionDecl.
 
@@ -8451,66 +8444,16 @@ 

AST Traversal Matchers

Matcher<DoStmt>hasConditionMatcher<Expr> InnerMatcher -
Matches the condition expression of an if statement, for loop,
-switch statement or conditional operator.
+
Matches the condition expression of an if statement, for loop, while loop,
+do-while loop, switch statement or conditional operator.
 
 Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
   if (true) {}
 
-Matcher<ElaboratedTypeLoc>hasNamedTypeLocMatcher<TypeLoc> InnerMatcher -
Matches elaborated `TypeLoc`s that have a named `TypeLoc` matching
-`InnerMatcher`.
-
-Given
-  template <typename T>
-  class C {};
-  class C<int> c;
-
-  class D {};
-  class D d;
-elaboratedTypeLoc(hasNamedTypeLoc(templateSpecializationTypeLoc()));
-  matches the `TypeLoc` of the variable declaration of `c`, but not `d`.
-
- - -Matcher<ElaboratedType>hasQualifierMatcher<NestedNameSpecifier> InnerMatcher -
Matches ElaboratedTypes whose qualifier, a NestedNameSpecifier,
-matches InnerMatcher if the qualifier exists.
-
-Given
-  namespace N {
-    namespace M {
-      class D {};
-    }
-  }
-  N::M::D d;
-
-elaboratedType(hasQualifier(hasPrefix(specifiesNamespace(hasName("N"))))
-matches the type of the variable declaration of d.
-
- - -Matcher<ElaboratedType>namesTypeMatcher<QualType> InnerMatcher -
Matches ElaboratedTypes whose named type matches InnerMatcher.
-
-Given
-  namespace N {
-    namespace M {
-      class D {};
-    }
-  }
-  N::M::D d;
-
-elaboratedType(namesType(recordType(
-hasDeclaration(namedDecl(hasName("D")))))) matches the type of the variable
-declaration of d.
-
- - -Matcher<EnumType>hasDeclarationMatcher<Decl> InnerMatcher -
Matches a node if the declaration associated with that node
+Matcher<EnumType>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -8535,11 +8478,11 @@ 

AST Traversal Matchers

Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>, Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>, - Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, - Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, - Matcher<TagType>, Matcher<TemplateSpecializationType>, - Matcher<TemplateTypeParmType>, Matcher<TypedefType>, - Matcher<UnresolvedUsingType> + Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, + Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, + Matcher<TagType>, Matcher<TemplateSpecializationType>, + Matcher<TemplateTypeParmType>, Matcher<TypedefType>, + Matcher<UnresolvedUsingType>, Matcher<UsingType>
@@ -8788,14 +8731,26 @@

AST Traversal Matchers

Matcher<ForStmt>hasConditionMatcher<Expr> InnerMatcher -
Matches the condition expression of an if statement, for loop,
-switch statement or conditional operator.
+
Matches the condition expression of an if statement, for loop, while loop,
+do-while loop, switch statement or conditional operator.
 
 Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
   if (true) {}
 
+Matcher<ForStmt>hasConditionVariableStatementMatcher<DeclStmt> InnerMatcher +
Matches the condition variable statement in an if statement, for loop,
+while loop or switch statement.
+
+Given
+  if (A* a = GetAPointer()) {}
+  for (; A* a = GetAPointer(); ) {}
+hasConditionVariableStatement(...)
+  matches both 'A* a = GetAPointer()'.
+
+ + Matcher<ForStmt>hasIncrementMatcher<Stmt> InnerMatcher
Matches the increment statement of a for loop.
 
@@ -9099,8 +9054,8 @@ 

AST Traversal Matchers

Matcher<IfStmt>hasConditionMatcher<Expr> InnerMatcher -
Matches the condition expression of an if statement, for loop,
-switch statement or conditional operator.
+
Matches the condition expression of an if statement, for loop, while loop,
+do-while loop, switch statement or conditional operator.
 
 Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
   if (true) {}
@@ -9108,12 +9063,14 @@ 

AST Traversal Matchers

Matcher<IfStmt>hasConditionVariableStatementMatcher<DeclStmt> InnerMatcher -
Matches the condition variable statement in an if statement.
+
Matches the condition variable statement in an if statement, for loop,
+while loop or switch statement.
 
 Given
   if (A* a = GetAPointer()) {}
+  for (; A* a = GetAPointer(); ) {}
 hasConditionVariableStatement(...)
-  matches 'A* a = GetAPointer()'.
+  matches both 'A* a = GetAPointer()'.
 
@@ -9179,8 +9136,8 @@

AST Traversal Matchers

-Matcher<InjectedClassNameType>hasDeclarationMatcher<Decl> InnerMatcher -
Matches a node if the declaration associated with that node
+Matcher<InjectedClassNameType>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -9205,16 +9162,16 @@ 

AST Traversal Matchers

Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>, Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>, - Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, - Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, - Matcher<TagType>, Matcher<TemplateSpecializationType>, - Matcher<TemplateTypeParmType>, Matcher<TypedefType>, - Matcher<UnresolvedUsingType> + Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, + Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, + Matcher<TagType>, Matcher<TemplateSpecializationType>, + Matcher<TemplateTypeParmType>, Matcher<TypedefType>, + Matcher<UnresolvedUsingType>, Matcher<UsingType>
-Matcher<LabelStmt>hasDeclarationMatcher<Decl> InnerMatcher -
Matches a node if the declaration associated with that node
+Matcher<LabelStmt>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -9239,11 +9196,11 @@ 

AST Traversal Matchers

Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>, Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>, - Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, - Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, - Matcher<TagType>, Matcher<TemplateSpecializationType>, - Matcher<TemplateTypeParmType>, Matcher<TypedefType>, - Matcher<UnresolvedUsingType> + Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, + Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, + Matcher<TagType>, Matcher<TemplateSpecializationType>, + Matcher<TemplateTypeParmType>, Matcher<TypedefType>, + Matcher<UnresolvedUsingType>, Matcher<UsingType>
@@ -9293,8 +9250,8 @@

AST Traversal Matchers

-Matcher<MemberExpr>hasDeclarationMatcher<Decl> InnerMatcher -
Matches a node if the declaration associated with that node
+Matcher<MemberExpr>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -9319,11 +9276,11 @@ 

AST Traversal Matchers

Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>, Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>, - Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, - Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, - Matcher<TagType>, Matcher<TemplateSpecializationType>, - Matcher<TemplateTypeParmType>, Matcher<TypedefType>, - Matcher<UnresolvedUsingType> + Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, + Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, + Matcher<TagType>, Matcher<TemplateSpecializationType>, + Matcher<TemplateTypeParmType>, Matcher<TypedefType>, + Matcher<UnresolvedUsingType>, Matcher<UsingType>
@@ -9456,7 +9413,7 @@

AST Traversal Matchers

-Matcher<OMPExecutableDirective>hasAnyClauseMatcher<OMPClause> InnerMatcher +Matcher<OMPExecutableDirective>hasAnyClauseMatcher<OMPClause> InnerMatcher
Matches any clause in an OpenMP directive.
 
 Given
@@ -9469,7 +9426,7 @@ 

AST Traversal Matchers

-Matcher<OMPExecutableDirective>hasStructuredBlockMatcher<Stmt> InnerMatcher +Matcher<OMPExecutableDirective>hasStructuredBlockMatcher<Stmt> InnerMatcher
Matches the structured-block of the OpenMP executable directive
 
 Prerequisite: the executable directive must not be standalone directive.
@@ -9826,8 +9783,8 @@ 

AST Traversal Matchers

-Matcher<QualType>hasDeclarationMatcher<Decl> InnerMatcher -
Matches a node if the declaration associated with that node
+Matcher<QualType>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -9852,11 +9809,11 @@ 

AST Traversal Matchers

Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>, Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>, - Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, - Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, - Matcher<TagType>, Matcher<TemplateSpecializationType>, - Matcher<TemplateTypeParmType>, Matcher<TypedefType>, - Matcher<UnresolvedUsingType> + Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, + Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, + Matcher<TagType>, Matcher<TemplateSpecializationType>, + Matcher<TemplateTypeParmType>, Matcher<TypedefType>, + Matcher<UnresolvedUsingType>, Matcher<UsingType>
@@ -9920,8 +9877,8 @@

AST Traversal Matchers

-Matcher<RecordType>hasDeclarationMatcher<Decl> InnerMatcher -
Matches a node if the declaration associated with that node
+Matcher<RecordType>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -9946,11 +9903,11 @@ 

AST Traversal Matchers

Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>, Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>, - Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, - Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, - Matcher<TagType>, Matcher<TemplateSpecializationType>, - Matcher<TemplateTypeParmType>, Matcher<TypedefType>, - Matcher<UnresolvedUsingType> + Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, + Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, + Matcher<TagType>, Matcher<TemplateSpecializationType>, + Matcher<TemplateTypeParmType>, Matcher<TypedefType>, + Matcher<UnresolvedUsingType>, Matcher<UsingType>
@@ -10066,7 +10023,7 @@

AST Traversal Matchers

-Matcher<SubstTemplateTypeParmType>hasReplacementTypeMatcher<Type> +Matcher<SubstTemplateTypeParmType>hasReplacementTypeMatcher<Type>
Matches template type parameter substitutions that have a replacement
 type that matches the provided matcher.
 
@@ -10094,14 +10051,26 @@ 

AST Traversal Matchers

Matcher<SwitchStmt>hasConditionMatcher<Expr> InnerMatcher -
Matches the condition expression of an if statement, for loop,
-switch statement or conditional operator.
+
Matches the condition expression of an if statement, for loop, while loop,
+do-while loop, switch statement or conditional operator.
 
 Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
   if (true) {}
 
+Matcher<SwitchStmt>hasConditionVariableStatementMatcher<DeclStmt> InnerMatcher +
Matches the condition variable statement in an if statement, for loop,
+while loop or switch statement.
+
+Given
+  if (A* a = GetAPointer()) {}
+  for (; A* a = GetAPointer(); ) {}
+hasConditionVariableStatement(...)
+  matches both 'A* a = GetAPointer()'.
+
+ + Matcher<SwitchStmt>hasInitStatementMatcher<Stmt> InnerMatcher
Matches selection statements with initializer.
 
@@ -10125,8 +10094,8 @@ 

AST Traversal Matchers

-Matcher<TagType>hasDeclarationMatcher<Decl> InnerMatcher -
Matches a node if the declaration associated with that node
+Matcher<TagType>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -10151,11 +10120,11 @@ 

AST Traversal Matchers

Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>, Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>, - Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, - Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, - Matcher<TagType>, Matcher<TemplateSpecializationType>, - Matcher<TemplateTypeParmType>, Matcher<TypedefType>, - Matcher<UnresolvedUsingType> + Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, + Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, + Matcher<TagType>, Matcher<TemplateSpecializationType>, + Matcher<TemplateTypeParmType>, Matcher<TypedefType>, + Matcher<UnresolvedUsingType>, Matcher<UsingType>
@@ -10284,7 +10253,7 @@

AST Traversal Matchers

-Matcher<TemplateSpecializationType>forEachTemplateArgumentMatcher<TemplateArgument> InnerMatcher +Matcher<TemplateSpecializationType>forEachTemplateArgumentMatcher<TemplateArgument> InnerMatcher
Matches templateSpecializationType, class template specialization,
 variable template specialization, and function template specialization
 nodes where the template argument matches the inner matcher. This matcher
@@ -10310,7 +10279,7 @@ 

AST Traversal Matchers

-Matcher<TemplateSpecializationType>hasAnyTemplateArgumentMatcher<TemplateArgument> InnerMatcher +Matcher<TemplateSpecializationType>hasAnyTemplateArgumentMatcher<TemplateArgument> InnerMatcher
Matches templateSpecializationTypes, class template specializations,
 variable template specializations, and function template specializations
 that have at least one TemplateArgument matching the given InnerMatcher.
@@ -10332,8 +10301,8 @@ 

AST Traversal Matchers

-Matcher<TemplateSpecializationType>hasDeclarationMatcher<Decl> InnerMatcher -
Matches a node if the declaration associated with that node
+Matcher<TemplateSpecializationType>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -10358,15 +10327,15 @@ 

AST Traversal Matchers

Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>, Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>, - Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, - Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, - Matcher<TagType>, Matcher<TemplateSpecializationType>, - Matcher<TemplateTypeParmType>, Matcher<TypedefType>, - Matcher<UnresolvedUsingType> + Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, + Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, + Matcher<TagType>, Matcher<TemplateSpecializationType>, + Matcher<TemplateTypeParmType>, Matcher<TypedefType>, + Matcher<UnresolvedUsingType>, Matcher<UsingType>
-Matcher<TemplateSpecializationType>hasTemplateArgumentunsigned N, Matcher<TemplateArgument> InnerMatcher +Matcher<TemplateSpecializationType>hasTemplateArgumentunsigned N, Matcher<TemplateArgument> InnerMatcher
Matches templateSpecializationType, class template specializations,
 variable template specializations, and function template specializations
 where the n'th TemplateArgument matches the given InnerMatcher.
@@ -10387,8 +10356,8 @@ 

AST Traversal Matchers

-Matcher<TemplateTypeParmType>hasDeclarationMatcher<Decl> InnerMatcher -
Matches a node if the declaration associated with that node
+Matcher<TemplateTypeParmType>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -10413,11 +10382,11 @@ 

AST Traversal Matchers

Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>, Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>, - Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, - Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, - Matcher<TagType>, Matcher<TemplateSpecializationType>, - Matcher<TemplateTypeParmType>, Matcher<TypedefType>, - Matcher<UnresolvedUsingType> + Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, + Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, + Matcher<TagType>, Matcher<TemplateSpecializationType>, + Matcher<TemplateTypeParmType>, Matcher<TypedefType>, + Matcher<UnresolvedUsingType>, Matcher<UsingType>
@@ -10473,8 +10442,8 @@

AST Traversal Matchers

-Matcher<TypedefType>hasDeclarationMatcher<Decl> InnerMatcher -
Matches a node if the declaration associated with that node
+Matcher<TypedefType>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -10499,11 +10468,41 @@ 

AST Traversal Matchers

Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>, Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>, - Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, - Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, - Matcher<TagType>, Matcher<TemplateSpecializationType>, - Matcher<TemplateTypeParmType>, Matcher<TypedefType>, - Matcher<UnresolvedUsingType> + Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, + Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, + Matcher<TagType>, Matcher<TemplateSpecializationType>, + Matcher<TemplateTypeParmType>, Matcher<TypedefType>, + Matcher<UnresolvedUsingType>, Matcher<UsingType> +
+ + +Matcher<Type>hasQualifierMatcher<NestedNameSpecifier> InnerMatcher +
Matches Types whose qualifier, a NestedNameSpecifier,
+matches InnerMatcher if the qualifier exists.
+
+Given
+  namespace N {
+    namespace M {
+      class D {};
+    }
+  }
+  N::M::D d;
+
+elaboratedType(hasQualifier(hasPrefix(specifiesNamespace(hasName("N")))))
+matches the type of the variable declaration of d.
+
+ + +Matcher<Type>hasUnderlyingTypeMatcher<QualType> Inner +
Matches QualType nodes to find the underlying type.
+
+Given
+  decltype(1) a = 1;
+  decltype(2.0) b = 2.0;
+decltypeType(hasUnderlyingType(isInteger()))
+  matches the type of "a"
+
+Usable as: Matcher<QualType>
 
@@ -10556,8 +10555,8 @@

AST Traversal Matchers

-Matcher<UnresolvedUsingType>hasDeclarationMatcher<Decl> InnerMatcher -
Matches a node if the declaration associated with that node
+Matcher<UnresolvedUsingType>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
 matches the given matcher.
 
 The associated declaration is:
@@ -10582,11 +10581,11 @@ 

AST Traversal Matchers

Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>, Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>, - Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, - Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, - Matcher<TagType>, Matcher<TemplateSpecializationType>, - Matcher<TemplateTypeParmType>, Matcher<TypedefType>, - Matcher<UnresolvedUsingType> + Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>, + Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>, + Matcher<TagType>, Matcher<TemplateSpecializationType>, + Matcher<TemplateTypeParmType>, Matcher<TypedefType>, + Matcher<UnresolvedUsingType>, Matcher<UsingType>
@@ -10602,16 +10601,37 @@

AST Traversal Matchers

matches using X::b but not using X::a
-Matcher<UsingType>hasUnderlyingTypeMatcher<Type> -
Matches DecltypeType or UsingType nodes to find the underlying type.
+Matcher<UsingType>hasDeclarationMatcher<Decl>  InnerMatcher
+
Matches a node if the declaration associated with that node
+matches the given matcher.
 
-Given
-  decltype(1) a = 1;
-  decltype(2.0) b = 2.0;
-decltypeType(hasUnderlyingType(isInteger()))
-  matches the type of "a"
+The associated declaration is:
+- for type nodes, the declaration of the underlying type
+- for CallExpr, the declaration of the callee
+- for MemberExpr, the declaration of the referenced member
+- for CXXConstructExpr, the declaration of the constructor
+- for CXXNewExpr, the declaration of the operator new
+- for ObjCIvarExpr, the declaration of the ivar
+
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
-Usable as: Matcher<DecltypeType>, Matcher<UsingType>
+Usable as: Matcher<AddrLabelExpr>, Matcher<CallExpr>,
+  Matcher<CXXConstructExpr>, Matcher<CXXNewExpr>, Matcher<DeclRefExpr>,
+  Matcher<EnumType>, Matcher<InjectedClassNameType>, Matcher<LabelStmt>,
+  Matcher<MemberExpr>, Matcher<QualType>, Matcher<RecordType>,
+  Matcher<TagType>, Matcher<TemplateSpecializationType>,
+  Matcher<TemplateTypeParmType>, Matcher<TypedefType>,
+  Matcher<UnresolvedUsingType>, Matcher<UsingType>
 
@@ -10832,13 +10852,25 @@

AST Traversal Matchers

Matcher<WhileStmt>hasConditionMatcher<Expr> InnerMatcher -
Matches the condition expression of an if statement, for loop,
-switch statement or conditional operator.
+
Matches the condition expression of an if statement, for loop, while loop,
+do-while loop, switch statement or conditional operator.
 
 Example matches true (matcher = hasCondition(cxxBoolLiteral(equals(true))))
   if (true) {}
 
+ +Matcher<WhileStmt>hasConditionVariableStatementMatcher<DeclStmt> InnerMatcher +
Matches the condition variable statement in an if statement, for loop,
+while loop or switch statement.
+
+Given
+  if (A* a = GetAPointer()) {}
+  for (; A* a = GetAPointer(); ) {}
+hasConditionVariableStatement(...)
+  matches both 'A* a = GetAPointer()'.
+
+ From 1ec16765ceee059e870f7841b788df1c70700d9d Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 29 Oct 2025 09:43:12 +0000 Subject: [PATCH 088/539] [llvm][DebugInfo][ObjC] Fix argument order of setter/getter to DIObjCProperty constructor (#165401) Depends on: * https://github.com/llvm/llvm-project/pull/165373 This caused the `DW_AT_APPLE_property_(setter|getter)` to be inverted when compiling from LLVM IR. --- llvm/lib/AsmParser/LLParser.cpp | 4 ++-- llvm/test/DebugInfo/Generic/objc-property.ll | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 5164cec33e6f5..e7a04d98df2af 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -6341,8 +6341,8 @@ bool LLParser::parseDIObjCProperty(MDNode *&Result, bool IsDistinct) { #undef VISIT_MD_FIELDS Result = GET_OR_DISTINCT(DIObjCProperty, - (Context, name.Val, file.Val, line.Val, setter.Val, - getter.Val, attributes.Val, type.Val)); + (Context, name.Val, file.Val, line.Val, getter.Val, + setter.Val, attributes.Val, type.Val)); return false; } diff --git a/llvm/test/DebugInfo/Generic/objc-property.ll b/llvm/test/DebugInfo/Generic/objc-property.ll index 6dd0e01017780..53ccfefedbfae 100644 --- a/llvm/test/DebugInfo/Generic/objc-property.ll +++ b/llvm/test/DebugInfo/Generic/objc-property.ll @@ -15,27 +15,24 @@ ; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; FIXME: this should have a DW_AT_APPLE_property_getter tag ; CHECK: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("customGetterProp") -; CHECK: DW_AT_APPLE_property_setter ("customGetter") +; CHECK: DW_AT_APPLE_property_getter ("customGetter") ; CHECK: DW_AT_APPLE_property_attribute ; CHECK-SAME: DW_APPLE_PROPERTY_getter, DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_atomic, 
DW_APPLE_PROPERTY_unsafe_unretained ; -; FIXME: this should have a DW_AT_APPLE_property_setter tag ; CHECK: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("customSetterProp") -; CHECK: DW_AT_APPLE_property_getter ("customSetter:") +; CHECK: DW_AT_APPLE_property_setter ("customSetter:") ; CHECK: DW_AT_APPLE_property_attribute ; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_setter, DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; FIXME: the DW_AT_APPLE_property_(getter|setter) values are inverted ; CHECK: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("customAccessorsProp") -; CHECK: DW_AT_APPLE_property_getter ("customSetter:") -; CHECK: DW_AT_APPLE_property_setter ("customGetter") +; CHECK: DW_AT_APPLE_property_getter ("customGetter") +; CHECK: DW_AT_APPLE_property_setter ("customSetter:") ; CHECK: DW_AT_APPLE_property_attribute ; CHECK-SAME: DW_APPLE_PROPERTY_getter, DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_setter, DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained From e8c49dd9aaf47731ac72b1a252edae98376f57a9 Mon Sep 17 00:00:00 2001 From: Andrew Ng Date: Wed, 29 Oct 2025 10:35:20 +0000 Subject: [PATCH 089/539] [LLD][COFF] Fix manifest UAC trustInfo namespace (#165285) Fix manifest `trustInfo` to use the `urn:schemas-microsoft-com:asm.v3` namespace. Fixes https://github.com/llvm/llvm-project/issues/120394. 
--- lld/COFF/DriverUtils.cpp | 2 +- lld/test/COFF/Inputs/manifest-uac.test | 11 +++++ lld/test/COFF/manifest-uac.test | 33 +++++++++++++ lld/test/COFF/manifest.test | 65 ++++++++++++++------------ lld/test/COFF/manifestinput.test | 35 +++++++------- 5 files changed, 96 insertions(+), 50 deletions(-) create mode 100644 lld/test/COFF/Inputs/manifest-uac.test create mode 100644 lld/test/COFF/manifest-uac.test diff --git a/lld/COFF/DriverUtils.cpp b/lld/COFF/DriverUtils.cpp index 96ae2f0ddef6f..10a3934d53284 100644 --- a/lld/COFF/DriverUtils.cpp +++ b/lld/COFF/DriverUtils.cpp @@ -440,7 +440,7 @@ std::string LinkerDriver::createDefaultXml() { << "\n"; if (ctx.config.manifestUAC) { - os << " \n" + os << " \n" << " \n" << " \n" << " + + + + + + + diff --git a/lld/test/COFF/manifest-uac.test b/lld/test/COFF/manifest-uac.test new file mode 100644 index 0000000000000..d3a17c7282716 --- /dev/null +++ b/lld/test/COFF/manifest-uac.test @@ -0,0 +1,33 @@ +# REQUIRES: libxml2 + +# RUN: yaml2obj %p/Inputs/ret42.yaml -o %t.obj +# RUN: lld-link /out:%t.exe /entry:main \ +# RUN: /manifest:embed \ +# RUN: /manifestinput:%p/Inputs/manifest-uac.test %t.obj +# RUN: llvm-readobj --coff-resources %t.exe | FileCheck %s + +CHECK: Data ( +CHECK-NEXT: 0000: 3C3F786D 6C207665 7273696F 6E3D2231 |.| +CHECK-NEXT: 0070: 0A20203C 74727573 74496E66 6F20786D |. . . . . <| +CHECK-NEXT: 0120: 2F726571 75657374 65645072 6976696C |/requestedPrivil| +CHECK-NEXT: 0130: 65676573 3E0A2020 20203C2F 73656375 |eges>. . 
.| +CHECK-NEXT: 0160: 0A |.| +CHECK-NEXT: ) diff --git a/lld/test/COFF/manifest.test b/lld/test/COFF/manifest.test index 4910600bd3a17..09de96e9bccfa 100644 --- a/lld/test/COFF/manifest.test +++ b/lld/test/COFF/manifest.test @@ -10,7 +10,7 @@ MANIFEST: MANIFEST: -MANIFEST: +MANIFEST: MANIFEST: MANIFEST: MANIFEST: @@ -26,7 +26,7 @@ MANIFEST: UAC: UAC: -UAC: +UAC: UAC: UAC: UAC: @@ -43,7 +43,7 @@ UAC: DEPENDENCY: DEPENDENCY: -DEPENDENCY: +DEPENDENCY: DEPENDENCY: DEPENDENCY: DEPENDENCY: @@ -90,7 +90,7 @@ NOUACNODEP: SEVERALDEPS: SEVERALDEPS: -SEVERALDEPS: +SEVERALDEPS: SEVERALDEPS: SEVERALDEPS: SEVERALDEPS: @@ -139,31 +139,34 @@ EMBED: 0040: 6D61732D 6D696372 6F736F66 742D636F |mas-microsoft-co| EMBED: 0050: 6D3A6173 6D2E7631 220A2020 20202020 |m:asm.v1". | EMBED: 0060: 20202020 6D616E69 66657374 56657273 | manifestVers| EMBED: 0070: 696F6E3D 22312E30 223E0A20 203C7472 |ion="1.0">. . . . . | -EMBED: 0100: 203C2F72 65717565 73746564 50726976 | . . . . . | -EMBED: 0160: 20202020 3C617373 656D626C 79496465 | . . . . <| -EMBED: 01C0: 64657065 6E64656E 74417373 656D626C |dependentAssembl| -EMBED: 01D0: 793E0A20 20202020 203C6173 73656D62 |y>. . . ..| +EMBED: 0080: 75737449 6E666F20 786D6C6E 733D2275 |ustInfo xmlns="u| +EMBED: 0090: 726E3A73 6368656D 61732D6D 6963726F |rn:schemas-micro| +EMBED: 00A0: 736F6674 2D636F6D 3A61736D 2E763322 |soft-com:asm.v3"| +EMBED: 00B0: 3E0A2020 20203C73 65637572 6974793E |>. | +EMBED: 00C0: 0A202020 2020203C 72657175 65737465 |. . | +EMBED: 00E0: 20202020 20203C72 65717565 73746564 | . | +EMBED: 0140: 0A202020 203C2F73 65637572 6974793E |. | +EMBED: 0150: 0A20203C 2F747275 7374496E 666F3E0A |. .| +EMBED: 0160: 20203C64 6570656E 64656E63 793E0A20 | . | +EMBED: 0170: 2020203C 64657065 6E64656E 74417373 | . . | +EMBED: 01B0: 3C2F6465 70656E64 656E7441 7373656D |. . . . | +EMBED: 0200: 20203C61 7373656D 626C7949 64656E74 | . . 
..| EMBED: ) diff --git a/lld/test/COFF/manifestinput.test b/lld/test/COFF/manifestinput.test index 04af80a13312d..cbf27b1ea96b5 100644 --- a/lld/test/COFF/manifestinput.test +++ b/lld/test/COFF/manifestinput.test @@ -5,22 +5,21 @@ # RUN: /manifest:embed \ # RUN: /manifestuac:"level='requireAdministrator'" \ # RUN: /manifestinput:%p/Inputs/manifestinput.test %t.obj -# RUN: llvm-readobj --coff-resources --file-headers %t.exe | FileCheck %s \ -# RUN: -check-prefix TEST_EMBED +# RUN: llvm-readobj --coff-resources --file-headers %t.exe | FileCheck %s -TEST_EMBED: ResourceTableRVA: 0x2000 -TEST_EMBED-NEXT: ResourceTableSize: 0x2A0 -TEST_EMBED-DAG: Resources [ -TEST_EMBED-NEXT: Total Number of Resources: 1 -TEST_EMBED-DAG: Number of String Entries: 0 -TEST_EMBED-NEXT: Number of ID Entries: 1 -TEST_EMBED-NEXT: Type: MANIFEST (ID 24) [ -TEST_EMBED-NEXT: Table Offset: 0x18 -TEST_EMBED-NEXT: Number of String Entries: 0 -TEST_EMBED-NEXT: Number of ID Entries: 1 -TEST_EMBED-NEXT: Name: (ID 1) [ -TEST_EMBED-NEXT: Table Offset: 0x30 -TEST_EMBED-NEXT: Number of String Entries: 0 -TEST_EMBED-NEXT: Number of ID Entries: 1 -TEST_EMBED-NEXT: Language: (ID 1033) [ -TEST_EMBED-NEXT: Entry Offset: 0x48 +CHECK: ResourceTableRVA: 0x2000 +CHECK-NEXT: ResourceTableSize: 0x2C8 +CHECK-DAG: Resources [ +CHECK-NEXT: Total Number of Resources: 1 +CHECK-DAG: Number of String Entries: 0 +CHECK-NEXT: Number of ID Entries: 1 +CHECK-NEXT: Type: MANIFEST (ID 24) [ +CHECK-NEXT: Table Offset: 0x18 +CHECK-NEXT: Number of String Entries: 0 +CHECK-NEXT: Number of ID Entries: 1 +CHECK-NEXT: Name: (ID 1) [ +CHECK-NEXT: Table Offset: 0x30 +CHECK-NEXT: Number of String Entries: 0 +CHECK-NEXT: Number of ID Entries: 1 +CHECK-NEXT: Language: (ID 1033) [ +CHECK-NEXT: Entry Offset: 0x48 From 6e258c551e7131bcd41e8304371d874b93a3ab6d Mon Sep 17 00:00:00 2001 From: Dan Blackwell Date: Wed, 29 Oct 2025 10:36:47 +0000 Subject: [PATCH 090/539] [TSan][Test-Only][Darwin] Fix typo in external.cpp test (#165534) Occasionally 
this test fails in CI. There are two possible races that can occur, one of which is rare. Both are supposed to be handled, but because the test matches "read-only" and the runtime outputs "Read-only" (note the capital letter), the FileCheck fails. This patch fixes the miscapitalisation of the FileCheck string in the test. rdar://163398219 --- compiler-rt/test/tsan/Darwin/external.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/test/tsan/Darwin/external.cpp b/compiler-rt/test/tsan/Darwin/external.cpp index 3869c7abb7664..8372a1eb125f3 100644 --- a/compiler-rt/test/tsan/Darwin/external.cpp +++ b/compiler-rt/test/tsan/Darwin/external.cpp @@ -68,9 +68,9 @@ int main(int argc, char *argv[]) { // TEST2-NOT: WARNING: ThreadSanitizer // TEST3: WARNING: ThreadSanitizer: race on MyLibrary::MyObject - // TEST3: {{Modifying|read-only}} access of MyLibrary::MyObject at + // TEST3: {{Modifying|Read-only}} access of MyLibrary::MyObject at // TEST3: {{ObjectWrite|ObjectRead}} - // TEST3: Previous {{modifying|read-only}} access of MyLibrary::MyObject at + // TEST3: Previous {{modifying|Read-only}} access of MyLibrary::MyObject at // TEST3: {{ObjectWrite|ObjectRead}} // TEST3: Location is MyLibrary::MyObject of size 16 at // TEST3: {{ObjectCreate}} From 9b064ab58df08a55cd16fd76c9118b1ebc742c6b Mon Sep 17 00:00:00 2001 From: Dan Blackwell Date: Wed, 29 Oct 2025 10:38:53 +0000 Subject: [PATCH 091/539] [ASan][Test-Only][Darwin] Mark asan-symbolize-templated-cxx.cpp unsupported (#165410) This test is currently failing on some macOS CI nodes due to an issue with the system symbolizer. This patch marks this test unsupported while we wait for all CI nodes to be updated to a newer OS. 
rdar://160409885 --- .../test/asan/TestCases/Darwin/asan-symbolize-templated-cxx.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler-rt/test/asan/TestCases/Darwin/asan-symbolize-templated-cxx.cpp b/compiler-rt/test/asan/TestCases/Darwin/asan-symbolize-templated-cxx.cpp index 3d726a32b7eaa..5794f5dbadaec 100644 --- a/compiler-rt/test/asan/TestCases/Darwin/asan-symbolize-templated-cxx.cpp +++ b/compiler-rt/test/asan/TestCases/Darwin/asan-symbolize-templated-cxx.cpp @@ -1,4 +1,5 @@ // UNSUPPORTED: ios +// UNSUPPORTED: darwin // RUN: %clangxx_asan -O0 -g %s -o %t.executable // RUN: %env_asan_opts="symbolize=0" not %run %t.executable > %t_no_module_map.log 2>&1 // RUN: %asan_symbolize --force-system-symbolizer < %t_no_module_map.log 2>&1 | FileCheck %s From a005d4bc08ba05553fadceb88c1be42b83147207 Mon Sep 17 00:00:00 2001 From: Daniil Kovalev Date: Wed, 29 Oct 2025 13:54:55 +0300 Subject: [PATCH 092/539] [PAC][Driver] Support ptrauth flags only on ARM64 Darwin or with pauthtest ABI (#113152) Most ptrauth flags are ABI-affecting, so usually we do not want them to be exposed to end users. Allow them only in the following cases: - ARM64 Darwin (under certain conditions, some ptrauth driver flags are intended to be used in this case); - pauthtest ABI (it's intended to be used for experimenting with signing schema and the signing schema is explicitly encoded in the pauth elf marking). Leave `-faarch64-jump-table-hardening` available for all AArch64 targets since it's not ABI-affecting. 
--- clang/lib/Driver/ToolChains/Clang.cpp | 77 +++++++------ clang/test/Driver/aarch64-ptrauth.c | 107 ++++++++++++------ ...rch64-ignore-branch-protection-attribute.c | 8 +- 3 files changed, 123 insertions(+), 69 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 79edc561c551f..4e8f63ea49480 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1414,17 +1414,18 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args, GuardedControlStack = PBP.GuardedControlStack; } - bool HasPtrauthReturns = llvm::any_of(CmdArgs, [](const char *Arg) { - return StringRef(Arg) == "-fptrauth-returns"; - }); + Arg *PtrauthReturnsArg = Args.getLastArg(options::OPT_fptrauth_returns, + options::OPT_fno_ptrauth_returns); + bool HasPtrauthReturns = + PtrauthReturnsArg && + PtrauthReturnsArg->getOption().matches(options::OPT_fptrauth_returns); // GCS is currently untested with ptrauth-returns, but enabling this could be // allowed in future after testing with a suitable system. 
- if (HasPtrauthReturns && - (Scope != "none" || BranchProtectionPAuthLR || GuardedControlStack)) { + if (Scope != "none" || BranchProtectionPAuthLR || GuardedControlStack) { if (Triple.getEnvironment() == llvm::Triple::PAuthTest) D.Diag(diag::err_drv_unsupported_opt_for_target) << A->getAsString(Args) << Triple.getTriple(); - else + else if (HasPtrauthReturns) D.Diag(diag::err_drv_incompatible_options) << A->getAsString(Args) << "-fptrauth-returns"; } @@ -1670,34 +1671,42 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args, AddUnalignedAccessWarning(CmdArgs); - Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_intrinsics, - options::OPT_fno_ptrauth_intrinsics); - Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_calls, - options::OPT_fno_ptrauth_calls); - Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_returns, - options::OPT_fno_ptrauth_returns); - Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_auth_traps, - options::OPT_fno_ptrauth_auth_traps); - Args.addOptInFlag( - CmdArgs, options::OPT_fptrauth_vtable_pointer_address_discrimination, - options::OPT_fno_ptrauth_vtable_pointer_address_discrimination); - Args.addOptInFlag( - CmdArgs, options::OPT_fptrauth_vtable_pointer_type_discrimination, - options::OPT_fno_ptrauth_vtable_pointer_type_discrimination); - Args.addOptInFlag( - CmdArgs, options::OPT_fptrauth_type_info_vtable_pointer_discrimination, - options::OPT_fno_ptrauth_type_info_vtable_pointer_discrimination); - Args.addOptInFlag( - CmdArgs, options::OPT_fptrauth_function_pointer_type_discrimination, - options::OPT_fno_ptrauth_function_pointer_type_discrimination); - - Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_indirect_gotos, - options::OPT_fno_ptrauth_indirect_gotos); - Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_init_fini, - options::OPT_fno_ptrauth_init_fini); - Args.addOptInFlag(CmdArgs, - options::OPT_fptrauth_init_fini_address_discrimination, - options::OPT_fno_ptrauth_init_fini_address_discrimination); + if (Triple.isOSDarwin() || + 
(Triple.isOSLinux() && + Triple.getEnvironment() == llvm::Triple::PAuthTest)) { + Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_intrinsics, + options::OPT_fno_ptrauth_intrinsics); + Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_calls, + options::OPT_fno_ptrauth_calls); + Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_returns, + options::OPT_fno_ptrauth_returns); + Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_auth_traps, + options::OPT_fno_ptrauth_auth_traps); + Args.addOptInFlag( + CmdArgs, options::OPT_fptrauth_vtable_pointer_address_discrimination, + options::OPT_fno_ptrauth_vtable_pointer_address_discrimination); + Args.addOptInFlag( + CmdArgs, options::OPT_fptrauth_vtable_pointer_type_discrimination, + options::OPT_fno_ptrauth_vtable_pointer_type_discrimination); + Args.addOptInFlag( + CmdArgs, options::OPT_fptrauth_type_info_vtable_pointer_discrimination, + options::OPT_fno_ptrauth_type_info_vtable_pointer_discrimination); + Args.addOptInFlag( + CmdArgs, options::OPT_fptrauth_function_pointer_type_discrimination, + options::OPT_fno_ptrauth_function_pointer_type_discrimination); + Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_indirect_gotos, + options::OPT_fno_ptrauth_indirect_gotos); + } + if (Triple.isOSLinux() && + Triple.getEnvironment() == llvm::Triple::PAuthTest) { + Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_init_fini, + options::OPT_fno_ptrauth_init_fini); + Args.addOptInFlag( + CmdArgs, options::OPT_fptrauth_init_fini_address_discrimination, + options::OPT_fno_ptrauth_init_fini_address_discrimination); + Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_elf_got, + options::OPT_fno_ptrauth_elf_got); + } Args.addOptInFlag(CmdArgs, options::OPT_faarch64_jump_table_hardening, options::OPT_fno_aarch64_jump_table_hardening); diff --git a/clang/test/Driver/aarch64-ptrauth.c b/clang/test/Driver/aarch64-ptrauth.c index b080a77195c8c..a67e98fdda714 100644 --- a/clang/test/Driver/aarch64-ptrauth.c +++ b/clang/test/Driver/aarch64-ptrauth.c @@ 
-4,7 +4,8 @@ // NONE: "-cc1" // NONE-NOT: "-fptrauth- -// RUN: %clang -### -c --target=aarch64 \ +//// -fptauth-* driver flags on Linux are only supported with pauthtest ABI. +// RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest \ // RUN: -fno-ptrauth-intrinsics -fptrauth-intrinsics \ // RUN: -fno-ptrauth-calls -fptrauth-calls \ // RUN: -fno-ptrauth-returns -fptrauth-returns \ @@ -15,9 +16,43 @@ // RUN: -fno-ptrauth-indirect-gotos -fptrauth-indirect-gotos \ // RUN: -fno-ptrauth-init-fini -fptrauth-init-fini \ // RUN: -fno-ptrauth-init-fini-address-discrimination -fptrauth-init-fini-address-discrimination \ +// RUN: -fno-ptrauth-elf-got -fptrauth-elf-got \ // RUN: -fno-aarch64-jump-table-hardening -faarch64-jump-table-hardening \ -// RUN: %s 2>&1 | FileCheck %s --check-prefix=ALL -// ALL: "-cc1"{{.*}} "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-type-info-vtable-pointer-discrimination" "-fptrauth-indirect-gotos" "-fptrauth-init-fini" "-fptrauth-init-fini-address-discrimination" "-faarch64-jump-table-hardening" +// RUN: %s 2>&1 | FileCheck %s --check-prefix=ALL-LINUX-PAUTHABI +// RUN: %clang -### -c --target=aarch64-linux-pauthtest \ +// RUN: -fno-ptrauth-intrinsics -fptrauth-intrinsics \ +// RUN: -fno-ptrauth-calls -fptrauth-calls \ +// RUN: -fno-ptrauth-returns -fptrauth-returns \ +// RUN: -fno-ptrauth-auth-traps -fptrauth-auth-traps \ +// RUN: -fno-ptrauth-vtable-pointer-address-discrimination -fptrauth-vtable-pointer-address-discrimination \ +// RUN: -fno-ptrauth-vtable-pointer-type-discrimination -fptrauth-vtable-pointer-type-discrimination \ +// RUN: -fno-ptrauth-type-info-vtable-pointer-discrimination -fptrauth-type-info-vtable-pointer-discrimination \ +// RUN: -fno-ptrauth-indirect-gotos -fptrauth-indirect-gotos \ +// RUN: -fno-ptrauth-init-fini -fptrauth-init-fini \ +// RUN: 
-fno-ptrauth-init-fini-address-discrimination -fptrauth-init-fini-address-discrimination \ +// RUN: -fno-ptrauth-elf-got -fptrauth-elf-got \ +// RUN: -fno-aarch64-jump-table-hardening -faarch64-jump-table-hardening \ +// RUN: %s 2>&1 | FileCheck %s --check-prefix=ALL-LINUX-PAUTHABI +// ALL-LINUX-PAUTHABI: "-cc1"{{.*}} "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-type-info-vtable-pointer-discrimination" "-fptrauth-indirect-gotos" "-fptrauth-init-fini" "-fptrauth-init-fini-address-discrimination" "-fptrauth-elf-got"{{.*}} "-faarch64-jump-table-hardening" + +// RUN: %clang -### -c --target=aarch64-linux \ +// RUN: -fno-aarch64-jump-table-hardening -faarch64-jump-table-hardening \ +// RUN: %s 2>&1 | FileCheck %s --check-prefix=ALL-LINUX +// ALL-LINUX: "-cc1"{{.*}} "-faarch64-jump-table-hardening" + +//// Some -fptrauth-* flags are supported for ARM64 Darwin. 
+// RUN: %clang -### -c --target=arm64-darwin \ +// RUN: -fno-ptrauth-intrinsics -fptrauth-intrinsics \ +// RUN: -fno-ptrauth-calls -fptrauth-calls \ +// RUN: -fno-ptrauth-returns -fptrauth-returns \ +// RUN: -fno-ptrauth-auth-traps -fptrauth-auth-traps \ +// RUN: -fno-ptrauth-vtable-pointer-address-discrimination -fptrauth-vtable-pointer-address-discrimination \ +// RUN: -fno-ptrauth-vtable-pointer-type-discrimination -fptrauth-vtable-pointer-type-discrimination \ +// RUN: -fno-ptrauth-type-info-vtable-pointer-discrimination -fptrauth-type-info-vtable-pointer-discrimination \ +// RUN: -fno-ptrauth-indirect-gotos -fptrauth-indirect-gotos \ +// RUN: -fno-aarch64-jump-table-hardening -faarch64-jump-table-hardening \ +// RUN: %s 2>&1 | FileCheck %s --check-prefix=ALL-DARWIN +// ALL-DARWIN: "-cc1"{{.*}} "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-type-info-vtable-pointer-discrimination" "-fptrauth-indirect-gotos"{{.*}} "-faarch64-jump-table-hardening" // RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI1 // RUN: %clang -### -c --target=aarch64-linux-pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI1 @@ -40,7 +75,7 @@ // RUN: -fno-aarch64-jump-table-hardening %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2 //// Non-linux OS: pauthtest ABI has no effect in terms of passing ptrauth cc1 flags. 
-//// An error about unsupported ABI will be emitted later in pipeline (see ERR2 below) +//// An error about unsupported ABI will be emitted later in pipeline (see ERR3 below) // RUN: %clang -### -c --target=aarch64 -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2 // PAUTHABI2: "-cc1" @@ -55,10 +90,11 @@ // PAUTHABI3-NOT: "-fptrauth- // PAUTHABI3-NOT: "-faarch64-jump-table-hardening" -// RUN: not %clang -### -c --target=x86_64 -fptrauth-intrinsics -fptrauth-calls -fptrauth-returns -fptrauth-auth-traps \ +//// Non-pauthtest ABI. +// RUN: not %clang -### -c --target=aarch64-linux -fptrauth-intrinsics -fptrauth-calls -fptrauth-returns -fptrauth-auth-traps \ // RUN: -fptrauth-vtable-pointer-address-discrimination -fptrauth-vtable-pointer-type-discrimination \ // RUN: -fptrauth-type-info-vtable-pointer-discrimination -fptrauth-indirect-gotos -fptrauth-init-fini \ -// RUN: -fptrauth-init-fini-address-discrimination -faarch64-jump-table-hardening %s 2>&1 | FileCheck %s --check-prefix=ERR1 +// RUN: -fptrauth-init-fini-address-discrimination -fptrauth-elf-got %s 2>&1 | FileCheck %s --check-prefix=ERR1 // ERR1: error: unsupported option '-fptrauth-intrinsics' for target '{{.*}}' // ERR1-NEXT: error: unsupported option '-fptrauth-calls' for target '{{.*}}' // ERR1-NEXT: error: unsupported option '-fptrauth-returns' for target '{{.*}}' @@ -69,59 +105,64 @@ // ERR1-NEXT: error: unsupported option '-fptrauth-indirect-gotos' for target '{{.*}}' // ERR1-NEXT: error: unsupported option '-fptrauth-init-fini' for target '{{.*}}' // ERR1-NEXT: error: unsupported option '-fptrauth-init-fini-address-discrimination' for target '{{.*}}' -// ERR1-NEXT: error: unsupported option '-faarch64-jump-table-hardening' for target '{{.*}}' +// ERR1-NEXT: error: unsupported option '-fptrauth-elf-got' for target '{{.*}}' +//// Non-AArch64. 
+// RUN: not %clang -### -c --target=x86_64-linux -faarch64-jump-table-hardening %s 2>&1 | FileCheck %s --check-prefix=ERR2 +// ERR2: error: unsupported option '-faarch64-jump-table-hardening' for target '{{.*}}' + +//// Only support PAuth ABI for Linux as for now. +// RUN: not %clang -c --target=aarch64 -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR3 +// ERR3: error: unknown target ABI 'pauthtest' -// RUN: not %clang -c --target=aarch64 -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR2 //// The ABI is not specified explicitly, and for non-Linux pauthtest environment does not correspond //// to pauthtest ABI (each OS target defines this behavior separately). Do not emit an error. -// RUN: %clang -c --target=aarch64-pauthtest %s -o /dev/null -// ERR2: error: unknown target ABI 'pauthtest' +// RUN: %clang -c --target=aarch64-pauthtest %s -o /dev/null //// PAuth ABI is encoded as environment part of the triple, so don't allow to explicitly set other environments. -// RUN: not %clang -### -c --target=aarch64-linux-gnu -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR3 -// ERR3: error: unsupported option '-mabi=pauthtest' for target 'aarch64-unknown-linux-gnu' +// RUN: not %clang -### -c --target=aarch64-linux-gnu -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR4 +// ERR4: error: unsupported option '-mabi=pauthtest' for target 'aarch64-unknown-linux-gnu' // RUN: %clang -### -c --target=aarch64-linux-pauthtest -mabi=pauthtest %s //// The only branch protection option compatible with PAuthABI is BTI. 
// RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=pac-ret %s 2>&1 | \ -// RUN: FileCheck %s --check-prefix=ERR4_1 +// RUN: FileCheck %s --check-prefix=ERR5_1 // RUN: not %clang -### -c --target=aarch64-linux-pauthtest -mbranch-protection=pac-ret %s 2>&1 | \ -// RUN: FileCheck %s --check-prefix=ERR4_1 +// RUN: FileCheck %s --check-prefix=ERR5_1 // RUN: not %clang -### -c --target=aarch64 -fptrauth-returns -mbranch-protection=pac-ret %s 2>&1 | \ -// RUN: FileCheck %s --check-prefix=ERR4_2 -// ERR4_1: error: unsupported option '-mbranch-protection=pac-ret' for target 'aarch64-unknown-linux-pauthtest' -// ERR4_2: error: the combination of '-mbranch-protection=pac-ret' and '-fptrauth-returns' is incompatible +// RUN: FileCheck %s --check-prefix=ERR5_2 +// ERR5_1: error: unsupported option '-mbranch-protection=pac-ret' for target 'aarch64-unknown-linux-pauthtest' +// ERR5_2: error: the combination of '-mbranch-protection=pac-ret' and '-fptrauth-returns' is incompatible // RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=gcs %s 2>&1 | \ -// RUN: FileCheck %s --check-prefix=ERR5_1 +// RUN: FileCheck %s --check-prefix=ERR6_1 // RUN: not %clang -### -c --target=aarch64-linux-pauthtest -mbranch-protection=gcs %s 2>&1 | \ -// RUN: FileCheck %s --check-prefix=ERR5_1 +// RUN: FileCheck %s --check-prefix=ERR6_1 // RUN: not %clang -### -c --target=aarch64 -fptrauth-returns -mbranch-protection=gcs %s 2>&1 | \ -// RUN: FileCheck %s --check-prefix=ERR5_2 -// ERR5_1: error: unsupported option '-mbranch-protection=gcs' for target 'aarch64-unknown-linux-pauthtest' -// ERR5_2: error: the combination of '-mbranch-protection=gcs' and '-fptrauth-returns' is incompatible +// RUN: FileCheck %s --check-prefix=ERR6_2 +// ERR6_1: error: unsupported option '-mbranch-protection=gcs' for target 'aarch64-unknown-linux-pauthtest' +// ERR6_2: error: the combination of '-mbranch-protection=gcs' and '-fptrauth-returns' is incompatible 
// RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=standard %s 2>&1 | \ -// RUN: FileCheck %s --check-prefix=ERR6_1 +// RUN: FileCheck %s --check-prefix=ERR7_1 // RUN: not %clang -### -c --target=aarch64-linux-pauthtest -mbranch-protection=standard %s 2>&1 | \ -// RUN: FileCheck %s --check-prefix=ERR6_1 +// RUN: FileCheck %s --check-prefix=ERR7_1 // RUN: not %clang -### -c --target=aarch64 -fptrauth-returns -mbranch-protection=standard %s 2>&1 | \ -// RUN: FileCheck %s --check-prefix=ERR6_2 -// ERR6_1: error: unsupported option '-mbranch-protection=standard' for target 'aarch64-unknown-linux-pauthtest' -// ERR6_2: error: the combination of '-mbranch-protection=standard' and '-fptrauth-returns' is incompatible +// RUN: FileCheck %s --check-prefix=ERR7_2 +// ERR7_1: error: unsupported option '-mbranch-protection=standard' for target 'aarch64-unknown-linux-pauthtest' +// ERR7_2: error: the combination of '-mbranch-protection=standard' and '-fptrauth-returns' is incompatible // RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -msign-return-address=all %s 2>&1 | \ -// RUN: FileCheck %s --check-prefix=ERR7 +// RUN: FileCheck %s --check-prefix=ERR8 // RUN: not %clang -### -c --target=aarch64-linux-pauthtest -msign-return-address=all %s 2>&1 | \ -// RUN: FileCheck %s --check-prefix=ERR7 -// ERR7: error: unsupported option '-msign-return-address=all' for target 'aarch64-unknown-linux-pauthtest' +// RUN: FileCheck %s --check-prefix=ERR8 +// ERR8: error: unsupported option '-msign-return-address=all' for target 'aarch64-unknown-linux-pauthtest' // RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -msign-return-address=non-leaf %s 2>&1 | \ -// RUN: FileCheck %s --check-prefix=ERR8 +// RUN: FileCheck %s --check-prefix=ERR9 // RUN: not %clang -### -c --target=aarch64-linux-pauthtest -msign-return-address=non-leaf %s 2>&1 | \ -// RUN: FileCheck %s --check-prefix=ERR8 -// ERR8: error: unsupported option 
'-msign-return-address=non-leaf' for target 'aarch64-unknown-linux-pauthtest' +// RUN: FileCheck %s --check-prefix=ERR9 +// ERR9: error: unsupported option '-msign-return-address=non-leaf' for target 'aarch64-unknown-linux-pauthtest' // RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest -msign-return-address=none %s // RUN: %clang -### -c --target=aarch64-linux-pauthtest -msign-return-address=none %s diff --git a/clang/test/Frontend/aarch64-ignore-branch-protection-attribute.c b/clang/test/Frontend/aarch64-ignore-branch-protection-attribute.c index 32cc98dd4e037..e6605ce5c630f 100644 --- a/clang/test/Frontend/aarch64-ignore-branch-protection-attribute.c +++ b/clang/test/Frontend/aarch64-ignore-branch-protection-attribute.c @@ -1,7 +1,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang -target aarch64-linux-pauthtest %s -S -emit-llvm -o - 2>&1 | FileCheck --implicit-check-not=warning: %s -// RUN: %clang -target aarch64 -fptrauth-returns %s -S -emit-llvm -o - 2>&1 | FileCheck --implicit-check-not=warning: %s +// RUN: %clang -target aarch64-linux-pauthtest %s -S -emit-llvm -o - 2>&1 | FileCheck --implicit-check-not=warning: %s +// RUN: not %clang -target aarch64 -fptrauth-returns %s -S -emit-llvm -o - 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=PTRAUTH-RETURNS %s + +// Clang fails early, no LLVM IR output produced. +// PTRAUTH-RETURNS: clang: error: unsupported option '-fptrauth-returns' for target 'aarch64' +// PTRAUTH-RETURNS-NOT: attributes /// Unsupported with pauthtest, warning emitted __attribute__((target("branch-protection=pac-ret"))) void f1() {} From e87f5a4d9b8e914326ab5c6f5af7f6ce4e599c40 Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Wed, 29 Oct 2025 10:59:34 +0000 Subject: [PATCH 093/539] [mlir][tosa] Add support for mxint8 type in mxfp operations (#163642) This commit adds support for the OCP-MX INT8 type. 
This includes the following operations: MATMUL_T_BLOCK_SCALED, CAST_FROM_BLOCK_SCALED, CAST_TO_BLOCK_SCALED and CONST. The support is added via a custom TOSA type "!tosa.mxint8" due to the fact it is not yet a builtin type in mlir. This may change in the future, depending on how this type is used by other frameworks/dialects. Conversions to/from this type have not yet been implemented for the same reasoning. Co-authored-by: Tat Wai Chong --- .../Dialect/Tosa/IR/TosaComplianceData.h.inc | 17 ++++-- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h | 3 ++ .../Dialect/Tosa/IR/TosaProfileCompliance.h | 2 +- .../mlir/Dialect/Tosa/IR/TosaTypesBase.td | 33 +++++++----- mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 6 +++ .../Tosa/Transforms/TosaProfileCompliance.cpp | 3 ++ .../Tosa/Transforms/TosaValidation.cpp | 7 +-- mlir/test/Dialect/Tosa/ops.mlir | 21 ++++++++ .../tosa-validation-version-1p1-valid.mlir | 52 ++++++++++++++++++- 9 files changed, 121 insertions(+), 23 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc b/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc index 8b5934ff0630e..c774d870a8c45 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc @@ -572,6 +572,8 @@ extensionComplianceMap = { {{fp8e4m3T, fp8ue8m0T, fp8e4m3T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT}, {{fp8e5m2T, fp8ue8m0T, fp8e5m2T, fp8ue8m0T, fp32T}, + SpecificationVersion::V_1_1_DRAFT}, + {{mxint8T, fp8ue8m0T, mxint8T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT}}}}}, {"tosa.max_pool2d", {{{Extension::int16}, {{{i16T, i16T}, SpecificationVersion::V_1_0}}}, @@ -870,14 +872,16 @@ extensionComplianceMap = { {{fp6e2m3T, fp8ue8m0T, bf16T}, SpecificationVersion::V_1_1_DRAFT}, {{fp6e3m2T, fp8ue8m0T, bf16T}, SpecificationVersion::V_1_1_DRAFT}, {{fp8e4m3T, fp8ue8m0T, bf16T}, SpecificationVersion::V_1_1_DRAFT}, - {{fp8e5m2T, fp8ue8m0T, bf16T}, 
SpecificationVersion::V_1_1_DRAFT}}, + {{fp8e5m2T, fp8ue8m0T, bf16T}, SpecificationVersion::V_1_1_DRAFT}, + {{mxint8T, fp8ue8m0T, bf16T}, SpecificationVersion::V_1_1_DRAFT}}, allOf}, {{Extension::mxfp}, {{{fp4e2m1T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT}, {{fp6e2m3T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT}, {{fp6e3m2T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT}, {{fp8e4m3T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT}, - {{fp8e5m2T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT}}}}}, + {{fp8e5m2T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT}, + {{mxint8T, fp8ue8m0T, fp32T}, SpecificationVersion::V_1_1_DRAFT}}}}}, {"tosa.cast_to_block_scaled", {{{Extension::mxfp}, {{{bf16T, fp4e2m1T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}, @@ -885,12 +889,14 @@ extensionComplianceMap = { {{fp32T, fp6e2m3T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}, {{fp32T, fp6e3m2T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}, {{fp32T, fp8e4m3T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}, - {{fp32T, fp8e5m2T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}}}, + {{fp32T, fp8e5m2T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}, + {{fp32T, mxint8T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}}}, {{Extension::bf16, Extension::mxfp}, {{{bf16T, fp6e2m3T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}, {{bf16T, fp6e3m2T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}, {{bf16T, fp8e4m3T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}, - {{bf16T, fp8e5m2T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}}, + {{bf16T, fp8e5m2T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}, + {{bf16T, mxint8T, fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}}, allOf}}}, {"tosa.rescale", {{{Extension::int16}, @@ -908,7 +914,8 @@ extensionComplianceMap = { {{{fp8ue8m0T}, SpecificationVersion::V_1_1_DRAFT}, {{fp6e3m2T}, SpecificationVersion::V_1_1_DRAFT}, {{fp6e2m3T}, SpecificationVersion::V_1_1_DRAFT}, - {{fp4e2m1T}, 
SpecificationVersion::V_1_1_DRAFT}}}}}, + {{fp4e2m1T}, SpecificationVersion::V_1_1_DRAFT}, + {{mxint8T}, SpecificationVersion::V_1_1_DRAFT}}}}}, {"tosa.identity", {{{Extension::int4}, {{{i4T, i4T}, SpecificationVersion::V_1_0}}}, {{Extension::int16}, {{{i48T, i48T}, SpecificationVersion::V_1_0}}}, diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h index a15f073bc5fcb..2d4e7cf8b9dbd 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h @@ -179,6 +179,9 @@ Value createPadConstTensor(OpBuilder &builder, Location loc, Value src, // returns type of variable op RankedTensorType getVariableType(VariableOp variableOp); +// Returns the bitwidth of a TOSA tensor element type +unsigned getBitWidth(Type type); + } // namespace tosa } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h b/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h index 45d380c1b2e6c..ea58f49b64c44 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h @@ -70,7 +70,7 @@ class ProfileInfoDepot { private: TypeInfo convertTypeToInfo(Type type) { - return {type.getTypeID(), type.getIntOrFloatBitWidth()}; + return {type.getTypeID(), tosa::getBitWidth(type)}; } TypeInfo convertValueToInfo(Value value) { diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td index 93843e86fd378..414b51bf4b135 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td @@ -22,6 +22,12 @@ include "mlir/Dialect/Tosa/IR/TosaOpBase.td" // Tosa Type Definitions. //===----------------------------------------------------------------------===// +// The base class for Tosa dialect types. 
+class Tosa_Type traits = []> + : TypeDef { + let mnemonic = typeMnemonic; +} + // The base class of a quantized type. // Param tuple is: [bitwidth, zeropt, smantissa, sexp, low_end, high_end]. // Where low and high ends are 0,255 when unsigned, -128,127 when signed, for @@ -78,13 +84,26 @@ def Tosa_QuantizedInt : AnyTypeOf<[Tosa_QuantizedType<"uint8", [8], 0>, Tosa_QuantizedType<"int16", [16, 0], 1>, Tosa_QuantizedType<"int32", [32, 0], 1>]>; +//===----------------------------------------------------------------------===// +// Custom TOSA element types. +//===----------------------------------------------------------------------===// + +// MLIR doesn't have a builtin type for mxint8 yet. For now declared it as a +// custom TOSA type. This may be changed in the future. +def Tosa_MXInt8 : Tosa_Type<"mxint8", "mxint8"> { + let summary = "INT8 type as defined by OCP-MX"; + let description = [{ + 8-bit integer format with an implicit 1/64 scale defined by OCP-MX. + }]; +} + //===----------------------------------------------------------------------===// // Multi-category types. //===----------------------------------------------------------------------===// -def Tosa_AnyNumber : AnyTypeOf<[Tosa_Int, Tosa_QuantizedInt, AnyFloat], +def Tosa_AnyNumber : AnyTypeOf<[Tosa_Int, Tosa_QuantizedInt, AnyFloat, Tosa_MXInt8], "number">; -def Tosa_MXFPNumber : AnyTypeOf<[F8E4M3FN, F8E5M2, F4E2M1FN, F6E2M3FN, F6E3M2FN], +def Tosa_MXFPNumber : AnyTypeOf<[F8E4M3FN, F8E5M2, F4E2M1FN, F6E2M3FN, F6E3M2FN, Tosa_MXInt8], "micro-scaling format number">; def Tosa_MXFPScaleNumber : AnyTypeOf<[F8E8M0FNU], "micro-scaling format scale number">; @@ -265,16 +284,6 @@ def Tosa_Buffer : MemRefOf<[Tosa_AnyNumber]>; def Tosa_TupleBuffer : NestedTupleOf<[Tosa_Buffer]>; def Tosa_BufOrTuple : AnyTypeOf<[Tosa_Buffer, Tosa_TupleBuffer]>; -//===----------------------------------------------------------------------===// -// Tosa Type Definitions. 
-//===----------------------------------------------------------------------===// - -// The base class for Tosa dialect types. -class Tosa_Type traits = []> - : TypeDef { - let mnemonic = typeMnemonic; -} - //===----------------------------------------------------------------------===// // ShapeType //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index 0aff67f0b5eba..bf3810ff231da 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -606,6 +606,12 @@ Value mlir::tosa::createPadConstTensor(OpBuilder &builder, Location loc, return tosa::ConstOp::create(builder, loc, padConstType, padConstAttr); } +unsigned mlir::tosa::getBitWidth(Type type) { + if (dyn_cast(type)) + return 8; + return type.getIntOrFloatBitWidth(); +} + //===----------------------------------------------------------------------===// // TOSA Operator Verifiers. //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp index ab363ee6b4d2a..ddd9c70402fdc 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp @@ -31,6 +31,7 @@ TosaProfileCompliance::TosaProfileCompliance() { const TypeInfo fp6e3m2T = {mlir::Float6E3M2FNType::getTypeID(), 6}; const TypeInfo fp4e2m1T = {mlir::Float4E2M1FNType::getTypeID(), 4}; const TypeInfo fp8ue8m0T = {mlir::Float8E8M0FNUType::getTypeID(), 8}; + const TypeInfo mxint8T = {mlir::tosa::mxint8Type::getTypeID(), 8}; // The profile-based compliance content below is auto-generated by a script // in https://git.mlplatform.org/tosa/specification.git @@ -625,6 +626,8 @@ TosaProfileCompliance::stringifyTypeInfo(const TypeInfo &typeInfo) { return {"fp4e2m1"}; } else if (typeInfo.typeID == 
mlir::Float8E8M0FNUType::getTypeID()) { return {"fp8e8m0"}; + } else if (typeInfo.typeID == tosa::mxint8Type::getTypeID()) { + return {"mxint8"}; } llvm_unreachable("unknown type"); } diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp index 4d0b61acc4ea4..b54ed5585d72d 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp @@ -693,7 +693,7 @@ LogicalResult TosaValidation::levelCheckSize(Operation *op, << " shape dimension cannot be dynamic"; } - int64_t element_bits = type.getElementTypeBitWidth(); + int64_t element_bits = tosa::getBitWidth(getElementTypeOrSelf(type)); int64_t element_bytes = std::max(INT64_C(1), element_bits / 8); int64_t size = element_bytes * type.getNumElements(); @@ -1217,9 +1217,10 @@ bool TosaValidation::isValidElementType(Type type, const bool allowUnsigned) { return true; } } - } else if (mlir::isa(type)) { + } else if (isa(type)) + return true; + else if (isa(type)) return true; - } return false; } diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir index 865f712ce1a5a..22fde3b7d28a5 100644 --- a/mlir/test/Dialect/Tosa/ops.mlir +++ b/mlir/test/Dialect/Tosa/ops.mlir @@ -1269,6 +1269,13 @@ func.func @test_matmul_t_block_scaled_broadcast(%arg0: tensor, return %0 : tensor<4x8x16xf32> } +// ----- +// CHECK-LABEL: test_matmul_t_block_scaled_mxint8 +func.func @test_matmul_t_block_scaled_mxint8(%arg0: tensor<4x8x32x!tosa.mxint8>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<4x16x32x!tosa.mxint8>, %arg3: tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> { + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size : i32} : (tensor<4x8x32x!tosa.mxint8>, tensor<4x8x1xf8E8M0FNU>, tensor<4x16x32x!tosa.mxint8>, tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> + return %0 : tensor<4x8x16xf32> +} + // ----- // CHECK-LABEL: test_cast_from_block_scaled_static 
func.func @test_cast_from_block_scaled_static(%arg0: tensor<4x32xf4E2M1FN>, %arg1: tensor<4x1xf8E8M0FNU>) -> tensor<4x32xf32> { @@ -1296,3 +1303,17 @@ func.func @test_cast_to_block_scaled_unranked(%arg0: tensor<*xf32>) -> (tensor<* %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size} : (tensor<*xf32>) -> (tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU>) return %0#0, %0#1 : tensor<*xf4E2M1FN>, tensor<*xf8E8M0FNU> } + +// ----- +// CHECK-LABEL: test_cast_to_block_scaled_mxint8 +func.func @test_cast_to_block_scaled_mxint8(%arg0: tensor<4x32xf32>) -> (tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>) { + %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size : i32, stochastic_round = false} : (tensor<4x32xf32>) -> (tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>) + return %0#0, %0#1 : tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU> +} + +// ----- +// CHECK-LABEL: test_const_mxint8 +func.func @test_const_mxint8(%arg0 : index) -> tensor<2x!tosa.mxint8> { + %0 = "tosa.const"() {values = dense<"0x007F"> : tensor<2x!tosa.mxint8>} : () -> tensor<2x!tosa.mxint8> + return %0 : tensor<2x!tosa.mxint8> +} diff --git a/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir b/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir index f3d8dab2f6b0f..9bd7aa8f0783e 100644 --- a/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir +++ b/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir @@ -38,7 +38,7 @@ func.func @test_argmax_int64(%arg0: tensor<1x13x13x5xf32>) -> tensor<1x13x13xi64 // ----- // CHECK-LABEL: test_const_i64 -func.func @test_const_i64(%arg0 : index) -> tensor<4xi64> { +func.func @test_const_i64() -> tensor<4xi64> { %0 = "tosa.const"() {values = dense<[3, 0, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> return %0 : tensor<4xi64> } @@ -46,7 +46,7 @@ func.func @test_const_i64(%arg0 : index) -> tensor<4xi64> { // ----- // CHECK-LABEL: test_const_fp6e3m2 -func.func @test_const_fp6e3m2(%arg0 : index) -> 
tensor<4xf6E3M2FN> { +func.func @test_const_fp6e3m2() -> tensor<4xf6E3M2FN> { %0 = "tosa.const"() {values = dense<[0.0, 0.0, 0.0, 0.0]> : tensor<4xf6E3M2FN>} : () -> tensor<4xf6E3M2FN> return %0 : tensor<4xf6E3M2FN> } @@ -82,3 +82,51 @@ func.func @test_cast_to_block_scaled_static(%arg0: tensor<4x32xf32>) -> (tensor< %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size} : (tensor<4x32xf32>) -> (tensor<4x32xf6E3M2FN>, tensor<4x1xf8E8M0FNU>) return %0#0, %0#1 : tensor<4x32xf6E3M2FN>, tensor<4x1xf8E8M0FNU> } + +// ----- + +// CHECK-LABEL: test_cast_to_block_scaled_mxint8 +func.func @test_cast_to_block_scaled_mxint8(%arg0: tensor<4x32xf32>) -> (tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>) { + %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size : i32, stochastic_round = false} : (tensor<4x32xf32>) -> (tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>) + return %0#0, %0#1 : tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU> +} + +// ----- + +// CHECK-LABEL: test_const_fp6e3m2 +func.func @test_const_fp6e3m2() -> tensor<4xf6E3M2FN> { + %0 = "tosa.const"() {values = dense<[0.0, 0.0, 0.0, 0.0]> : tensor<4xf6E3M2FN>} : () -> tensor<4xf6E3M2FN> + return %0 : tensor<4xf6E3M2FN> +} + +// ----- + +// CHECK-LABEL: test_const_mxint8 +func.func @test_const_mxint8() -> tensor<2x!tosa.mxint8> { + %0 = "tosa.const"() {values = dense<["0x00", "0x7F"]> : tensor<2x!tosa.mxint8>} : () -> tensor<2x!tosa.mxint8> + return %0 : tensor<2x!tosa.mxint8> +} + +// ----- + +// CHECK-LABEL: test_cast_f4e2m1 +func.func @test_cast_f4e2m1(%arg0: tensor<13x21x3xf4E2M1FN>) -> tensor<13x21x3xbf16> { + %0 = tosa.cast %arg0 : (tensor<13x21x3xf4E2M1FN>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- + +// CHECK-LABEL: test_matmul_t_block_scaled_mxint8 +func.func @test_matmul_t_block_scaled_mxint8(%arg0: tensor<4x8x32x!tosa.mxint8>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<4x16x32x!tosa.mxint8>, %arg3: tensor<4x16x1xf8E8M0FNU>) -> 
tensor<4x8x16xf32> { + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size} : (tensor<4x8x32x!tosa.mxint8>, tensor<4x8x1xf8E8M0FNU>, tensor<4x16x32x!tosa.mxint8>, tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> + return %0 : tensor<4x8x16xf32> +} + +// ----- + +// CHECK-LABEL: test_cast_to_block_scaled_mxint8 +func.func @test_cast_to_block_scaled_mxint8(%arg0: tensor<4x32xf32>) -> (tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>) { + %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size : i32, stochastic_round = false} : (tensor<4x32xf32>) -> (tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>) + return %0#0, %0#1 : tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU> +} From cad8541f62f201b8911b6280b3677545bba9321e Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 29 Oct 2025 12:27:11 +0100 Subject: [PATCH 094/539] [flang][cmake] Set the usual linker flags for non-gtest unit tests (#165256) Flang also uses non-gtest based unittests, which don't go through the usual add_unittest() helper. These currently do not use the usual linker flags for unit tests. This means that in LTO builds, they do not disable LTO when building unit tests, which increases the build time. 
--- flang/unittests/CMakeLists.txt | 1 + llvm/cmake/modules/AddLLVM.cmake | 47 ++++++++++++++++++-------------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/flang/unittests/CMakeLists.txt b/flang/unittests/CMakeLists.txt index db04923e2943a..2d612e58dae24 100644 --- a/flang/unittests/CMakeLists.txt +++ b/flang/unittests/CMakeLists.txt @@ -48,6 +48,7 @@ function(add_flang_nongtest_unittest test_name) llvm_map_components_to_libnames(llvm_libs Support) endif() target_link_libraries(${test_name}${suffix} ${llvm_libs} ${ARG_UNPARSED_ARGUMENTS}) + set_unittest_link_flags(${test_name}${suffix}) if(NOT ARG_SLOW_TEST) add_dependencies(FlangUnitTests ${test_name}${suffix}) diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index 80e59a4df2433..7d40d309d538e 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -1747,6 +1747,31 @@ function(add_llvm_implicit_projects) llvm_add_implicit_projects(LLVM) endfunction(add_llvm_implicit_projects) +function(set_unittest_link_flags target_name) + # The runtime benefits of LTO don't outweight the compile time costs for + # tests. + if(LLVM_ENABLE_LTO) + if((UNIX OR MINGW) AND LINKER_IS_LLD) + if(LLVM_ENABLE_FATLTO AND NOT APPLE) + # When using FatLTO, just use relocatable linking. + set_property(TARGET ${target_name} APPEND_STRING PROPERTY + LINK_FLAGS " -Wl,--no-fat-lto-objects") + else() + set_property(TARGET ${target_name} APPEND_STRING PROPERTY + LINK_FLAGS " -Wl,--lto-O0") + endif() + elseif(LINKER_IS_LLD_LINK) + set_property(TARGET ${target_name} APPEND_STRING PROPERTY + LINK_FLAGS " /opt:lldlto=0") + elseif(APPLE AND NOT uppercase_LLVM_ENABLE_LTO STREQUAL "THIN") + set_property(TARGET ${target_name} APPEND_STRING PROPERTY + LINK_FLAGS " -Wl,-mllvm,-O0") + endif() + endif() + + target_link_options(${target_name} PRIVATE "${LLVM_UNITTEST_LINK_FLAGS}") +endfunction(set_unittest_link_flags) + # Generic support for adding a unittest. 
function(add_unittest test_suite test_name) if( NOT LLVM_BUILD_TESTS ) @@ -1770,27 +1795,7 @@ function(add_unittest test_suite test_name) get_subproject_title(subproject_title) set_target_properties(${test_name} PROPERTIES FOLDER "${subproject_title}/Tests/Unit") - # The runtime benefits of LTO don't outweight the compile time costs for tests. - if(LLVM_ENABLE_LTO) - if((UNIX OR MINGW) AND LINKER_IS_LLD) - if(LLVM_ENABLE_FATLTO AND NOT APPLE) - # When using FatLTO, just use relocatable linking. - set_property(TARGET ${test_name} APPEND_STRING PROPERTY - LINK_FLAGS " -Wl,--no-fat-lto-objects") - else() - set_property(TARGET ${test_name} APPEND_STRING PROPERTY - LINK_FLAGS " -Wl,--lto-O0") - endif() - elseif(LINKER_IS_LLD_LINK) - set_property(TARGET ${test_name} APPEND_STRING PROPERTY - LINK_FLAGS " /opt:lldlto=0") - elseif(APPLE AND NOT uppercase_LLVM_ENABLE_LTO STREQUAL "THIN") - set_property(TARGET ${target_name} APPEND_STRING PROPERTY - LINK_FLAGS " -Wl,-mllvm,-O0") - endif() - endif() - - target_link_options(${test_name} PRIVATE "${LLVM_UNITTEST_LINK_FLAGS}") + set_unittest_link_flags(${test_name}) set(outdir ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}) set_output_directory(${test_name} BINARY_DIR ${outdir} LIBRARY_DIR ${outdir}) From 96c5d2d51a51bbc90fbaf991711f01c74fc59c2b Mon Sep 17 00:00:00 2001 From: Anutosh Bhat Date: Wed, 29 Oct 2025 17:01:34 +0530 Subject: [PATCH 095/539] [clang-repl] Fix struct value printing for clang-repl in C mode (#165538) I added some logs to see the difference between C++ mode and C mode and I see this In C++ mode ``` clang-repl> struct S1{} s1; s1 [convertExprToValue] original Expr: DeclRefExpr | type: struct S1 [convertExprToValue] Ty: struct S1 [convertExprToValue] DesugaredTy: struct S1 [convertExprToValue] Treating lvalue record as reference (enters block 540) [convertExprToValue] Ty: struct S1 & (after block 540) [convertExprToValue] DesugaredTy: struct S1 & (after block 540) [computeInterfaceKind] Expr class: 
DeclRefExpr | isLValue: 1 (S1 &) @0x10c9ac058 ``` in C mode ``` (base) anutosh491@Anutoshs-MacBook-Air bin % ./clang-repl --Xcc=-xc --Xcc=-std=c23 clang-repl> struct S1{} s1; s1 [convertExprToValue] original Expr: ImplicitCastExpr | type: struct S1 [convertExprToValue] Ty: struct S1 [convertExprToValue] DesugaredTy: struct S1 [convertExprToValue] Ty: struct S1 (after block 540) [convertExprToValue] DesugaredTy: struct S1 (after block 540) [computeInterfaceKind] Expr class: ImplicitCastExpr | isLValue: 0 Stack dump without symbol names (ensure you have llvm-symbolizer in your PATH or set the environment var `LLVM_SYMBOLIZER_PATH` to point to it): s0 clang-repl 0x0000000103cca03c llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) + 88 1 clang-repl 0x0000000103cca61c PrintStackTraceSignalHandler(void*) + 28 2 clang-repl 0x0000000103cc7ee8 llvm::sys::RunSignalHandlers() + 152 3 clang-repl 0x0000000103ccbb54 SignalHandler(int, __siginfo*, void*) + 284 4 libsystem_platform.dylib 0x00000001887f4624 _sigtramp + 56 5 clang-repl 0x00000001079bee18 clang::Sema::CheckArgsForPlaceholders(llvm::MutableArrayRef) + 120 6 clang-repl 0x00000001079bee18 clang::Sema::CheckArgsForPlaceholders(llvm::MutableArrayRef) + 120 7 clang-repl 0x0000000107b823dc clang::Sema::BuildCXXNew(clang::SourceRange, bool, clang::SourceLocation, llvm::MutableArrayRef, clang::SourceLocation, clang::SourceRange, clang::QualType, clang::TypeSourceInfo*, std::__1::optional, clang::SourceRange, clang::Expr*) + 5672 8 clang-repl 0x000000010538c560 clang::Interpreter::convertExprToValue(clang::Expr*) + 2580 9 clang-repl 0x0000000105360774 clang::InProcessPrintingASTConsumer::HandleTopLevelDecl(clang::DeclGroupRef) + 252 10 clang-repl 0x000000010536a82c clang::IncrementalParser::ParseOrWrapTopLevelDecl() + 676 11 clang-repl 0x000000010536b554 clang::IncrementalParser::Parse(llvm::StringRef) + 712 12 clang-repl 0x000000010537e6b4 clang::Interpreter::Parse(llvm::StringRef) + 588 13 clang-repl 0x000000010537d73c 
clang::Interpreter::ParseAndExecute(llvm::StringRef, clang::Value*) + 72 14 clang-repl 0x000000010022db38 main + 3660 15 dyld 0x000000018841ab98 start + 6076 ``` So basically C mode wasn't entering block 540 as expressions like `s1` (where `s1` is a struct variable) are wrapped in an `ImplicitCastExpr`, which masks the underlying `DeclRefExpr` that is actually an `lvalue`.This patch unwraps the implicit cast with E->IgnoreImpCasts() before checking isLValue(), restoring correct detection of lvalue structs. --- .../lib/Interpreter/InterpreterValuePrinter.cpp | 5 +++-- clang/test/Interpreter/pretty-print.c | 16 +++++++++------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/clang/lib/Interpreter/InterpreterValuePrinter.cpp b/clang/lib/Interpreter/InterpreterValuePrinter.cpp index 0ed02f3bfabe8..cfa50ee908bf8 100644 --- a/clang/lib/Interpreter/InterpreterValuePrinter.cpp +++ b/clang/lib/Interpreter/InterpreterValuePrinter.cpp @@ -411,7 +411,8 @@ class InterfaceKindVisitor } InterfaceKind VisitReferenceType(const ReferenceType *Ty) { - ExprResult AddrOfE = S.CreateBuiltinUnaryOp(SourceLocation(), UO_AddrOf, E); + ExprResult AddrOfE = S.CreateBuiltinUnaryOp(SourceLocation(), UO_AddrOf, + E->IgnoreImpCasts()); assert(!AddrOfE.isInvalid() && "Can not create unary expression"); Args.push_back(AddrOfE.get()); return InterfaceKind::NoAlloc; @@ -537,7 +538,7 @@ llvm::Expected Interpreter::convertExprToValue(Expr *E) { QualType DesugaredTy = Ty.getDesugaredType(Ctx); // For lvalue struct, we treat it as a reference. 
- if (DesugaredTy->isRecordType() && E->isLValue()) { + if (DesugaredTy->isRecordType() && E->IgnoreImpCasts()->isLValue()) { DesugaredTy = Ctx.getLValueReferenceType(DesugaredTy); Ty = Ctx.getLValueReferenceType(Ty); } diff --git a/clang/test/Interpreter/pretty-print.c b/clang/test/Interpreter/pretty-print.c index d0712fb152107..9a7bf752238ab 100644 --- a/clang/test/Interpreter/pretty-print.c +++ b/clang/test/Interpreter/pretty-print.c @@ -78,14 +78,16 @@ int * null_ptr = (int*)0; null_ptr union U { int I; float F; } u; u.I = 12; u.I // CHECK-NEXT: (int) 12 -// TODO: _Bool, _Complex, _Atomic, and _BitInt -// struct S1{} s1; s1 -// TODO-CHECK-NEXT: (S1 &) @0x{{[0-9a-f]+}} +struct S1{} s1; s1 +// CHECK-NEXT: (S1 &) @0x{{[0-9a-f]+}} + +struct S2 {int d;} E = {22}; E +// CHECK-NEXT: (S2 &) @0x{{[0-9a-f]+}} -// struct S2 {int d;} E = {22}; E -// TODO-CHECK-NEXT: (struct S2 &) @0x{{[0-9a-f]+}} -// E.d -// TODO-CHECK-NEXT: (int) 22 +E.d +// CHECK-NEXT: (int) 22 + +// TODO: _Bool, _Complex, _Atomic, and _BitInt // ----------------------------------------------------------------------------- // Tentative definition handling (C99 6.9.2) From ad66274ba67f0e6ec38ae48db1d9d82159c1a66f Mon Sep 17 00:00:00 2001 From: Kunqiu Chen Date: Wed, 29 Oct 2025 19:39:49 +0800 Subject: [PATCH 096/539] [PredicateInfo] Drop redundant PredicateInfo annotation (#165434) See https://github.com/llvm/llvm-project/pull/165419#discussion_r2470208670 for details. The extra annotation `"; Has predicate info"` does not provide any extra information and might poison the UTC-generated checks introduced by #165419. 
--- llvm/lib/Transforms/Utils/PredicateInfo.cpp | 1 - llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp index a9ab3b3144829..27fed7340411b 100644 --- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp +++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp @@ -809,7 +809,6 @@ class PredicateInfoAnnotatedWriter : public AssemblyAnnotationWriter { void emitInstructionAnnot(const Instruction *I, formatted_raw_ostream &OS) override { if (const auto *PI = PredInfo->getPredicateInfoFor(I)) { - OS << "; Has predicate info\n"; if (const auto *PB = dyn_cast(PI)) { OS << "; branch predicate info { TrueEdge: " << PB->TrueEdge << " Comparison:" << *PB->Condition << " Edge: ["; diff --git a/llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll b/llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll index d9f6aed7d01c8..faf4bec61c935 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll @@ -6,13 +6,11 @@ ; Check we can use ssa.copy with unnamed types. 
; CHECK-LABEL: bb: -; CHECK: Has predicate info ; CHECK: branch predicate info { TrueEdge: 1 Comparison: %cmp1 = icmp ne ptr %arg, null Edge: [label %bb,label %bb1], RenamedOp: %arg } ; CHECK-NEXT: %arg.0 = bitcast ptr %arg to ptr ; CHECK-LABEL: bb1: -; CHECK: Has predicate info -; CHECK-NEXT: branch predicate info { TrueEdge: 0 Comparison: %cmp2 = icmp ne ptr null, %tmp Edge: [label %bb1,label %bb3], RenamedOp: %tmp } +; CHECK: branch predicate info { TrueEdge: 0 Comparison: %cmp2 = icmp ne ptr null, %tmp Edge: [label %bb1,label %bb3], RenamedOp: %tmp } ; CHECK-NEXT: %tmp.0 = bitcast ptr %tmp to ptr define void @f0(ptr %arg, ptr %tmp) { From 409ff26f3b1b2b69eb2911c18ed79ff6573f4726 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 29 Oct 2025 12:47:29 +0100 Subject: [PATCH 097/539] [MLIR] Remove unused include. NFC. --- mlir/include/mlir/Interfaces/ControlFlowInterfaces.h | 1 - 1 file changed, 1 deletion(-) diff --git a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h index 47afd252c6d68..bfc24c18429ed 100644 --- a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h +++ b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h @@ -18,7 +18,6 @@ #include "mlir/IR/Operation.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Support/DebugLog.h" #include "llvm/Support/raw_ostream.h" namespace mlir { From 5642f65e8040253d006e441568eb1a6fe0755c81 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 29 Oct 2025 11:50:36 +0000 Subject: [PATCH 098/539] [llvm][test] Skip object-property.ll debug-info test on AIX Fails with: ``` ******************** TEST 'LLVM :: DebugInfo/Generic/objc-property.ll' FAILED ******************** Exit Code: 2 Command Output (stdout): -- RUN: at line 1 ome/llvm/llvm-external-buildbots/workers/aix-ppc64/clang-ppc64-aix/build/bin/llc -filetype=obj -o - 
/home/llvm/llvm-external-buildbots/workers/aix-ppc64/clang-ppc64-aix/llvm-project/llvm/test/DebugInfo/Generic/objc-property.ll | /home/llvm/llvm-external-buildbots/workers/aix-ppc64/clang-ppc64-aix/build/bin/llvm-dwarfdump --debug-info - | /home/llvm/llvm-external-buildbots/workers/aix-ppc64/clang-ppc64-aix/build/bin/FileCheck /home/llvm/llvm-external-buildbots/workers/aix-ppc64/clang-ppc64-aix/llvm-project/llvm/test/DebugInfo/Generic/objc-property.ll executed command: /home/llvm/llvm-external-buildbots/workers/aix-ppc64/clang-ppc64-aix/build/bin/llc -filetype=obj -o - /home/llvm/llvm-external-buildbots/workers/aix-ppc64/clang-ppc64-aix/llvm-project/llvm/test/DebugInfo/Generic/objc-property.ll .---command stderr------------ | Assertion failed: Section && "Cannot switch to a null section!", file /home/llvm/llvm-external-buildbots/workers/aix-ppc64/clang-ppc64-aix/llvm-project/llvm/lib/MC/MCStreamer.cpp, line 1364, virtual void llvm::MCStreamer::switchSection(MCSection *, uint32_t)() | PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace and instructions to reproduce the bug. | Stack dump: | 0. 
Program arguments: /home/llvm/llvm-external-buildbots/workers/aix-ppc64/clang-ppc64-aix/build/bin/llc -filetype=obj -o - /home/llvm/llvm-external-buildbots/workers/aix-ppc64/clang-ppc64-aix/llvm-project/llvm/test/DebugInfo/Generic/objc-property.ll `----------------------------- error: command failed with exit status: -6 executed command: /home/llvm/llvm-external-buildbots/workers/aix-ppc64/clang-ppc64-aix/build/bin/llvm-dwarfdump --debug-info - .---command stderr------------ | error: -: The file was not recognized as a valid object file `----------------------------- error: command failed with exit status: 1 executed command: /home/llvm/llvm-external-buildbots/workers/aix-ppc64/clang-ppc64-aix/build/bin/FileCheck /home/llvm/llvm-external-buildbots/workers/aix-ppc64/clang-ppc64-aix/llvm-project/llvm/test/DebugInfo/Generic/objc-property.ll .---command stderr------------ | FileCheck error: '' is empty. | FileCheck command line: /home/llvm/llvm-external-buildbots/workers/aix-ppc64/clang-ppc64-aix/build/bin/FileCheck /home/llvm/llvm-external-buildbots/workers/aix-ppc64/clang-ppc64-aix/llvm-project/llvm/test/DebugInfo/Generic/objc-property.ll `----------------------------- error: command failed with exit status: 2 ``` Presumably due to unsupported debug-info section (see https://github.com/llvm/llvm-project/pull/71814) --- llvm/test/DebugInfo/Generic/objc-property.ll | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/test/DebugInfo/Generic/objc-property.ll b/llvm/test/DebugInfo/Generic/objc-property.ll index 53ccfefedbfae..007d1fe698b30 100644 --- a/llvm/test/DebugInfo/Generic/objc-property.ll +++ b/llvm/test/DebugInfo/Generic/objc-property.ll @@ -1,3 +1,5 @@ +; UNSUPPORTED: target={{.*}}-aix{{.*}} +; ; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump --debug-info - | FileCheck %s ; CHECK: DW_TAG_structure_type From 632b5a61c675a0ab25eca5761c528d06da20be30 Mon Sep 17 00:00:00 2001 From: Timur Golubovich Date: Wed, 29 Oct 2025 15:13:34 +0300 Subject: [PATCH 099/539] 
[lldb][DWARFASTParserClang] Added a check for the specialization existence (#154123) [lldb][DWARFASTParserClang] Added a check for the specialization existence While debugging an application with incorrect dwarf information, where DW_TAG_template_value_parameter was lost, I found that lldb does not check that the corresponding specialization exists. As a result, at the stage when ASTImporter works, the type is completed in such a way that it inherits from itself. And during the calculation of layout, an infinite recursion occurs. To catch this error, I added a corresponding check at the stage of restoring the type from dwarf information. I also added a trivial assert in clang to check that the class does not inherit from itself. --- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 11 + .../TypeSystem/Clang/TypeSystemClang.cpp | 5 + .../unittests/SymbolFile/DWARF/CMakeLists.txt | 3 +- .../DWARF/DWARFASTParserClangTests.cpp | 34 + .../Inputs/DW_AT_spec_decl_exists-test.yaml | 677 ++++++++++++++++++ 5 files changed, 729 insertions(+), 1 deletion(-) create mode 100644 lldb/unittests/SymbolFile/DWARF/Inputs/DW_AT_spec_decl_exists-test.yaml diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 82e9d867c3ac0..36bc17680f3fa 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -1901,6 +1901,17 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, m_ast.CreateClassTemplateSpecializationDecl( containing_decl_ctx, GetOwningClangModule(die), class_template_decl, tag_decl_kind, template_param_infos); + if (!class_specialization_decl) { + if (log) { + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF({0:p}) - Failed to create specialization for " + "clang::ClassTemplateDecl({1}, {2:p}).", + this, llvm::StringRef(attrs.name), class_template_decl); + } + return 
TypeSP(); + } + clang_type = m_ast.CreateClassTemplateSpecializationType(class_specialization_decl); diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 82dfe7e540717..6ec054d5eac05 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -1693,6 +1693,11 @@ TypeSystemClang::CreateClassTemplateSpecializationDecl( class_template_specialization_decl->setInstantiationOf(class_template_decl); class_template_specialization_decl->setTemplateArgs( TemplateArgumentList::CreateCopy(ast, args)); + void *insert_pos = nullptr; + if (class_template_decl->findSpecialization(args, insert_pos)) + return nullptr; + class_template_decl->AddSpecialization(class_template_specialization_decl, + insert_pos); class_template_specialization_decl->setDeclName( class_template_decl->getDeclName()); diff --git a/lldb/unittests/SymbolFile/DWARF/CMakeLists.txt b/lldb/unittests/SymbolFile/DWARF/CMakeLists.txt index eb2e00adba64b..88492188e794b 100644 --- a/lldb/unittests/SymbolFile/DWARF/CMakeLists.txt +++ b/lldb/unittests/SymbolFile/DWARF/CMakeLists.txt @@ -27,6 +27,7 @@ add_lldb_unittest(SymbolFileDWARFTests set(test_inputs test-dwarf.exe - DW_AT_default_value-test.yaml) + DW_AT_default_value-test.yaml + DW_AT_spec_decl_exists-test.yaml) add_unittest_inputs(SymbolFileDWARFTests "${test_inputs}") diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp index 0cae01de2902a..1abce6999874e 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp @@ -599,6 +599,40 @@ TEST_F(DWARFASTParserClangTests, TestDefaultTemplateParamParsing) { } } +TEST_F(DWARFASTParserClangTests, TestSpecDeclExistsError) { + // Tests that parsing a ClassTemplateSpecializationDecl that already exists + // 
is handled gracefully. + auto BufferOrError = llvm::MemoryBuffer::getFile( + GetInputFilePath("DW_AT_spec_decl_exists-test.yaml"), /*IsText=*/true); + ASSERT_TRUE(BufferOrError); + YAMLModuleTester t(BufferOrError.get()->getBuffer()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + DWARFDIE cu_die(unit, cu_entry); + + auto holder = std::make_unique("ast"); + auto &ast_ctx = *holder->GetAST(); + DWARFASTParserClangStub ast_parser(ast_ctx); + + llvm::SmallVector specializations; + for (DWARFDIE die : cu_die.children()) { + SymbolContext sc; + bool new_type = false; + auto type = ast_parser.ParseTypeFromDWARF(sc, die, &new_type); + llvm::StringRef die_name = llvm::StringRef(die.GetName()); + if (die_name.starts_with("_Optional_payload")) { + specializations.push_back(std::move(type)); + } + } + + ASSERT_EQ(specializations.size(), 2U); + ASSERT_NE(specializations[0], nullptr); + ASSERT_EQ(specializations[1], nullptr); +} + TEST_F(DWARFASTParserClangTests, TestUniqueDWARFASTTypeMap_CppInsertMapFind) { // This tests the behaviour of UniqueDWARFASTTypeMap under // following scenario: diff --git a/lldb/unittests/SymbolFile/DWARF/Inputs/DW_AT_spec_decl_exists-test.yaml b/lldb/unittests/SymbolFile/DWARF/Inputs/DW_AT_spec_decl_exists-test.yaml new file mode 100644 index 0000000000000..91245f09abbbf --- /dev/null +++ b/lldb/unittests/SymbolFile/DWARF/Inputs/DW_AT_spec_decl_exists-test.yaml @@ -0,0 +1,677 @@ +# struct Type {}; +# +# template struct _Optional_payload; +# +# template struct _Optional_payload<_Tp, true, false, false> {}; +# +# template +# struct _Optional_payload<_Tp, false, _Copy, _Move> +# : _Optional_payload<_Tp, true, false, false> {}; +# +# int main() { +# _Optional_payload X; +# } +# +# YAML generated on Linux using obj2yaml on the above program compiled with +# G++. 
This is malformed DWARF that is missing DW_TAG_template_value_parameter +# entries, which is important for the test because that makes the two +# specializations look like identical structure definitions. +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 + Entry: 0x1040 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_R ] + VAddr: 0x40 + Align: 0x8 + Offset: 0x40 + - Type: PT_INTERP + Flags: [ PF_R ] + FirstSec: .interp + LastSec: .interp + VAddr: 0x318 + Offset: 0x318 + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .interp + LastSec: .rela.dyn + Align: 0x1000 + Offset: 0x0 + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + FirstSec: .init + LastSec: .fini + VAddr: 0x1000 + Align: 0x1000 + Offset: 0x1000 + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .rodata + LastSec: .eh_frame + VAddr: 0x2000 + Align: 0x1000 + Offset: 0x2000 + - Type: PT_LOAD + Flags: [ PF_W, PF_R ] + FirstSec: .init_array + LastSec: .bss + VAddr: 0x3DF0 + Align: 0x1000 + Offset: 0x2DF0 + - Type: PT_DYNAMIC + Flags: [ PF_W, PF_R ] + FirstSec: .dynamic + LastSec: .dynamic + VAddr: 0x3E00 + Align: 0x8 + Offset: 0x2E00 + - Type: PT_NOTE + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .note.gnu.property + VAddr: 0x338 + Align: 0x8 + Offset: 0x338 + - Type: PT_NOTE + Flags: [ PF_R ] + FirstSec: .note.gnu.build-id + LastSec: .note.ABI-tag + VAddr: 0x358 + Align: 0x4 + Offset: 0x358 + - Type: PT_GNU_PROPERTY + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .note.gnu.property + VAddr: 0x338 + Align: 0x8 + Offset: 0x338 + - Type: PT_GNU_EH_FRAME + Flags: [ PF_R ] + FirstSec: .eh_frame_hdr + LastSec: .eh_frame_hdr + VAddr: 0x2004 + Align: 0x4 + Offset: 0x2004 + - Type: PT_GNU_STACK + Flags: [ PF_W, PF_R ] + Align: 0x10 + Offset: 0x0 + - Type: PT_GNU_RELRO + Flags: [ PF_R ] + FirstSec: .init_array + LastSec: .got + VAddr: 0x3DF0 + Offset: 0x2DF0 +Sections: + - Name: .interp + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x318 + 
AddressAlign: 0x1 + Content: 2F6C696236342F6C642D6C696E75782D7838362D36342E736F2E3200 + - Name: .note.gnu.property + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Address: 0x338 + AddressAlign: 0x8 + Notes: + - Name: GNU + Desc: 020000C0040000000300000000000000 + Type: NT_GNU_PROPERTY_TYPE_0 + - Name: .note.gnu.build-id + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Address: 0x358 + AddressAlign: 0x4 + Notes: + - Name: GNU + Desc: AF3A83002F03E80537DCB46B3E56062984AD2629 + Type: NT_PRPSINFO + - Name: .note.ABI-tag + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Address: 0x37C + AddressAlign: 0x4 + Notes: + - Name: GNU + Desc: '00000000030000000200000000000000' + Type: NT_VERSION + - Name: .gnu.hash + Type: SHT_GNU_HASH + Flags: [ SHF_ALLOC ] + Address: 0x3A0 + Link: .dynsym + AddressAlign: 0x8 + Header: + SymNdx: 0x5 + Shift2: 0x6 + BloomFilter: [ 0x810000 ] + HashBuckets: [ 0x5, 0x0 ] + HashValues: [ 0x6DCE65D1 ] + - Name: .dynsym + Type: SHT_DYNSYM + Flags: [ SHF_ALLOC ] + Address: 0x3C8 + Link: .dynstr + AddressAlign: 0x8 + - Name: .dynstr + Type: SHT_STRTAB + Flags: [ SHF_ALLOC ] + Address: 0x458 + AddressAlign: 0x1 + - Name: .gnu.version + Type: SHT_GNU_versym + Flags: [ SHF_ALLOC ] + Address: 0x4D6 + Link: .dynsym + AddressAlign: 0x2 + Entries: [ 0, 0, 2, 0, 0, 2 ] + - Name: .gnu.version_r + Type: SHT_GNU_verneed + Flags: [ SHF_ALLOC ] + Address: 0x4E8 + Link: .dynstr + AddressAlign: 0x8 + Dependencies: + - Version: 1 + File: libc.so.6 + Entries: + - Name: GLIBC_2.2.5 + Hash: 157882997 + Flags: 0 + Other: 2 + - Name: .rela.dyn + Type: SHT_RELA + Flags: [ SHF_ALLOC ] + Address: 0x508 + Link: .dynsym + AddressAlign: 0x8 + Relocations: + - Offset: 0x3DF0 + Type: R_X86_64_RELATIVE + Addend: 4384 + - Offset: 0x3DF8 + Type: R_X86_64_RELATIVE + Addend: 4320 + - Offset: 0x4008 + Type: R_X86_64_RELATIVE + Addend: 16392 + - Offset: 0x3FD8 + Symbol: _ITM_deregisterTMCloneTable + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FE0 + Symbol: __libc_start_main + Type: R_X86_64_GLOB_DAT + - Offset: 
0x3FE8 + Symbol: __gmon_start__ + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FF0 + Symbol: _ITM_registerTMCloneTable + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FF8 + Symbol: __cxa_finalize + Type: R_X86_64_GLOB_DAT + - Name: .init + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1000 + AddressAlign: 0x4 + Offset: 0x1000 + Content: F30F1EFA4883EC08488B05D92F00004885C07402FFD04883C408C3 + - Name: .plt + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1020 + AddressAlign: 0x10 + EntSize: 0x10 + Content: FF35A22F0000F2FF25A32F00000F1F00 + - Name: .plt.got + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1030 + AddressAlign: 0x10 + EntSize: 0x10 + Content: F30F1EFAF2FF25BD2F00000F1F440000 + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1040 + AddressAlign: 0x10 + Content: F30F1EFA31ED4989D15E4889E24883E4F050544C8D0556010000488D0DDF000000488D3DC1000000FF15722F0000F490488D3D992F0000488D05922F00004839F87415488B054E2F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D692F0000488D35622F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05252F00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D252F000000752B5548833D022F0000004889E5740C488B3D062F0000E829FFFFFFE864FFFFFFC605FD2E0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5B8000000005DC30F1F840000000000F30F1EFA41574C8D3DA32C000041564989D641554989F541544189FC55488D2D942C0000534C29FD4883EC08E88FFEFFFF48C1FD03741F31DB0F1F80000000004C89F24C89EE4489E741FF14DF4883C3014839DD75EA4883C4085B5D415C415D415E415FC366662E0F1F840000000000F30F1EFAC3 + - Name: .fini + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x11B8 + AddressAlign: 0x4 + Content: F30F1EFA4883EC084883C408C3 + - Name: .rodata + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_MERGE ] + Address: 0x2000 + AddressAlign: 0x4 + EntSize: 0x4 + Offset: 0x2000 + Content: '01000200' + - Name: .eh_frame_hdr + Type: SHT_PROGBITS + Flags: [ 
SHF_ALLOC ] + Address: 0x2004 + AddressAlign: 0x4 + Content: 011B033B38000000060000001CF0FFFF6C0000002CF0FFFF940000003CF0FFFF5400000025F1FFFFAC0000003CF1FFFFCC000000ACF1FFFF14010000 + - Name: .eh_frame + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x2040 + AddressAlign: 0x8 + Content: 1400000000000000017A5200017810011B0C070890010000140000001C000000E0EFFFFF2F00000000440710000000002400000034000000A8EFFFFF10000000000E10460E184A0F0B770880003F1A3A2A33242200000000140000005C00000090EFFFFF1000000000000000000000001C0000007400000071F0FFFF0F00000000450E108602430D06460C0708000000440000009400000068F0FFFF6500000000460E108F02490E188E03450E208D04450E288C05440E308606480E388307470E406E0E38410E30410E28420E20420E18420E10420E080010000000DC00000090F0FFFF050000000000000000000000 + - Name: .init_array + Type: SHT_INIT_ARRAY + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3DF0 + AddressAlign: 0x8 + EntSize: 0x8 + Offset: 0x2DF0 + Content: '2011000000000000' + - Name: .fini_array + Type: SHT_FINI_ARRAY + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3DF8 + AddressAlign: 0x8 + EntSize: 0x8 + Content: E010000000000000 + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3E00 + Link: .dynstr + AddressAlign: 0x8 + Entries: + - Tag: DT_NEEDED + Value: 0x1 + - Tag: DT_INIT + Value: 0x1000 + - Tag: DT_FINI + Value: 0x11B8 + - Tag: DT_INIT_ARRAY + Value: 0x3DF0 + - Tag: DT_INIT_ARRAYSZ + Value: 0x8 + - Tag: DT_FINI_ARRAY + Value: 0x3DF8 + - Tag: DT_FINI_ARRAYSZ + Value: 0x8 + - Tag: DT_GNU_HASH + Value: 0x3A0 + - Tag: DT_STRTAB + Value: 0x458 + - Tag: DT_SYMTAB + Value: 0x3C8 + - Tag: DT_STRSZ + Value: 0x7D + - Tag: DT_SYMENT + Value: 0x18 + - Tag: DT_DEBUG + Value: 0x0 + - Tag: DT_PLTGOT + Value: 0x3FC0 + - Tag: DT_RELA + Value: 0x508 + - Tag: DT_RELASZ + Value: 0xC0 + - Tag: DT_RELAENT + Value: 0x18 + - Tag: DT_FLAGS + Value: 0x8 + - Tag: DT_FLAGS_1 + Value: 0x8000001 + - Tag: DT_VERNEED + Value: 0x4E8 + - Tag: DT_VERNEEDNUM + Value: 0x1 + - Tag: DT_VERSYM 
+ Value: 0x4D6 + - Tag: DT_RELACOUNT + Value: 0x3 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Name: .got + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3FC0 + AddressAlign: 0x8 + EntSize: 0x8 + Content: '003E0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000' + - Name: .data + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4000 + AddressAlign: 0x8 + Content: '00000000000000000840000000000000' + - Name: .bss + Type: SHT_NOBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4010 + AddressAlign: 0x1 + Size: 0x8 + - Name: .comment + Type: SHT_PROGBITS + Flags: [ SHF_MERGE, SHF_STRINGS ] + AddressAlign: 0x1 + EntSize: 0x1 + Content: 4743433A20285562756E747520392E342E302D317562756E7475317E32302E30342E322920392E342E3000 + - Name: .debug_info + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 9E00000004000000000008013A0000000431000000FE00000029110000000000000F000000000000000000000002000000000101010803D2000000010105204D000000045F5470002D000000000305000000010108086A000000053600000000045F5470002D0000000006CD000000010B059A00000029110000000000000F00000000000000019C9A000000075800010C2F4D00000002916F00080405696E740000 + - Name: .debug_abbrev + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 011101250E130B030E1B0E1101120710170000021300030E0B0B3A0B3B0B390B0000031301030E0B0B3A0B3B0B390B01130000042F00030849130000051C004913380B0000062E013F19030E3A0B3B0B390B49131101120740189742190113000007340003083A0B3B0B390B4913021800000824000B0B3E0B0308000000 + - Name: .debug_line + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 3D00000003001F0000000101FB0E0D000101010100000001000001006D61696E2E6370700000000000050C0009022911000000000000030A010501840207000101 +Symbols: + - Name: .interp + Type: STT_SECTION + Section: .interp + Value: 0x318 + - Name: .note.gnu.property + Type: 
STT_SECTION + Section: .note.gnu.property + Value: 0x338 + - Name: .note.gnu.build-id + Type: STT_SECTION + Section: .note.gnu.build-id + Value: 0x358 + - Name: .note.ABI-tag + Type: STT_SECTION + Section: .note.ABI-tag + Value: 0x37C + - Name: .gnu.hash + Type: STT_SECTION + Section: .gnu.hash + Value: 0x3A0 + - Name: .dynsym + Type: STT_SECTION + Section: .dynsym + Value: 0x3C8 + - Name: .dynstr + Type: STT_SECTION + Section: .dynstr + Value: 0x458 + - Name: .gnu.version + Type: STT_SECTION + Section: .gnu.version + Value: 0x4D6 + - Name: .gnu.version_r + Type: STT_SECTION + Section: .gnu.version_r + Value: 0x4E8 + - Name: .rela.dyn + Type: STT_SECTION + Section: .rela.dyn + Value: 0x508 + - Name: .init + Type: STT_SECTION + Section: .init + Value: 0x1000 + - Name: .plt + Type: STT_SECTION + Section: .plt + Value: 0x1020 + - Name: .plt.got + Type: STT_SECTION + Section: .plt.got + Value: 0x1030 + - Name: .text + Type: STT_SECTION + Section: .text + Value: 0x1040 + - Name: .fini + Type: STT_SECTION + Section: .fini + Value: 0x11B8 + - Name: .rodata + Type: STT_SECTION + Section: .rodata + Value: 0x2000 + - Name: .eh_frame_hdr + Type: STT_SECTION + Section: .eh_frame_hdr + Value: 0x2004 + - Name: .eh_frame + Type: STT_SECTION + Section: .eh_frame + Value: 0x2040 + - Name: .init_array + Type: STT_SECTION + Section: .init_array + Value: 0x3DF0 + - Name: .fini_array + Type: STT_SECTION + Section: .fini_array + Value: 0x3DF8 + - Name: .dynamic + Type: STT_SECTION + Section: .dynamic + Value: 0x3E00 + - Name: .got + Type: STT_SECTION + Section: .got + Value: 0x3FC0 + - Name: .data + Type: STT_SECTION + Section: .data + Value: 0x4000 + - Name: .bss + Type: STT_SECTION + Section: .bss + Value: 0x4010 + - Name: .comment + Type: STT_SECTION + Section: .comment + - Name: .debug_aranges + Type: STT_SECTION + Section: .debug_aranges + - Name: .debug_info + Type: STT_SECTION + Section: .debug_info + - Name: .debug_abbrev + Type: STT_SECTION + Section: .debug_abbrev + - Name: 
.debug_line + Type: STT_SECTION + Section: .debug_line + - Name: .debug_str + Type: STT_SECTION + Section: .debug_str + - Name: crtstuff.c + Type: STT_FILE + Index: SHN_ABS + - Name: deregister_tm_clones + Type: STT_FUNC + Section: .text + Value: 0x1070 + - Name: register_tm_clones + Type: STT_FUNC + Section: .text + Value: 0x10A0 + - Name: __do_global_dtors_aux + Type: STT_FUNC + Section: .text + Value: 0x10E0 + - Name: completed.8061 + Type: STT_OBJECT + Section: .bss + Value: 0x4010 + Size: 0x1 + - Name: __do_global_dtors_aux_fini_array_entry + Type: STT_OBJECT + Section: .fini_array + Value: 0x3DF8 + - Name: frame_dummy + Type: STT_FUNC + Section: .text + Value: 0x1120 + - Name: __frame_dummy_init_array_entry + Type: STT_OBJECT + Section: .init_array + Value: 0x3DF0 + - Name: main.cpp + Type: STT_FILE + Index: SHN_ABS + - Name: 'crtstuff.c (1)' + Type: STT_FILE + Index: SHN_ABS + - Name: __FRAME_END__ + Type: STT_OBJECT + Section: .eh_frame + Value: 0x212C + - Type: STT_FILE + Index: SHN_ABS + - Name: __init_array_end + Section: .init_array + Value: 0x3DF8 + - Name: _DYNAMIC + Type: STT_OBJECT + Section: .dynamic + Value: 0x3E00 + - Name: __init_array_start + Section: .init_array + Value: 0x3DF0 + - Name: __GNU_EH_FRAME_HDR + Section: .eh_frame_hdr + Value: 0x2004 + - Name: _GLOBAL_OFFSET_TABLE_ + Type: STT_OBJECT + Section: .got + Value: 0x3FC0 + - Name: _init + Type: STT_FUNC + Section: .init + Value: 0x1000 + - Name: __libc_csu_fini + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x11B0 + Size: 0x5 + - Name: _ITM_deregisterTMCloneTable + Binding: STB_WEAK + - Name: data_start + Section: .data + Binding: STB_WEAK + Value: 0x4000 + - Name: _edata + Section: .data + Binding: STB_GLOBAL + Value: 0x4010 + - Name: _fini + Type: STT_FUNC + Section: .fini + Binding: STB_GLOBAL + Value: 0x11B8 + Other: [ STV_HIDDEN ] + - Name: '__libc_start_main@@GLIBC_2.2.5' + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: __data_start + Section: .data + Binding: 
STB_GLOBAL + Value: 0x4000 + - Name: __gmon_start__ + Binding: STB_WEAK + - Name: __dso_handle + Type: STT_OBJECT + Section: .data + Binding: STB_GLOBAL + Value: 0x4008 + Other: [ STV_HIDDEN ] + - Name: _IO_stdin_used + Type: STT_OBJECT + Section: .rodata + Binding: STB_GLOBAL + Value: 0x2000 + Size: 0x4 + - Name: __libc_csu_init + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1140 + Size: 0x65 + - Name: _end + Section: .bss + Binding: STB_GLOBAL + Value: 0x4018 + - Name: _start + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1040 + Size: 0x2F + - Name: __bss_start + Section: .bss + Binding: STB_GLOBAL + Value: 0x4010 + - Name: main + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1129 + Size: 0xF + - Name: __TMC_END__ + Type: STT_OBJECT + Section: .data + Binding: STB_GLOBAL + Value: 0x4010 + Other: [ STV_HIDDEN ] + - Name: _ITM_registerTMCloneTable + Binding: STB_WEAK + - Name: '__cxa_finalize@@GLIBC_2.2.5' + Type: STT_FUNC + Binding: STB_WEAK +DynamicSymbols: + - Name: _ITM_deregisterTMCloneTable + Binding: STB_WEAK + - Name: __libc_start_main + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: __gmon_start__ + Binding: STB_WEAK + - Name: _ITM_registerTMCloneTable + Binding: STB_WEAK + - Name: __cxa_finalize + Type: STT_FUNC + Binding: STB_WEAK +DWARF: + debug_str: + - Type + - '_Optional_payload' + - main.cpp + - 'GNU C++14 9.4.0 -mtune=generic -march=x86-64 -g -O0 -fasynchronous-unwind-tables -fstack-protector-strong -fstack-clash-protection -fcf-protection' + - main + - '_Optional_payload' + - '/root/os-llvm/llvm-project' + debug_aranges: + - Length: 0x2C + Version: 2 + CuOffset: 0x0 + AddressSize: 0x8 + Descriptors: + - Address: 0x1129 + Length: 0xF +... 
From 2584f80270210ea72217168bc9b8e0c222bcc9f8 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 29 Oct 2025 12:14:56 +0000 Subject: [PATCH 100/539] [llvm][Bitcode][ObjC] Fix order of setter/getter argument to DIObjCProperty constructor (#165421) Depends on: * https://github.com/llvm/llvm-project/pull/165401 We weren't testing `DIObjCProperty` roundtripping. So this was never caught. The consequence of this is that the `setter:` would have the getter name and `getter:` would have the setter name. --- llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 5 ++- llvm/test/Bitcode/dwarf-objc-property.ll | 46 ++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Bitcode/dwarf-objc-property.ll diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index ed0443f599a44..4df500b948abf 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -2323,8 +2323,9 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( GET_OR_DISTINCT(DIObjCProperty, (Context, getMDString(Record[1]), getMDOrNull(Record[2]), Record[3], - getMDString(Record[4]), getMDString(Record[5]), - Record[6], getDITypeRefOrNull(Record[7]))), + /*GetterName=*/getMDString(Record[5]), + /*SetterName=*/getMDString(Record[4]), Record[6], + getDITypeRefOrNull(Record[7]))), NextMetadataNo); NextMetadataNo++; break; diff --git a/llvm/test/Bitcode/dwarf-objc-property.ll b/llvm/test/Bitcode/dwarf-objc-property.ll new file mode 100644 index 0000000000000..f054f572feffa --- /dev/null +++ b/llvm/test/Bitcode/dwarf-objc-property.ll @@ -0,0 +1,46 @@ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s +; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s + +; CHECK: !DIObjCProperty(name: "autoSynthProp", file: !3, line: 5, attributes: 2316, type: !8) +; CHECK: !DIObjCProperty(name: "synthProp", file: !3, line: 6, attributes: 2316, type: !8) +; CHECK: !DIObjCProperty(name: 
"customGetterProp", file: !3, line: 7, getter: "customGetter", attributes: 2318, type: !8) +; CHECK: !DIObjCProperty(name: "customSetterProp", file: !3, line: 8, setter: "customSetter:", attributes: 2444, type: !8) +; CHECK: !DIObjCProperty(name: "customAccessorsProp", file: !3, line: 9, setter: "customSetter:", getter: "customGetter", attributes: 2446, type: !8) + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = distinct !DICompileUnit(language: DW_LANG_ObjC, file: !3, producer: "hand written", isOptimized: false, runtimeVersion: 2, emissionKind: FullDebug, retainedTypes: !4, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: Apple) +!3 = !DIFile(filename: "main.m", directory: "/tmp") +!4 = !{!5} +!5 = !DICompositeType(tag: DW_TAG_structure_type, name: "Foo", scope: !3, file: !3, line: 1, size: 128, flags: DIFlagObjcClassComplete, elements: !6, runtimeLang: DW_LANG_ObjC) +!6 = !{!7, !9, !10, !11, !12, !13, !14, !15, !16, !17, !24, !27, !28, !29, !30, !31, !32} +!7 = !DIObjCProperty(name: "autoSynthProp", file: !3, line: 5, attributes: 2316, type: !8) +!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!9 = !DIObjCProperty(name: "synthProp", file: !3, line: 6, attributes: 2316, type: !8) +!10 = !DIObjCProperty(name: "customGetterProp", file: !3, line: 7, getter: "customGetter", attributes: 2318, type: !8) +!11 = !DIObjCProperty(name: "customSetterProp", file: !3, line: 8, setter: "customSetter:", attributes: 2444, type: !8) +!12 = !DIObjCProperty(name: "customAccessorsProp", file: !3, line: 9, setter: "customSetter:", getter: "customGetter", attributes: 2446, type: !8) +!13 = !DIDerivedType(tag: DW_TAG_member, name: "someBackingIvar", scope: !3, file: !3, line: 2, baseType: !8, size: 32, flags: DIFlagProtected, extraData: !9) +!14 = !DIDerivedType(tag: DW_TAG_member, name: "_autoSynthProp", scope: !3, file: !3, line: 5, baseType: !8, 
size: 32, flags: DIFlagPrivate, extraData: !7) +!15 = !DIDerivedType(tag: DW_TAG_member, name: "_customGetterProp", scope: !3, file: !3, line: 7, baseType: !8, size: 32, flags: DIFlagPrivate, extraData: !10) +!16 = !DIDerivedType(tag: DW_TAG_member, name: "_customSetterProp", scope: !3, file: !3, line: 8, baseType: !8, size: 32, flags: DIFlagPrivate, extraData: !11) +!17 = !DISubprogram(name: "-[Foo customGetter]", scope: !5, file: !3, line: 19, type: !18, scopeLine: 19, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit) +!18 = !DISubroutineType(types: !19) +!19 = !{!8, !20, !21} +!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer) +!21 = !DIDerivedType(tag: DW_TAG_typedef, name: "SEL", file: !3, baseType: !22, flags: DIFlagArtificial) +!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 64) +!23 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_selector", file: !3, flags: DIFlagFwdDecl) +!24 = !DISubprogram(name: "-[Foo customSetter:]", scope: !5, file: !3, line: 23, type: !25, scopeLine: 23, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit) +!25 = !DISubroutineType(types: !26) +!26 = !{null, !20, !21, !8} +!27 = !DISubprogram(name: "-[Foo synthProp]", scope: !5, file: !3, line: 17, type: !18, scopeLine: 17, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit) +!28 = !DISubprogram(name: "-[Foo setSynthProp:]", scope: !5, file: !3, line: 17, type: !25, scopeLine: 17, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit) +!29 = !DISubprogram(name: "-[Foo autoSynthProp]", scope: !5, file: !3, line: 5, type: !18, scopeLine: 5, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit) +!30 = !DISubprogram(name: "-[Foo setAutoSynthProp:]", scope: !5, file: !3, line: 5, type: !25, scopeLine: 5, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit) +!31 = !DISubprogram(name: "-[Foo 
setCustomGetterProp:]", scope: !5, file: !3, line: 7, type: !25, scopeLine: 7, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit) +!32 = !DISubprogram(name: "-[Foo customSetterProp]", scope: !5, file: !3, line: 8, type: !18, scopeLine: 8, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit) + From 1067205f39c06373e1fd043f6103997520fcc1c5 Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Wed, 29 Oct 2025 12:26:37 +0000 Subject: [PATCH 101/539] [mlir][linalg] Do not set insertion point inside padding function (#165420) Remove insertion point in rewriteAsPaddedOp. There is no guarantee that the sizes provided by the user are before the operation to pad. It's better to let the user handle where to insert the newly created operations, as long as they are after the original operation to pad. --- mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h | 3 +++ mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp | 2 ++ mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp | 4 ---- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index c89fc59c91830..d00183a1e16a1 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -653,6 +653,9 @@ struct PadTilingInterfaceResult { // interpreted as the bounding box (dynamic) value to pad to. /// * Use "options.paddingValues" to set the padding value of the created // tensor::PadOp. +// +// The transformation assumes that the insertion point is set after the +// operation to pad.
FailureOr rewriteAsPaddedOp(OpBuilder &, TilingInterface toPad, PadTilingInterfaceOptions options, diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 794dda96d1dfa..8b89244486339 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -2464,6 +2464,8 @@ transform::PadTilingInterfaceOp::apply(transform::TransformRewriter &rewriter, .setPaddingSizes(getMixedPaddingSizes()) .setPadToMultipleOf(getPadToMultipleOf()); + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPointAfter(targetOp); auto maybePadOps = rewriteAsPaddedOp( rewriter, cast(targetOp.getOperation()), options); if (failed(maybePadOps)) { diff --git a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp index 3e787a2ad0ef5..52ab92f180575 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp @@ -288,10 +288,6 @@ FailureOr linalg::rewriteAsPaddedOp( return failure(); } - OpBuilder::InsertionGuard g(builder); - // Set IP after toPad because we also take the dims of toPad's output. - builder.setInsertionPointAfter(toPad); - // 1. Get the loopUpperBounds from the TilingInterface. SmallVector iterationDomain = toPad.getIterationDomain(builder); From 61fc33846af1e62c60e2c0cbd606becddbc473cf Mon Sep 17 00:00:00 2001 From: Eugene Epshteyn Date: Wed, 29 Oct 2025 08:34:51 -0400 Subject: [PATCH 102/539] [flang] Implement IGNORE_TKR(P) (#165469) Implemented IGNORE_TKR(P), which allows ignoring pointer and allocatable matching (can pass an allocatable array to routine with pointer array argument and vice versa). Updated documentation. 
--- flang/docs/Directives.md | 48 +++++++++++----------- flang/include/flang/Support/Fortran.h | 3 +- flang/lib/Semantics/check-call.cpp | 6 ++- flang/lib/Semantics/check-declarations.cpp | 3 +- flang/lib/Semantics/mod-file.cpp | 3 ++ flang/lib/Semantics/resolve-names.cpp | 3 ++ flang/lib/Support/Fortran.cpp | 3 ++ flang/test/Semantics/ignore_tkr04.f90 | 26 ++++++++++++ 8 files changed, 68 insertions(+), 27 deletions(-) create mode 100644 flang/test/Semantics/ignore_tkr04.f90 diff --git a/flang/docs/Directives.md b/flang/docs/Directives.md index 3ebb08c486228..2f16a8d579f8b 100644 --- a/flang/docs/Directives.md +++ b/flang/docs/Directives.md @@ -1,9 +1,9 @@ - # Compiler directives supported by Flang @@ -12,16 +12,18 @@ A list of non-standard directives supported by Flang * `!dir$ fixed` and `!dir$ free` select Fortran source forms. Their effect persists to the end of the current source file. -* `!dir$ ignore_tkr [[(TKRDMAC)] dummy-arg-name]...` in an interface definition +* `!dir$ ignore_tkr [[(TKRDMACP)] dummy-arg-name]...` in an interface definition disables some semantic checks at call sites for the actual arguments that - correspond to some named dummy arguments (or all of them, by default). - The directive allow actual arguments that would otherwise be diagnosed - as incompatible in type (T), kind (K), rank (R), CUDA device (D), or - managed (M) status. The letter (A) is a shorthand for all of these, - and is the default when no letters appear. The letter (C) checks for - contiguity for example allowing an element of an assumed-shape array to be - passed as a dummy argument. For example, if one wanted to call a "set all - bytes to zero" utility that could be applied to arrays of any type or rank: + correspond to some named dummy arguments (or all of them, by default). The + directive allow actual arguments that would otherwise be diagnosed as + incompatible in type (T), kind (K), rank (R), CUDA device (D), or managed (M) + status. 
The letter (A) is a shorthand for (TKRDM), and is the default when no + letters appear. The letter (C) checks for contiguity, for example allowing an + element of an assumed-shape array to be passed as a dummy argument. The + letter (P) ignores pointer and allocatable matching, so that one can pass an + allocatable array to routine with pointer array argument and vice versa. For + example, if one wanted to call a "set all bytes to zero" utility that could + be applied to arrays of any type or rank: ``` interface subroutine clear(arr,bytes) @@ -46,27 +48,27 @@ A list of non-standard directives supported by Flang unroll the loop. Some compilers accept an optional `=` before the `n` when `n` is present in the directive. Flang does not. * `!dir$ unroll_and_jam [N]` control how many times a loop should be unrolled and - jammed. It must be placed immediately before a loop that follows. `N` is an optional - integer that specifying the unrolling factor. When `N` is `0` or `1`, the loop + jammed. It must be placed immediately before a loop that follows. `N` is an optional + integer that specifying the unrolling factor. When `N` is `0` or `1`, the loop should not be unrolled at all. If `N` is omitted the optimizer will selects the number of times to unroll the loop. * `!dir$ novector` disabling vectorization on the following loop. * `!dir$ nounroll` disabling unrolling on the following loop. * `!dir$ nounroll_and_jam` disabling unrolling and jamming on the following loop. 
-* `!dir$ inline` instructs the compiler to attempt to inline the called routines if the - directive is specified before a call statement or all call statements within the loop - body if specified before a DO LOOP or all function references if specified before an +* `!dir$ inline` instructs the compiler to attempt to inline the called routines if the + directive is specified before a call statement or all call statements within the loop + body if specified before a DO LOOP or all function references if specified before an assignment statement. -* `!dir$ forceinline` works in the same way as the `inline` directive, but it forces +* `!dir$ forceinline` works in the same way as the `inline` directive, but it forces inlining by the compiler on a function call statement. -* `!dir$ noinline` works in the same way as the `inline` directive, but prevents +* `!dir$ noinline` works in the same way as the `inline` directive, but prevents any attempt of inlining by the compiler on a function call statement. # Directive Details ## Introduction -Directives are commonly used in Fortran programs to specify additional actions -to be performed by the compiler. The directives are always specified with the +Directives are commonly used in Fortran programs to specify additional actions +to be performed by the compiler. The directives are always specified with the `!dir$` or `cdir$` prefix. ## Loop Directives @@ -97,7 +99,7 @@ check that that construct matches the expected construct for the directive. Skipping other intermediate directives allows multiple directives to appear on the same construct. -## Lowering +## Lowering Evaluation is extended with a new field called dirs for representing directives associated with that Evaluation. When lowering loop directives, the associated Do Loop's evaluation is found and the directive is added to it. This information @@ -109,7 +111,7 @@ about the loop. 
For example, the `llvm.loop.vectorize.enable` metadata informs the optimizer that a loop can be vectorized without considering its cost-model. This attribute is added to the loop condition branch. -### Representation in MLIR +### Representation in MLIR The MLIR LLVM dialect models this by an attribute called LoopAnnotation Attribute. The attribute can be added to the latch of the loop in the cf dialect and is then carried through lowering to the LLVM dialect. diff --git a/flang/include/flang/Support/Fortran.h b/flang/include/flang/Support/Fortran.h index ea0344ecb0830..cf39781c1e8a7 100644 --- a/flang/include/flang/Support/Fortran.h +++ b/flang/include/flang/Support/Fortran.h @@ -86,8 +86,9 @@ ENUM_CLASS(IgnoreTKR, Rank, // R - don't check ranks Device, // D - don't check host/device residence Managed, // M - don't check managed storage - Contiguous) // C - don't check for storage sequence association with a + Contiguous, // C - don't check for storage sequence association with a // potentially non-contiguous object + Pointer) // P - ignore pointer and allocatable matching using IgnoreTKRSet = EnumSet; // IGNORE_TKR(A) = IGNORE_TKR(TKRDM) static constexpr IgnoreTKRSet ignoreTKRAll{IgnoreTKR::Type, IgnoreTKR::Kind, diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp index c51d40b9e5039..995deaa12dd3b 100644 --- a/flang/lib/Semantics/check-call.cpp +++ b/flang/lib/Semantics/check-call.cpp @@ -914,7 +914,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy, dummyName); } // INTENT(OUT) and INTENT(IN OUT) cases are caught elsewhere - } else { + } else if (!actualIsAllocatable && + !dummy.ignoreTKR.test(common::IgnoreTKR::Pointer)) { messages.Say( "ALLOCATABLE %s must be associated with an ALLOCATABLE actual argument"_err_en_US, dummyName); @@ -929,7 +930,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy, dummy, actual, *scope, /*isAssumedRank=*/dummyIsAssumedRank, 
actualIsPointer); } - } else if (!actualIsPointer) { + } else if (!actualIsPointer && + !dummy.ignoreTKR.test(common::IgnoreTKR::Pointer)) { messages.Say( "Actual argument associated with POINTER %s must also be POINTER unless INTENT(IN)"_err_en_US, dummyName); diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index 549ee83b70fce..de407d3b1e125 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -949,7 +949,8 @@ void CheckHelper::CheckObjectEntity( "!DIR$ IGNORE_TKR(R) may not apply in an ELEMENTAL procedure"_err_en_US); } if (IsPassedViaDescriptor(symbol)) { - if (IsAllocatableOrObjectPointer(&symbol)) { + if (IsAllocatableOrObjectPointer(&symbol) && + !ignoreTKR.test(common::IgnoreTKR::Pointer)) { if (inExplicitExternalInterface) { Warn(common::UsageWarning::IgnoreTKRUsage, "!DIR$ IGNORE_TKR should not apply to an allocatable or pointer"_warn_en_US); diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp index 556259d1e5e63..b419864f73b8e 100644 --- a/flang/lib/Semantics/mod-file.cpp +++ b/flang/lib/Semantics/mod-file.cpp @@ -1021,6 +1021,9 @@ void ModFileWriter::PutObjectEntity( case common::IgnoreTKR::Contiguous: os << 'c'; break; + case common::IgnoreTKR::Pointer: + os << 'p'; + break; } }); os << ") " << symbol.name() << '\n'; diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 0e6d4c71b30de..f88af5fac0bbd 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -10109,6 +10109,9 @@ void ResolveNamesVisitor::Post(const parser::CompilerDirective &x) { case 'c': set.set(common::IgnoreTKR::Contiguous); break; + case 'p': + set.set(common::IgnoreTKR::Pointer); + break; case 'a': set = common::ignoreTKRAll; break; diff --git a/flang/lib/Support/Fortran.cpp b/flang/lib/Support/Fortran.cpp index 3a8ebbb7d61ef..05d6e0e709e91 100644 --- 
a/flang/lib/Support/Fortran.cpp +++ b/flang/lib/Support/Fortran.cpp @@ -95,6 +95,9 @@ std::string AsFortran(IgnoreTKRSet tkr) { if (tkr.test(IgnoreTKR::Contiguous)) { result += 'C'; } + if (tkr.test(IgnoreTKR::Pointer)) { + result += 'P'; + } return result; } diff --git a/flang/test/Semantics/ignore_tkr04.f90 b/flang/test/Semantics/ignore_tkr04.f90 new file mode 100644 index 0000000000000..8becc85857bb1 --- /dev/null +++ b/flang/test/Semantics/ignore_tkr04.f90 @@ -0,0 +1,26 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +! Tests for ignore_tkr(p) +module ignore_tkr_4_m +interface + subroutine s(a) + real, pointer :: a(:) +!dir$ ignore_tkr(p) a + end subroutine + subroutine s1(a) + real, allocatable :: a(:) +!dir$ ignore_tkr(p) a + end subroutine +end interface +end module +program t + use ignore_tkr_4_m + real, allocatable :: x(:) + real, pointer :: x1(:) + call s(x) +!CHECK-NOT: error +!CHECK-NOT: warning + call s1(x1) +!CHECK-NOT: error +!CHECK-NOT: warning +end + From aef2ec73c9e1da170646758ea9c37108074c427e Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Wed, 29 Oct 2025 13:08:40 +0000 Subject: [PATCH 103/539] [LLVM][IR] Emit diagnostic for invalid pointee type for constant GEP. 
(#165383) Fixes https://github.com/llvm/llvm-project/issues/165137 --- llvm/lib/AsmParser/LLParser.cpp | 3 +++ .../Assembler/constant-getelementptr-scalable_pointee.ll | 8 ++++++++ 2 files changed, 11 insertions(+) create mode 100644 llvm/test/Assembler/constant-getelementptr-scalable_pointee.ll diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index e7a04d98df2af..4cc47c0d0260e 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -4538,6 +4538,9 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { if (!Indices.empty() && !Ty->isSized(&Visited)) return error(ID.Loc, "base element of getelementptr must be sized"); + if (!ConstantExpr::isSupportedGetElementPtr(Ty)) + return error(ID.Loc, "invalid base element for constant getelementptr"); + if (!GetElementPtrInst::getIndexedType(Ty, Indices)) return error(ID.Loc, "invalid getelementptr indices"); diff --git a/llvm/test/Assembler/constant-getelementptr-scalable_pointee.ll b/llvm/test/Assembler/constant-getelementptr-scalable_pointee.ll new file mode 100644 index 0000000000000..d39039964b3b3 --- /dev/null +++ b/llvm/test/Assembler/constant-getelementptr-scalable_pointee.ll @@ -0,0 +1,8 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s +; Test the case of an invalid pointee type on a constant GEP + +; CHECK: invalid base element for constant getelementptr + +define ptr @test_scalable_vector_gep(ptr %a) { + ret ptr getelementptr (, ptr @a, i64 1) +} From e490a556bbb1551e7a11d66e2ca6cc1fe3e6b4d2 Mon Sep 17 00:00:00 2001 From: Lukacma Date: Wed, 29 Oct 2025 13:22:44 +0000 Subject: [PATCH 104/539] [AArch64][NEON] Add eor3 patterns for V64 xors (#165376) This patch enables NEON EOR3 instruction to be emitted even for 64 bit vectors. 
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 24 ++-- llvm/test/CodeGen/AArch64/eor3.ll | 151 ++++++++++++++++++++ 2 files changed, 167 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index b9e299ef37454..2871a20e28b65 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1805,14 +1805,22 @@ def : SHA3_pattern; def : SHA3_pattern; def : SHA3_pattern; -class EOR3_pattern - : Pat<(xor (xor (VecTy V128:$Vn), (VecTy V128:$Vm)), (VecTy V128:$Va)), - (EOR3 (VecTy V128:$Vn), (VecTy V128:$Vm), (VecTy V128:$Va))>; - -def : EOR3_pattern; -def : EOR3_pattern; -def : EOR3_pattern; -def : EOR3_pattern; +multiclass EOR3_pattern{ + def : Pat<(xor (xor (Vec128Ty V128:$Vn), (Vec128Ty V128:$Vm)), (Vec128Ty V128:$Va)), + (EOR3 (Vec128Ty V128:$Vn), (Vec128Ty V128:$Vm), (Vec128Ty V128:$Va))>; + def : Pat<(xor (xor (Vec64Ty V64:$Vn), (Vec64Ty V64:$Vm)), (Vec64Ty V64:$Va)), + (EXTRACT_SUBREG + (EOR3 + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vn, dsub), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vm, dsub), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Va, dsub)), + dsub)>; +} + +defm : EOR3_pattern; +defm : EOR3_pattern; +defm : EOR3_pattern; +defm : EOR3_pattern; class BCAX_pattern : Pat<(xor (VecTy V128:$Vn), (and (VecTy V128:$Vm), (vnot (VecTy V128:$Va)))), diff --git a/llvm/test/CodeGen/AArch64/eor3.ll b/llvm/test/CodeGen/AArch64/eor3.ll index eccd09131b525..594a73f70a7f9 100644 --- a/llvm/test/CodeGen/AArch64/eor3.ll +++ b/llvm/test/CodeGen/AArch64/eor3.ll @@ -277,3 +277,154 @@ define <2 x i64> @eor3_vnot(<2 x i64> %0, <2 x i64> %1) { ret <2 x i64> %4 } +define <1 x i64> @eor3_1x64(<1 x i64> %0, <1 x i64> %1, <1 x i64> %2) { +; SHA3-LABEL: eor3_1x64: +; SHA3: // %bb.0: +; SHA3-NEXT: // kill: def $d0 killed $d0 def $q0 +; SHA3-NEXT: // kill: def $d2 killed $d2 def $q2 +; SHA3-NEXT: // kill: def $d1 killed $d1 def $q1 +; SHA3-NEXT: eor3 v0.16b, v1.16b, v2.16b, 
v0.16b +; SHA3-NEXT: // kill: def $d0 killed $d0 killed $q0 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: eor3_1x64: +; NOSHA3: // %bb.0: +; NOSHA3-NEXT: eor v1.8b, v1.8b, v2.8b +; NOSHA3-NEXT: eor v0.8b, v1.8b, v0.8b +; NOSHA3-NEXT: ret +; +; SVE2-LABEL: eor3_1x64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1 +; SVE2-NEXT: // kill: def $d2 killed $d2 def $z2 +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: eor3 z1.d, z1.d, z2.d, z0.d +; SVE2-NEXT: fmov d0, d1 +; SVE2-NEXT: ret +; +; SHA3-SVE2-LABEL: eor3_1x64: +; SHA3-SVE2: // %bb.0: +; SHA3-SVE2-NEXT: // kill: def $d0 killed $d0 def $q0 +; SHA3-SVE2-NEXT: // kill: def $d2 killed $d2 def $q2 +; SHA3-SVE2-NEXT: // kill: def $d1 killed $d1 def $q1 +; SHA3-SVE2-NEXT: eor3 v0.16b, v1.16b, v2.16b, v0.16b +; SHA3-SVE2-NEXT: // kill: def $d0 killed $d0 killed $q0 +; SHA3-SVE2-NEXT: ret + %4 = xor <1 x i64> %1, %2 + %5 = xor <1 x i64> %4, %0 + ret <1 x i64> %5 +} + +define <2 x i32> @eor3_2x32(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2) { +; SHA3-LABEL: eor3_2x32: +; SHA3: // %bb.0: +; SHA3-NEXT: // kill: def $d0 killed $d0 def $q0 +; SHA3-NEXT: // kill: def $d2 killed $d2 def $q2 +; SHA3-NEXT: // kill: def $d1 killed $d1 def $q1 +; SHA3-NEXT: eor3 v0.16b, v1.16b, v2.16b, v0.16b +; SHA3-NEXT: // kill: def $d0 killed $d0 killed $q0 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: eor3_2x32: +; NOSHA3: // %bb.0: +; NOSHA3-NEXT: eor v1.8b, v1.8b, v2.8b +; NOSHA3-NEXT: eor v0.8b, v1.8b, v0.8b +; NOSHA3-NEXT: ret +; +; SVE2-LABEL: eor3_2x32: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1 +; SVE2-NEXT: // kill: def $d2 killed $d2 def $z2 +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: eor3 z1.d, z1.d, z2.d, z0.d +; SVE2-NEXT: fmov d0, d1 +; SVE2-NEXT: ret +; +; SHA3-SVE2-LABEL: eor3_2x32: +; SHA3-SVE2: // %bb.0: +; SHA3-SVE2-NEXT: // kill: def $d0 killed $d0 def $q0 +; SHA3-SVE2-NEXT: // kill: def $d2 killed $d2 def $q2 +; SHA3-SVE2-NEXT: // kill: def $d1 killed $d1 def 
$q1 +; SHA3-SVE2-NEXT: eor3 v0.16b, v1.16b, v2.16b, v0.16b +; SHA3-SVE2-NEXT: // kill: def $d0 killed $d0 killed $q0 +; SHA3-SVE2-NEXT: ret + %4 = xor <2 x i32> %1, %2 + %5 = xor <2 x i32> %4, %0 + ret <2 x i32> %5 +} + +define <4 x i16> @eor3_4x16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) { +; SHA3-LABEL: eor3_4x16: +; SHA3: // %bb.0: +; SHA3-NEXT: // kill: def $d0 killed $d0 def $q0 +; SHA3-NEXT: // kill: def $d2 killed $d2 def $q2 +; SHA3-NEXT: // kill: def $d1 killed $d1 def $q1 +; SHA3-NEXT: eor3 v0.16b, v1.16b, v2.16b, v0.16b +; SHA3-NEXT: // kill: def $d0 killed $d0 killed $q0 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: eor3_4x16: +; NOSHA3: // %bb.0: +; NOSHA3-NEXT: eor v1.8b, v1.8b, v2.8b +; NOSHA3-NEXT: eor v0.8b, v1.8b, v0.8b +; NOSHA3-NEXT: ret +; +; SVE2-LABEL: eor3_4x16: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1 +; SVE2-NEXT: // kill: def $d2 killed $d2 def $z2 +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: eor3 z1.d, z1.d, z2.d, z0.d +; SVE2-NEXT: fmov d0, d1 +; SVE2-NEXT: ret +; +; SHA3-SVE2-LABEL: eor3_4x16: +; SHA3-SVE2: // %bb.0: +; SHA3-SVE2-NEXT: // kill: def $d0 killed $d0 def $q0 +; SHA3-SVE2-NEXT: // kill: def $d2 killed $d2 def $q2 +; SHA3-SVE2-NEXT: // kill: def $d1 killed $d1 def $q1 +; SHA3-SVE2-NEXT: eor3 v0.16b, v1.16b, v2.16b, v0.16b +; SHA3-SVE2-NEXT: // kill: def $d0 killed $d0 killed $q0 +; SHA3-SVE2-NEXT: ret + %4 = xor <4 x i16> %1, %2 + %5 = xor <4 x i16> %4, %0 + ret <4 x i16> %5 +} + +define <8 x i8> @eor3_8x8(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2) { +; SHA3-LABEL: eor3_8x8: +; SHA3: // %bb.0: +; SHA3-NEXT: // kill: def $d0 killed $d0 def $q0 +; SHA3-NEXT: // kill: def $d2 killed $d2 def $q2 +; SHA3-NEXT: // kill: def $d1 killed $d1 def $q1 +; SHA3-NEXT: eor3 v0.16b, v1.16b, v2.16b, v0.16b +; SHA3-NEXT: // kill: def $d0 killed $d0 killed $q0 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: eor3_8x8: +; NOSHA3: // %bb.0: +; NOSHA3-NEXT: eor v1.8b, v1.8b, v2.8b +; NOSHA3-NEXT: eor v0.8b, v1.8b, v0.8b 
+; NOSHA3-NEXT: ret +; +; SVE2-LABEL: eor3_8x8: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1 +; SVE2-NEXT: // kill: def $d2 killed $d2 def $z2 +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: eor3 z1.d, z1.d, z2.d, z0.d +; SVE2-NEXT: fmov d0, d1 +; SVE2-NEXT: ret +; +; SHA3-SVE2-LABEL: eor3_8x8: +; SHA3-SVE2: // %bb.0: +; SHA3-SVE2-NEXT: // kill: def $d0 killed $d0 def $q0 +; SHA3-SVE2-NEXT: // kill: def $d2 killed $d2 def $q2 +; SHA3-SVE2-NEXT: // kill: def $d1 killed $d1 def $q1 +; SHA3-SVE2-NEXT: eor3 v0.16b, v1.16b, v2.16b, v0.16b +; SHA3-SVE2-NEXT: // kill: def $d0 killed $d0 killed $q0 +; SHA3-SVE2-NEXT: ret + %4 = xor <8 x i8> %1, %2 + %5 = xor <8 x i8> %4, %0 + ret <8 x i8> %5 +} From a7c34afb0b2f748779ead64329aac55ca4786b89 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 29 Oct 2025 13:42:59 +0000 Subject: [PATCH 105/539] [X86] atomic-load-store.ll - cleanup test check-prefix hierarchies to improve reuse and fix missing AVX2/AVX512 checks (#165552) -mcpu=x86-64 is still SSE codegen, and there were missing AVX2/AVX512 checks where the common CHECK-AVX prefix clashed Noticed while reviewing #148897 --- llvm/test/CodeGen/X86/atomic-load-store.ll | 504 +++++---------------- 1 file changed, 106 insertions(+), 398 deletions(-) diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 3e7b73a65fe07..1173c45b4bfd8 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O3 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O3 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O3 
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O3 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O0 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O0 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O0 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O0 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-AVX-O3,CHECK-AVX2-O3 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-AVX-O3,CHECK-AVX512-O3 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-AVX-O0,CHECK-AVX2-O0 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-AVX-O0,CHECK-AVX512-O0 define void @test1(ptr %ptr, i32 %val1) { ; CHECK-LABEL: test1: @@ -50,30 +50,10 @@ define <1 x i8> @atomic_vec1_i8(ptr %x) { ; CHECK-O3-NEXT: movzbl (%rdi), %eax ; CHECK-O3-NEXT: retq ; -; CHECK-SSE-O3-LABEL: atomic_vec1_i8: -; 
CHECK-SSE-O3: # %bb.0: -; CHECK-SSE-O3-NEXT: movzbl (%rdi), %eax -; CHECK-SSE-O3-NEXT: retq -; -; CHECK-AVX-O3-LABEL: atomic_vec1_i8: -; CHECK-AVX-O3: # %bb.0: -; CHECK-AVX-O3-NEXT: movzbl (%rdi), %eax -; CHECK-AVX-O3-NEXT: retq -; ; CHECK-O0-LABEL: atomic_vec1_i8: ; CHECK-O0: # %bb.0: ; CHECK-O0-NEXT: movb (%rdi), %al ; CHECK-O0-NEXT: retq -; -; CHECK-SSE-O0-LABEL: atomic_vec1_i8: -; CHECK-SSE-O0: # %bb.0: -; CHECK-SSE-O0-NEXT: movb (%rdi), %al -; CHECK-SSE-O0-NEXT: retq -; -; CHECK-AVX-O0-LABEL: atomic_vec1_i8: -; CHECK-AVX-O0: # %bb.0: -; CHECK-AVX-O0-NEXT: movb (%rdi), %al -; CHECK-AVX-O0-NEXT: retq %ret = load atomic <1 x i8>, ptr %x acquire, align 1 ret <1 x i8> %ret } @@ -84,30 +64,10 @@ define <1 x i16> @atomic_vec1_i16(ptr %x) { ; CHECK-O3-NEXT: movzwl (%rdi), %eax ; CHECK-O3-NEXT: retq ; -; CHECK-SSE-O3-LABEL: atomic_vec1_i16: -; CHECK-SSE-O3: # %bb.0: -; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax -; CHECK-SSE-O3-NEXT: retq -; -; CHECK-AVX-O3-LABEL: atomic_vec1_i16: -; CHECK-AVX-O3: # %bb.0: -; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax -; CHECK-AVX-O3-NEXT: retq -; ; CHECK-O0-LABEL: atomic_vec1_i16: ; CHECK-O0: # %bb.0: ; CHECK-O0-NEXT: movw (%rdi), %ax ; CHECK-O0-NEXT: retq -; -; CHECK-SSE-O0-LABEL: atomic_vec1_i16: -; CHECK-SSE-O0: # %bb.0: -; CHECK-SSE-O0-NEXT: movw (%rdi), %ax -; CHECK-SSE-O0-NEXT: retq -; -; CHECK-AVX-O0-LABEL: atomic_vec1_i16: -; CHECK-AVX-O0: # %bb.0: -; CHECK-AVX-O0-NEXT: movw (%rdi), %ax -; CHECK-AVX-O0-NEXT: retq %ret = load atomic <1 x i16>, ptr %x acquire, align 2 ret <1 x i16> %ret } @@ -119,35 +79,11 @@ define <1 x i32> @atomic_vec1_i8_zext(ptr %x) { ; CHECK-O3-NEXT: movzbl %al, %eax ; CHECK-O3-NEXT: retq ; -; CHECK-SSE-O3-LABEL: atomic_vec1_i8_zext: -; CHECK-SSE-O3: # %bb.0: -; CHECK-SSE-O3-NEXT: movzbl (%rdi), %eax -; CHECK-SSE-O3-NEXT: movzbl %al, %eax -; CHECK-SSE-O3-NEXT: retq -; -; CHECK-AVX-O3-LABEL: atomic_vec1_i8_zext: -; CHECK-AVX-O3: # %bb.0: -; CHECK-AVX-O3-NEXT: movzbl (%rdi), %eax -; CHECK-AVX-O3-NEXT: movzbl %al, 
%eax -; CHECK-AVX-O3-NEXT: retq -; ; CHECK-O0-LABEL: atomic_vec1_i8_zext: ; CHECK-O0: # %bb.0: ; CHECK-O0-NEXT: movb (%rdi), %al ; CHECK-O0-NEXT: movzbl %al, %eax ; CHECK-O0-NEXT: retq -; -; CHECK-SSE-O0-LABEL: atomic_vec1_i8_zext: -; CHECK-SSE-O0: # %bb.0: -; CHECK-SSE-O0-NEXT: movb (%rdi), %al -; CHECK-SSE-O0-NEXT: movzbl %al, %eax -; CHECK-SSE-O0-NEXT: retq -; -; CHECK-AVX-O0-LABEL: atomic_vec1_i8_zext: -; CHECK-AVX-O0: # %bb.0: -; CHECK-AVX-O0-NEXT: movb (%rdi), %al -; CHECK-AVX-O0-NEXT: movzbl %al, %eax -; CHECK-AVX-O0-NEXT: retq %ret = load atomic <1 x i8>, ptr %x acquire, align 1 %zret = zext <1 x i8> %ret to <1 x i32> ret <1 x i32> %zret @@ -160,35 +96,11 @@ define <1 x i64> @atomic_vec1_i16_sext(ptr %x) { ; CHECK-O3-NEXT: movswq %ax, %rax ; CHECK-O3-NEXT: retq ; -; CHECK-SSE-O3-LABEL: atomic_vec1_i16_sext: -; CHECK-SSE-O3: # %bb.0: -; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax -; CHECK-SSE-O3-NEXT: movswq %ax, %rax -; CHECK-SSE-O3-NEXT: retq -; -; CHECK-AVX-O3-LABEL: atomic_vec1_i16_sext: -; CHECK-AVX-O3: # %bb.0: -; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax -; CHECK-AVX-O3-NEXT: movswq %ax, %rax -; CHECK-AVX-O3-NEXT: retq -; ; CHECK-O0-LABEL: atomic_vec1_i16_sext: ; CHECK-O0: # %bb.0: ; CHECK-O0-NEXT: movw (%rdi), %ax ; CHECK-O0-NEXT: movswq %ax, %rax ; CHECK-O0-NEXT: retq -; -; CHECK-SSE-O0-LABEL: atomic_vec1_i16_sext: -; CHECK-SSE-O0: # %bb.0: -; CHECK-SSE-O0-NEXT: movw (%rdi), %ax -; CHECK-SSE-O0-NEXT: movswq %ax, %rax -; CHECK-SSE-O0-NEXT: retq -; -; CHECK-AVX-O0-LABEL: atomic_vec1_i16_sext: -; CHECK-AVX-O0: # %bb.0: -; CHECK-AVX-O0-NEXT: movw (%rdi), %ax -; CHECK-AVX-O0-NEXT: movswq %ax, %rax -; CHECK-AVX-O0-NEXT: retq %ret = load atomic <1 x i16>, ptr %x acquire, align 2 %sret = sext <1 x i16> %ret to <1 x i64> ret <1 x i64> %sret @@ -204,12 +116,6 @@ define <1 x ptr addrspace(270)> @atomic_vec1_ptr270(ptr %x) { } define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) { -; CHECK-O3-LABEL: atomic_vec1_bfloat: -; CHECK-O3: # %bb.0: -; CHECK-O3-NEXT: movzwl (%rdi), 
%eax -; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-O3-NEXT: retq -; ; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat: ; CHECK-SSE-O3: # %bb.0: ; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax @@ -222,15 +128,6 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) { ; CHECK-AVX-O3-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-AVX-O3-NEXT: retq ; -; CHECK-O0-LABEL: atomic_vec1_bfloat: -; CHECK-O0: # %bb.0: -; CHECK-O0-NEXT: movw (%rdi), %cx -; CHECK-O0-NEXT: # implicit-def: $eax -; CHECK-O0-NEXT: movw %cx, %ax -; CHECK-O0-NEXT: # implicit-def: $xmm0 -; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-O0-NEXT: retq -; ; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat: ; CHECK-SSE-O0: # %bb.0: ; CHECK-SSE-O0-NEXT: movw (%rdi), %cx @@ -283,30 +180,6 @@ define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind { ; CHECK-O3-NEXT: popq %rcx ; CHECK-O3-NEXT: retq ; -; CHECK-SSE-O3-LABEL: atomic_vec1_ptr: -; CHECK-SSE-O3: # %bb.0: -; CHECK-SSE-O3-NEXT: pushq %rax -; CHECK-SSE-O3-NEXT: movq %rdi, %rsi -; CHECK-SSE-O3-NEXT: movq %rsp, %rdx -; CHECK-SSE-O3-NEXT: movl $8, %edi -; CHECK-SSE-O3-NEXT: movl $2, %ecx -; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT -; CHECK-SSE-O3-NEXT: movq (%rsp), %rax -; CHECK-SSE-O3-NEXT: popq %rcx -; CHECK-SSE-O3-NEXT: retq -; -; CHECK-AVX-O3-LABEL: atomic_vec1_ptr: -; CHECK-AVX-O3: # %bb.0: -; CHECK-AVX-O3-NEXT: pushq %rax -; CHECK-AVX-O3-NEXT: movq %rdi, %rsi -; CHECK-AVX-O3-NEXT: movq %rsp, %rdx -; CHECK-AVX-O3-NEXT: movl $8, %edi -; CHECK-AVX-O3-NEXT: movl $2, %ecx -; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT -; CHECK-AVX-O3-NEXT: movq (%rsp), %rax -; CHECK-AVX-O3-NEXT: popq %rcx -; CHECK-AVX-O3-NEXT: retq -; ; CHECK-O0-LABEL: atomic_vec1_ptr: ; CHECK-O0: # %bb.0: ; CHECK-O0-NEXT: pushq %rax @@ -318,41 +191,11 @@ define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind { ; CHECK-O0-NEXT: movq (%rsp), %rax ; CHECK-O0-NEXT: popq %rcx ; CHECK-O0-NEXT: retq -; -; CHECK-SSE-O0-LABEL: atomic_vec1_ptr: -; CHECK-SSE-O0: # %bb.0: -; CHECK-SSE-O0-NEXT: pushq %rax -; CHECK-SSE-O0-NEXT: movq 
%rdi, %rsi -; CHECK-SSE-O0-NEXT: movl $8, %edi -; CHECK-SSE-O0-NEXT: movq %rsp, %rdx -; CHECK-SSE-O0-NEXT: movl $2, %ecx -; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT -; CHECK-SSE-O0-NEXT: movq (%rsp), %rax -; CHECK-SSE-O0-NEXT: popq %rcx -; CHECK-SSE-O0-NEXT: retq -; -; CHECK-AVX-O0-LABEL: atomic_vec1_ptr: -; CHECK-AVX-O0: # %bb.0: -; CHECK-AVX-O0-NEXT: pushq %rax -; CHECK-AVX-O0-NEXT: movq %rdi, %rsi -; CHECK-AVX-O0-NEXT: movl $8, %edi -; CHECK-AVX-O0-NEXT: movq %rsp, %rdx -; CHECK-AVX-O0-NEXT: movl $2, %ecx -; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT -; CHECK-AVX-O0-NEXT: movq (%rsp), %rax -; CHECK-AVX-O0-NEXT: popq %rcx -; CHECK-AVX-O0-NEXT: retq %ret = load atomic <1 x ptr>, ptr %x acquire, align 4 ret <1 x ptr> %ret } define <1 x half> @atomic_vec1_half(ptr %x) { -; CHECK-O3-LABEL: atomic_vec1_half: -; CHECK-O3: # %bb.0: -; CHECK-O3-NEXT: movzwl (%rdi), %eax -; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-O3-NEXT: retq -; ; CHECK-SSE-O3-LABEL: atomic_vec1_half: ; CHECK-SSE-O3: # %bb.0: ; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax @@ -365,15 +208,6 @@ define <1 x half> @atomic_vec1_half(ptr %x) { ; CHECK-AVX-O3-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; CHECK-AVX-O3-NEXT: retq ; -; CHECK-O0-LABEL: atomic_vec1_half: -; CHECK-O0: # %bb.0: -; CHECK-O0-NEXT: movw (%rdi), %cx -; CHECK-O0-NEXT: # implicit-def: $eax -; CHECK-O0-NEXT: movw %cx, %ax -; CHECK-O0-NEXT: # implicit-def: $xmm0 -; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-O0-NEXT: retq -; ; CHECK-SSE-O0-LABEL: atomic_vec1_half: ; CHECK-SSE-O0: # %bb.0: ; CHECK-SSE-O0-NEXT: movw (%rdi), %cx @@ -396,11 +230,6 @@ define <1 x half> @atomic_vec1_half(ptr %x) { } define <1 x float> @atomic_vec1_float(ptr %x) { -; CHECK-O3-LABEL: atomic_vec1_float: -; CHECK-O3: # %bb.0: -; CHECK-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-O3-NEXT: retq -; ; CHECK-SSE-O3-LABEL: atomic_vec1_float: ; CHECK-SSE-O3: # %bb.0: ; CHECK-SSE-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -411,11 +240,6 @@ define 
<1 x float> @atomic_vec1_float(ptr %x) { ; CHECK-AVX-O3-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-AVX-O3-NEXT: retq ; -; CHECK-O0-LABEL: atomic_vec1_float: -; CHECK-O0: # %bb.0: -; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-O0-NEXT: retq -; ; CHECK-SSE-O0-LABEL: atomic_vec1_float: ; CHECK-SSE-O0: # %bb.0: ; CHECK-SSE-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -430,11 +254,6 @@ define <1 x float> @atomic_vec1_float(ptr %x) { } define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind { -; CHECK-O3-LABEL: atomic_vec1_double_align: -; CHECK-O3: # %bb.0: -; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-O3-NEXT: retq -; ; CHECK-SSE-O3-LABEL: atomic_vec1_double_align: ; CHECK-SSE-O3: # %bb.0: ; CHECK-SSE-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero @@ -445,11 +264,6 @@ define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind { ; CHECK-AVX-O3-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-AVX-O3-NEXT: retq ; -; CHECK-O0-LABEL: atomic_vec1_double_align: -; CHECK-O0: # %bb.0: -; CHECK-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-O0-NEXT: retq -; ; CHECK-SSE-O0-LABEL: atomic_vec1_double_align: ; CHECK-SSE-O0: # %bb.0: ; CHECK-SSE-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero @@ -476,30 +290,6 @@ define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind { ; CHECK-O3-NEXT: popq %rcx ; CHECK-O3-NEXT: retq ; -; CHECK-SSE-O3-LABEL: atomic_vec1_i64: -; CHECK-SSE-O3: # %bb.0: -; CHECK-SSE-O3-NEXT: pushq %rax -; CHECK-SSE-O3-NEXT: movq %rdi, %rsi -; CHECK-SSE-O3-NEXT: movq %rsp, %rdx -; CHECK-SSE-O3-NEXT: movl $8, %edi -; CHECK-SSE-O3-NEXT: movl $2, %ecx -; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT -; CHECK-SSE-O3-NEXT: movq (%rsp), %rax -; CHECK-SSE-O3-NEXT: popq %rcx -; CHECK-SSE-O3-NEXT: retq -; -; CHECK-AVX-O3-LABEL: atomic_vec1_i64: -; CHECK-AVX-O3: # %bb.0: -; CHECK-AVX-O3-NEXT: pushq %rax -; CHECK-AVX-O3-NEXT: movq %rdi, %rsi -; CHECK-AVX-O3-NEXT: movq %rsp, %rdx -; CHECK-AVX-O3-NEXT: movl $8, %edi -; 
CHECK-AVX-O3-NEXT: movl $2, %ecx -; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT -; CHECK-AVX-O3-NEXT: movq (%rsp), %rax -; CHECK-AVX-O3-NEXT: popq %rcx -; CHECK-AVX-O3-NEXT: retq -; ; CHECK-O0-LABEL: atomic_vec1_i64: ; CHECK-O0: # %bb.0: ; CHECK-O0-NEXT: pushq %rax @@ -511,47 +301,11 @@ define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind { ; CHECK-O0-NEXT: movq (%rsp), %rax ; CHECK-O0-NEXT: popq %rcx ; CHECK-O0-NEXT: retq -; -; CHECK-SSE-O0-LABEL: atomic_vec1_i64: -; CHECK-SSE-O0: # %bb.0: -; CHECK-SSE-O0-NEXT: pushq %rax -; CHECK-SSE-O0-NEXT: movq %rdi, %rsi -; CHECK-SSE-O0-NEXT: movl $8, %edi -; CHECK-SSE-O0-NEXT: movq %rsp, %rdx -; CHECK-SSE-O0-NEXT: movl $2, %ecx -; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT -; CHECK-SSE-O0-NEXT: movq (%rsp), %rax -; CHECK-SSE-O0-NEXT: popq %rcx -; CHECK-SSE-O0-NEXT: retq -; -; CHECK-AVX-O0-LABEL: atomic_vec1_i64: -; CHECK-AVX-O0: # %bb.0: -; CHECK-AVX-O0-NEXT: pushq %rax -; CHECK-AVX-O0-NEXT: movq %rdi, %rsi -; CHECK-AVX-O0-NEXT: movl $8, %edi -; CHECK-AVX-O0-NEXT: movq %rsp, %rdx -; CHECK-AVX-O0-NEXT: movl $2, %ecx -; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT -; CHECK-AVX-O0-NEXT: movq (%rsp), %rax -; CHECK-AVX-O0-NEXT: popq %rcx -; CHECK-AVX-O0-NEXT: retq %ret = load atomic <1 x i64>, ptr %x acquire, align 4 ret <1 x i64> %ret } define <1 x double> @atomic_vec1_double(ptr %x) nounwind { -; CHECK-O3-LABEL: atomic_vec1_double: -; CHECK-O3: # %bb.0: -; CHECK-O3-NEXT: pushq %rax -; CHECK-O3-NEXT: movq %rdi, %rsi -; CHECK-O3-NEXT: movq %rsp, %rdx -; CHECK-O3-NEXT: movl $8, %edi -; CHECK-O3-NEXT: movl $2, %ecx -; CHECK-O3-NEXT: callq __atomic_load@PLT -; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-O3-NEXT: popq %rax -; CHECK-O3-NEXT: retq -; ; CHECK-SSE-O3-LABEL: atomic_vec1_double: ; CHECK-SSE-O3: # %bb.0: ; CHECK-SSE-O3-NEXT: pushq %rax @@ -576,18 +330,6 @@ define <1 x double> @atomic_vec1_double(ptr %x) nounwind { ; CHECK-AVX-O3-NEXT: popq %rax ; CHECK-AVX-O3-NEXT: retq ; -; CHECK-O0-LABEL: atomic_vec1_double: -; 
CHECK-O0: # %bb.0: -; CHECK-O0-NEXT: pushq %rax -; CHECK-O0-NEXT: movq %rdi, %rsi -; CHECK-O0-NEXT: movl $8, %edi -; CHECK-O0-NEXT: movq %rsp, %rdx -; CHECK-O0-NEXT: movl $2, %ecx -; CHECK-O0-NEXT: callq __atomic_load@PLT -; CHECK-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-O0-NEXT: popq %rax -; CHECK-O0-NEXT: retq -; ; CHECK-SSE-O0-LABEL: atomic_vec1_double: ; CHECK-SSE-O0: # %bb.0: ; CHECK-SSE-O0-NEXT: pushq %rax @@ -616,18 +358,6 @@ define <1 x double> @atomic_vec1_double(ptr %x) nounwind { } define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind { -; CHECK-O3-LABEL: atomic_vec2_i32: -; CHECK-O3: # %bb.0: -; CHECK-O3-NEXT: pushq %rax -; CHECK-O3-NEXT: movq %rdi, %rsi -; CHECK-O3-NEXT: movq %rsp, %rdx -; CHECK-O3-NEXT: movl $8, %edi -; CHECK-O3-NEXT: movl $2, %ecx -; CHECK-O3-NEXT: callq __atomic_load@PLT -; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-O3-NEXT: popq %rax -; CHECK-O3-NEXT: retq -; ; CHECK-SSE-O3-LABEL: atomic_vec2_i32: ; CHECK-SSE-O3: # %bb.0: ; CHECK-SSE-O3-NEXT: pushq %rax @@ -652,18 +382,6 @@ define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind { ; CHECK-AVX-O3-NEXT: popq %rax ; CHECK-AVX-O3-NEXT: retq ; -; CHECK-O0-LABEL: atomic_vec2_i32: -; CHECK-O0: # %bb.0: -; CHECK-O0-NEXT: pushq %rax -; CHECK-O0-NEXT: movq %rdi, %rsi -; CHECK-O0-NEXT: movl $8, %edi -; CHECK-O0-NEXT: movq %rsp, %rdx -; CHECK-O0-NEXT: movl $2, %ecx -; CHECK-O0-NEXT: callq __atomic_load@PLT -; CHECK-O0-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; CHECK-O0-NEXT: popq %rax -; CHECK-O0-NEXT: retq -; ; CHECK-SSE-O0-LABEL: atomic_vec2_i32: ; CHECK-SSE-O0: # %bb.0: ; CHECK-SSE-O0-NEXT: pushq %rax @@ -692,18 +410,6 @@ define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind { } define <4 x float> @atomic_vec4_float(ptr %x) nounwind { -; CHECK-O3-LABEL: atomic_vec4_float: -; CHECK-O3: # %bb.0: -; CHECK-O3-NEXT: subq $24, %rsp -; CHECK-O3-NEXT: movq %rdi, %rsi -; CHECK-O3-NEXT: movq %rsp, %rdx -; CHECK-O3-NEXT: movl $16, %edi -; CHECK-O3-NEXT: movl $2, %ecx -; CHECK-O3-NEXT: 
callq __atomic_load@PLT -; CHECK-O3-NEXT: movaps (%rsp), %xmm0 -; CHECK-O3-NEXT: addq $24, %rsp -; CHECK-O3-NEXT: retq -; ; CHECK-SSE-O3-LABEL: atomic_vec4_float: ; CHECK-SSE-O3: # %bb.0: ; CHECK-SSE-O3-NEXT: subq $24, %rsp @@ -728,18 +434,6 @@ define <4 x float> @atomic_vec4_float(ptr %x) nounwind { ; CHECK-AVX-O3-NEXT: addq $24, %rsp ; CHECK-AVX-O3-NEXT: retq ; -; CHECK-O0-LABEL: atomic_vec4_float: -; CHECK-O0: # %bb.0: -; CHECK-O0-NEXT: subq $24, %rsp -; CHECK-O0-NEXT: movq %rdi, %rsi -; CHECK-O0-NEXT: movl $16, %edi -; CHECK-O0-NEXT: movq %rsp, %rdx -; CHECK-O0-NEXT: movl $2, %ecx -; CHECK-O0-NEXT: callq __atomic_load@PLT -; CHECK-O0-NEXT: movaps (%rsp), %xmm0 -; CHECK-O0-NEXT: addq $24, %rsp -; CHECK-O0-NEXT: retq -; ; CHECK-SSE-O0-LABEL: atomic_vec4_float: ; CHECK-SSE-O0: # %bb.0: ; CHECK-SSE-O0-NEXT: subq $24, %rsp @@ -768,21 +462,6 @@ define <4 x float> @atomic_vec4_float(ptr %x) nounwind { } define <8 x double> @atomic_vec8_double(ptr %x) nounwind { -; CHECK-O3-LABEL: atomic_vec8_double: -; CHECK-O3: # %bb.0: -; CHECK-O3-NEXT: subq $72, %rsp -; CHECK-O3-NEXT: movq %rdi, %rsi -; CHECK-O3-NEXT: movq %rsp, %rdx -; CHECK-O3-NEXT: movl $64, %edi -; CHECK-O3-NEXT: movl $2, %ecx -; CHECK-O3-NEXT: callq __atomic_load@PLT -; CHECK-O3-NEXT: movaps (%rsp), %xmm0 -; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 -; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 -; CHECK-O3-NEXT: addq $72, %rsp -; CHECK-O3-NEXT: retq -; ; CHECK-SSE-O3-LABEL: atomic_vec8_double: ; CHECK-SSE-O3: # %bb.0: ; CHECK-SSE-O3-NEXT: subq $72, %rsp @@ -798,20 +477,30 @@ define <8 x double> @atomic_vec8_double(ptr %x) nounwind { ; CHECK-SSE-O3-NEXT: addq $72, %rsp ; CHECK-SSE-O3-NEXT: retq ; -; CHECK-O0-LABEL: atomic_vec8_double: -; CHECK-O0: # %bb.0: -; CHECK-O0-NEXT: subq $72, %rsp -; CHECK-O0-NEXT: movq %rdi, %rsi -; CHECK-O0-NEXT: movl $64, %edi -; CHECK-O0-NEXT: movq %rsp, %rdx -; CHECK-O0-NEXT: movl $2, %ecx -; CHECK-O0-NEXT: callq 
__atomic_load@PLT -; CHECK-O0-NEXT: movapd (%rsp), %xmm0 -; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; CHECK-O0-NEXT: addq $72, %rsp -; CHECK-O0-NEXT: retq +; CHECK-AVX2-O3-LABEL: atomic_vec8_double: +; CHECK-AVX2-O3: # %bb.0: +; CHECK-AVX2-O3-NEXT: subq $72, %rsp +; CHECK-AVX2-O3-NEXT: movq %rdi, %rsi +; CHECK-AVX2-O3-NEXT: movq %rsp, %rdx +; CHECK-AVX2-O3-NEXT: movl $64, %edi +; CHECK-AVX2-O3-NEXT: movl $2, %ecx +; CHECK-AVX2-O3-NEXT: callq __atomic_load@PLT +; CHECK-AVX2-O3-NEXT: vmovups (%rsp), %ymm0 +; CHECK-AVX2-O3-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 +; CHECK-AVX2-O3-NEXT: addq $72, %rsp +; CHECK-AVX2-O3-NEXT: retq +; +; CHECK-AVX512-O3-LABEL: atomic_vec8_double: +; CHECK-AVX512-O3: # %bb.0: +; CHECK-AVX512-O3-NEXT: subq $72, %rsp +; CHECK-AVX512-O3-NEXT: movq %rdi, %rsi +; CHECK-AVX512-O3-NEXT: movq %rsp, %rdx +; CHECK-AVX512-O3-NEXT: movl $64, %edi +; CHECK-AVX512-O3-NEXT: movl $2, %ecx +; CHECK-AVX512-O3-NEXT: callq __atomic_load@PLT +; CHECK-AVX512-O3-NEXT: vmovups (%rsp), %zmm0 +; CHECK-AVX512-O3-NEXT: addq $72, %rsp +; CHECK-AVX512-O3-NEXT: retq ; ; CHECK-SSE-O0-LABEL: atomic_vec8_double: ; CHECK-SSE-O0: # %bb.0: @@ -827,24 +516,36 @@ define <8 x double> @atomic_vec8_double(ptr %x) nounwind { ; CHECK-SSE-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 ; CHECK-SSE-O0-NEXT: addq $72, %rsp ; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX2-O0-LABEL: atomic_vec8_double: +; CHECK-AVX2-O0: # %bb.0: +; CHECK-AVX2-O0-NEXT: subq $72, %rsp +; CHECK-AVX2-O0-NEXT: movq %rdi, %rsi +; CHECK-AVX2-O0-NEXT: movl $64, %edi +; CHECK-AVX2-O0-NEXT: movq %rsp, %rdx +; CHECK-AVX2-O0-NEXT: movl $2, %ecx +; CHECK-AVX2-O0-NEXT: callq __atomic_load@PLT +; CHECK-AVX2-O0-NEXT: vmovupd (%rsp), %ymm0 +; CHECK-AVX2-O0-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm1 +; CHECK-AVX2-O0-NEXT: addq $72, %rsp +; CHECK-AVX2-O0-NEXT: retq +; +; CHECK-AVX512-O0-LABEL: atomic_vec8_double: +; CHECK-AVX512-O0: # %bb.0: 
+; CHECK-AVX512-O0-NEXT: subq $72, %rsp +; CHECK-AVX512-O0-NEXT: movq %rdi, %rsi +; CHECK-AVX512-O0-NEXT: movl $64, %edi +; CHECK-AVX512-O0-NEXT: movq %rsp, %rdx +; CHECK-AVX512-O0-NEXT: movl $2, %ecx +; CHECK-AVX512-O0-NEXT: callq __atomic_load@PLT +; CHECK-AVX512-O0-NEXT: vmovupd (%rsp), %zmm0 +; CHECK-AVX512-O0-NEXT: addq $72, %rsp +; CHECK-AVX512-O0-NEXT: retq %ret = load atomic <8 x double>, ptr %x acquire, align 4 ret <8 x double> %ret } define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind { -; CHECK-O3-LABEL: atomic_vec16_bfloat: -; CHECK-O3: # %bb.0: -; CHECK-O3-NEXT: subq $40, %rsp -; CHECK-O3-NEXT: movq %rdi, %rsi -; CHECK-O3-NEXT: movq %rsp, %rdx -; CHECK-O3-NEXT: movl $32, %edi -; CHECK-O3-NEXT: movl $2, %ecx -; CHECK-O3-NEXT: callq __atomic_load@PLT -; CHECK-O3-NEXT: movaps (%rsp), %xmm0 -; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; CHECK-O3-NEXT: addq $40, %rsp -; CHECK-O3-NEXT: retq -; ; CHECK-SSE-O3-LABEL: atomic_vec16_bfloat: ; CHECK-SSE-O3: # %bb.0: ; CHECK-SSE-O3-NEXT: subq $40, %rsp @@ -870,19 +571,6 @@ define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind { ; CHECK-AVX-O3-NEXT: addq $40, %rsp ; CHECK-AVX-O3-NEXT: retq ; -; CHECK-O0-LABEL: atomic_vec16_bfloat: -; CHECK-O0: # %bb.0: -; CHECK-O0-NEXT: subq $40, %rsp -; CHECK-O0-NEXT: movq %rdi, %rsi -; CHECK-O0-NEXT: movl $32, %edi -; CHECK-O0-NEXT: movq %rsp, %rdx -; CHECK-O0-NEXT: movl $2, %ecx -; CHECK-O0-NEXT: callq __atomic_load@PLT -; CHECK-O0-NEXT: movaps (%rsp), %xmm0 -; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; CHECK-O0-NEXT: addq $40, %rsp -; CHECK-O0-NEXT: retq -; ; CHECK-SSE-O0-LABEL: atomic_vec16_bfloat: ; CHECK-SSE-O0: # %bb.0: ; CHECK-SSE-O0-NEXT: subq $40, %rsp @@ -912,21 +600,6 @@ define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind { } define <32 x half> @atomic_vec32_half(ptr %x) nounwind { -; CHECK-O3-LABEL: atomic_vec32_half: -; CHECK-O3: # %bb.0: -; CHECK-O3-NEXT: subq $72, %rsp -; CHECK-O3-NEXT: movq %rdi, %rsi -; CHECK-O3-NEXT: movq %rsp, %rdx 
-; CHECK-O3-NEXT: movl $64, %edi -; CHECK-O3-NEXT: movl $2, %ecx -; CHECK-O3-NEXT: callq __atomic_load@PLT -; CHECK-O3-NEXT: movaps (%rsp), %xmm0 -; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 -; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 -; CHECK-O3-NEXT: addq $72, %rsp -; CHECK-O3-NEXT: retq -; ; CHECK-SSE-O3-LABEL: atomic_vec32_half: ; CHECK-SSE-O3: # %bb.0: ; CHECK-SSE-O3-NEXT: subq $72, %rsp @@ -942,20 +615,30 @@ define <32 x half> @atomic_vec32_half(ptr %x) nounwind { ; CHECK-SSE-O3-NEXT: addq $72, %rsp ; CHECK-SSE-O3-NEXT: retq ; -; CHECK-O0-LABEL: atomic_vec32_half: -; CHECK-O0: # %bb.0: -; CHECK-O0-NEXT: subq $72, %rsp -; CHECK-O0-NEXT: movq %rdi, %rsi -; CHECK-O0-NEXT: movl $64, %edi -; CHECK-O0-NEXT: movq %rsp, %rdx -; CHECK-O0-NEXT: movl $2, %ecx -; CHECK-O0-NEXT: callq __atomic_load@PLT -; CHECK-O0-NEXT: movaps (%rsp), %xmm0 -; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 -; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 -; CHECK-O0-NEXT: addq $72, %rsp -; CHECK-O0-NEXT: retq +; CHECK-AVX2-O3-LABEL: atomic_vec32_half: +; CHECK-AVX2-O3: # %bb.0: +; CHECK-AVX2-O3-NEXT: subq $72, %rsp +; CHECK-AVX2-O3-NEXT: movq %rdi, %rsi +; CHECK-AVX2-O3-NEXT: movq %rsp, %rdx +; CHECK-AVX2-O3-NEXT: movl $64, %edi +; CHECK-AVX2-O3-NEXT: movl $2, %ecx +; CHECK-AVX2-O3-NEXT: callq __atomic_load@PLT +; CHECK-AVX2-O3-NEXT: vmovups (%rsp), %ymm0 +; CHECK-AVX2-O3-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 +; CHECK-AVX2-O3-NEXT: addq $72, %rsp +; CHECK-AVX2-O3-NEXT: retq +; +; CHECK-AVX512-O3-LABEL: atomic_vec32_half: +; CHECK-AVX512-O3: # %bb.0: +; CHECK-AVX512-O3-NEXT: subq $72, %rsp +; CHECK-AVX512-O3-NEXT: movq %rdi, %rsi +; CHECK-AVX512-O3-NEXT: movq %rsp, %rdx +; CHECK-AVX512-O3-NEXT: movl $64, %edi +; CHECK-AVX512-O3-NEXT: movl $2, %ecx +; CHECK-AVX512-O3-NEXT: callq __atomic_load@PLT +; CHECK-AVX512-O3-NEXT: vmovups (%rsp), %zmm0 +; CHECK-AVX512-O3-NEXT: addq $72, %rsp +; 
CHECK-AVX512-O3-NEXT: retq ; ; CHECK-SSE-O0-LABEL: atomic_vec32_half: ; CHECK-SSE-O0: # %bb.0: @@ -971,6 +654,31 @@ define <32 x half> @atomic_vec32_half(ptr %x) nounwind { ; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 ; CHECK-SSE-O0-NEXT: addq $72, %rsp ; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX2-O0-LABEL: atomic_vec32_half: +; CHECK-AVX2-O0: # %bb.0: +; CHECK-AVX2-O0-NEXT: subq $72, %rsp +; CHECK-AVX2-O0-NEXT: movq %rdi, %rsi +; CHECK-AVX2-O0-NEXT: movl $64, %edi +; CHECK-AVX2-O0-NEXT: movq %rsp, %rdx +; CHECK-AVX2-O0-NEXT: movl $2, %ecx +; CHECK-AVX2-O0-NEXT: callq __atomic_load@PLT +; CHECK-AVX2-O0-NEXT: vmovups (%rsp), %ymm0 +; CHECK-AVX2-O0-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 +; CHECK-AVX2-O0-NEXT: addq $72, %rsp +; CHECK-AVX2-O0-NEXT: retq +; +; CHECK-AVX512-O0-LABEL: atomic_vec32_half: +; CHECK-AVX512-O0: # %bb.0: +; CHECK-AVX512-O0-NEXT: subq $72, %rsp +; CHECK-AVX512-O0-NEXT: movq %rdi, %rsi +; CHECK-AVX512-O0-NEXT: movl $64, %edi +; CHECK-AVX512-O0-NEXT: movq %rsp, %rdx +; CHECK-AVX512-O0-NEXT: movl $2, %ecx +; CHECK-AVX512-O0-NEXT: callq __atomic_load@PLT +; CHECK-AVX512-O0-NEXT: vmovups (%rsp), %zmm0 +; CHECK-AVX512-O0-NEXT: addq $72, %rsp +; CHECK-AVX512-O0-NEXT: retq %ret = load atomic <32 x half>, ptr %x acquire, align 4 ret <32 x half> %ret } From dab19f47c8c3d9aa618c3353e5e08c850f7fb7da Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 29 Oct 2025 13:46:30 +0000 Subject: [PATCH 106/539] [X86] and-mask-variable.ll - remove unnecessary "+fast-bextr" attribute from tests (#165553) Unnecessary copy+paste inclusion from some other BMI tests --- llvm/test/CodeGen/X86/and-mask-variable.ll | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/X86/and-mask-variable.ll b/llvm/test/CodeGen/X86/and-mask-variable.ll index d89f0db6a0c5b..3e5bd6952147c 100644 --- a/llvm/test/CodeGen/X86/and-mask-variable.ll +++ b/llvm/test/CodeGen/X86/and-mask-variable.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X86-NOBMI -; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X86-BMI2 -; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X86-BMI2 -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X64-NOBMI -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X64-BMI2 -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X64-BMI2 +; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2 < %s | FileCheck %s --check-prefixes=X86-NOBMI +; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2 < %s | FileCheck %s --check-prefixes=X86-BMI2 +; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2 < %s | FileCheck %s --check-prefixes=X86-BMI2 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2 < %s | FileCheck %s --check-prefixes=X64-NOBMI +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2 < %s | FileCheck %s --check-prefixes=X64-BMI2 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2 < %s | FileCheck %s --check-prefixes=X64-BMI2 define i32 @mask_pair(i32 %x, i32 %y) nounwind { ; X86-NOBMI-LABEL: mask_pair: From 244773015a24e794e0cfca58ac009d0755213bf6 Mon Sep 17 00:00:00 2001 From: Krish Gupta Date: Wed, 29 Oct 2025 19:25:52 +0530 Subject: [PATCH 107/539] [OpenMP][Flang] Fix atomic operations on complex types (#165366) Fixes https://github.com/llvm/llvm-project/issues/165184 In OMPIRBuilder::createAtomicRead() and createAtomicWrite(), the size parameter for __atomic_load/__atomic_store was incorrectly computed from the 
pointer type instead of the pointee (element) type. On 64-bit systems, this resulted in only 8 bytes being transferred regardless of the actual struct size. Changed both functions to use XElemTy (element type) instead of the pointer type when computing LoadSize. This ensures the full struct is transferred. --- .../test/Lower/OpenMP/atomic-read-complex.f90 | 34 ++++++++ .../Lower/OpenMP/atomic-write-complex.f90 | 34 ++++++++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 15 ++-- .../Frontend/OpenMPIRBuilderTest.cpp | 82 ++++++++++++++++++- 4 files changed, 155 insertions(+), 10 deletions(-) create mode 100644 flang/test/Lower/OpenMP/atomic-read-complex.f90 create mode 100644 flang/test/Lower/OpenMP/atomic-write-complex.f90 diff --git a/flang/test/Lower/OpenMP/atomic-read-complex.f90 b/flang/test/Lower/OpenMP/atomic-read-complex.f90 new file mode 100644 index 0000000000000..2f51f03820926 --- /dev/null +++ b/flang/test/Lower/OpenMP/atomic-read-complex.f90 @@ -0,0 +1,34 @@ +! Test lowering of atomic read to LLVM IR for complex types. +! This is a regression test for issue #165184. + +! RUN: %flang_fc1 -emit-llvm -fopenmp -o - %s | FileCheck %s + +! Test that atomic read operations with complex types emit the correct +! size parameter to __atomic_load: +! - complex(4) (8 bytes total): should call __atomic_load(i64 8, ...) +! - complex(8) (16 bytes total): should call __atomic_load(i64 16, ...) + +program atomic_read_complex + implicit none + + ! Test complex(4) - single precision (8 bytes) + complex(4) :: c41, c42 + ! Test complex(8) - double precision (16 bytes) + complex(8) :: c81, c82 + + c42 = (1.0_4, 1.0_4) + c82 = (1.0_8, 1.0_8) + + ! CHECK-LABEL: define {{.*}} @_QQmain + + ! Single precision complex: 8 bytes + ! CHECK: call void @__atomic_load(i64 8, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}) +!$omp atomic read + c41 = c42 + + ! Double precision complex: 16 bytes (this was broken before the fix) + ! 
CHECK: call void @__atomic_load(i64 16, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}) +!$omp atomic read + c81 = c82 + +end program atomic_read_complex diff --git a/flang/test/Lower/OpenMP/atomic-write-complex.f90 b/flang/test/Lower/OpenMP/atomic-write-complex.f90 new file mode 100644 index 0000000000000..48cfe26ca5a49 --- /dev/null +++ b/flang/test/Lower/OpenMP/atomic-write-complex.f90 @@ -0,0 +1,34 @@ +! Test lowering of atomic write to LLVM IR for complex types. +! This is a regression test for issue #165184. + +! RUN: %flang_fc1 -emit-llvm -fopenmp -o - %s | FileCheck %s + +! Test that atomic write operations with complex types emit the correct +! size parameter to __atomic_store: +! - complex(4) (8 bytes total): should call __atomic_store(i64 8, ...) +! - complex(8) (16 bytes total): should call __atomic_store(i64 16, ...) + +program atomic_write_complex + implicit none + + ! Test complex(4) - single precision (8 bytes) + complex(4) :: c41, c42 + ! Test complex(8) - double precision (16 bytes) + complex(8) :: c81, c82 + + c42 = (1.0_4, 1.0_4) + c82 = (1.0_8, 1.0_8) + + ! CHECK-LABEL: define {{.*}} @_QQmain + + ! Single precision complex: 8 bytes + ! CHECK: call void @__atomic_store(i64 8, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}) +!$omp atomic write + c41 = c42 + + ! Double precision complex: 16 bytes (this was broken before the fix) + ! CHECK: call void @__atomic_store(i64 16, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}) +!$omp atomic write + c81 = c82 + +end program atomic_write_complex diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 286ed039b1214..0e5926ff0fb18 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -5473,7 +5473,8 @@ OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef Loops, } // TODO: Enable UndefinedSanitizer to diagnose an overflow here. 
- CollapsedTripCount = Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount); + CollapsedTripCount = + Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount); } // Create the collapsed loop control flow. @@ -9338,9 +9339,8 @@ OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc, // target does not support `atomicrmw` of the size of the struct LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read"); OldVal->setAtomic(AO); - const DataLayout &LoadDL = OldVal->getModule()->getDataLayout(); - unsigned LoadSize = - LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType()); + const DataLayout &DL = OldVal->getModule()->getDataLayout(); + unsigned LoadSize = DL.getTypeStoreSize(XElemTy); OpenMPIRBuilder::AtomicInfo atomicInfo( &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(), OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var); @@ -9384,9 +9384,8 @@ OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc, XSt->setAtomic(AO); } else if (XElemTy->isStructTy()) { LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read"); - const DataLayout &LoadDL = OldVal->getModule()->getDataLayout(); - unsigned LoadSize = - LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType()); + const DataLayout &DL = OldVal->getModule()->getDataLayout(); + unsigned LoadSize = DL.getTypeStoreSize(XElemTy); OpenMPIRBuilder::AtomicInfo atomicInfo( &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(), OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var); @@ -9581,7 +9580,7 @@ Expected> OpenMPIRBuilder::emitAtomicUpdate( OldVal->setAtomic(AO); // CurBB // | /---\ - // ContBB | + // ContBB | // | \---/ // ExitBB BasicBlock *CurBB = Builder.GetInsertBlock(); diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index e56872320b4ac..0b3ae643e1494 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ 
b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -4534,6 +4534,85 @@ TEST_F(OpenMPIRBuilderTest, OMPAtomicCompareCapture) { EXPECT_FALSE(verifyModule(*M, &errs())); } +TEST_F(OpenMPIRBuilderTest, OMPAtomicRWStructType) { + // Test for issue #165184: atomic read/write on struct types should use + // element type size, not pointer size. + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.initialize(); + F->setName("func"); + IRBuilder<> Builder(BB); + + OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL}); + BasicBlock *EntryBB = BB; + OpenMPIRBuilder::InsertPointTy AllocaIP(EntryBB, + EntryBB->getFirstInsertionPt()); + + LLVMContext &Ctx = M->getContext(); + + // Create a struct type {double, double} to simulate complex(8) - 16 bytes + StructType *Complex8Ty = StructType::create( + Ctx, {Type::getDoubleTy(Ctx), Type::getDoubleTy(Ctx)}, "complex"); + + AllocaInst *XVal = Builder.CreateAlloca(Complex8Ty); + XVal->setName("AtomicVar"); + OpenMPIRBuilder::AtomicOpValue X = {XVal, Complex8Ty, false, false}; + AtomicOrdering AO = AtomicOrdering::SequentiallyConsistent; + + // Create value to write: {1.0, 1.0} + Constant *Real = ConstantFP::get(Type::getDoubleTy(Ctx), 1.0); + Constant *Imag = ConstantFP::get(Type::getDoubleTy(Ctx), 1.0); + Constant *ValToWrite = ConstantStruct::get(Complex8Ty, {Real, Imag}); + + // Test atomic write + Builder.restoreIP( + OMPBuilder.createAtomicWrite(Loc, X, ValToWrite, AO, AllocaIP)); + + // Test atomic read + AllocaInst *VVal = Builder.CreateAlloca(Complex8Ty); + VVal->setName("ReadDest"); + OpenMPIRBuilder::AtomicOpValue V = {VVal, Complex8Ty, false, false}; + + Builder.restoreIP(OMPBuilder.createAtomicRead(Loc, X, V, AO, AllocaIP)); + + Builder.CreateRetVoid(); + OMPBuilder.finalize(); + EXPECT_FALSE(verifyModule(*M, &errs())); + + // Verify that __atomic_store and __atomic_load are called with size 16 + bool FoundAtomicStore = false; + bool FoundAtomicLoad = false; + + for (Function &Fn : *M) { + if 
(Fn.getName().starts_with("__atomic_store")) { + // Check that first call to __atomic_store has size argument = 16 + for (User *U : Fn.users()) { + if (auto *CB = dyn_cast(U)) { + if (auto *SizeArg = dyn_cast(CB->getArgOperand(0))) { + EXPECT_EQ(SizeArg->getZExtValue(), 16U); + FoundAtomicStore = true; + break; + } + } + } + } + if (Fn.getName().starts_with("__atomic_load")) { + // Check that first call to __atomic_load has size argument = 16 + for (User *U : Fn.users()) { + if (auto *CB = dyn_cast(U)) { + if (auto *SizeArg = dyn_cast(CB->getArgOperand(0))) { + EXPECT_EQ(SizeArg->getZExtValue(), 16U); + FoundAtomicLoad = true; + break; + } + } + } + } + } + + EXPECT_TRUE(FoundAtomicStore) << "Did not find __atomic_store call"; + EXPECT_TRUE(FoundAtomicLoad) << "Did not find __atomic_load call"; +} + TEST_F(OpenMPIRBuilderTest, CreateTeams) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); @@ -7576,8 +7655,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroup) { // Checking the general structure of the IR generated is same as expected. Instruction *GeneratedStoreInst = TaskgroupCall->getNextNode(); EXPECT_EQ(GeneratedStoreInst, InternalStoreInst); - Instruction *GeneratedLoad32 = - GeneratedStoreInst->getNextNode(); + Instruction *GeneratedLoad32 = GeneratedStoreInst->getNextNode(); EXPECT_EQ(GeneratedLoad32, InternalLoad32); Instruction *GeneratedLoad128 = GeneratedLoad32->getNextNode(); EXPECT_EQ(GeneratedLoad128, InternalLoad128); From 4834c6b17bc45fd9282e051d2c5dd22c80fbdab9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 29 Oct 2025 07:35:57 -0700 Subject: [PATCH 108/539] [X86] Remove a redundant cast (NFC) (#165509) ShiftAmt is already of type int. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5785440a20e43..89b42da9a40f0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -12213,7 +12213,7 @@ static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale); ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8) : MVT::getVectorVT(ShiftSVT, Size / Scale); - return (int)ShiftAmt; + return ShiftAmt; }; // SSE/AVX supports logical shifts up to 64-bit integers - so we can just From 166eb94779d6cf2bad4bda2a414cc1823bdd464e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 29 Oct 2025 07:36:05 -0700 Subject: [PATCH 109/539] [Instrumentation] Remove a redundant control flow statement (NFC) (#165510) --- llvm/lib/Transforms/Instrumentation/MemProfUse.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp index a6ec6c1207767..2f256dfd7b0e2 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp @@ -216,7 +216,6 @@ static void HandleUnsupportedAnnotationKinds(GlobalVariable &GVar, } LLVM_DEBUG(dbgs() << "Skip annotation for " << GVar.getName() << " due to " << Reason << ".\n"); - return; } struct AllocMatchInfo { From 8abf3c24a377e11c534998246fdf54eb6f23e5ad Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Wed, 29 Oct 2025 11:02:28 -0400 Subject: [PATCH 110/539] use Twine instead of char* for function args (#165569) Changed the function arguments to take `const Twine&` instead of `const char*`. This will avoid converting StringRef's to C strings too soon (or ever). 
--- clang/lib/Basic/SourceManager.cpp | 3 +-- llvm/include/llvm/Support/AutoConvert.h | 7 ++++--- llvm/lib/Support/AutoConvert.cpp | 6 +++--- llvm/lib/Support/MemoryBuffer.cpp | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index d8ec837f0f7b9..938c6485125ee 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -608,8 +608,7 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename, return FileID::get(LoadedID); } unsigned FileSize = File.getSize(); - llvm::ErrorOr NeedConversion = - llvm::needConversion(Filename.str().c_str()); + llvm::ErrorOr NeedConversion = llvm::needConversion(Filename); if (NeedConversion && *NeedConversion) { // Buffer size may increase due to potential z/OS EBCDIC to UTF-8 // conversion. diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h index 1e6792636e169..15f1ec8af6c57 100644 --- a/llvm/include/llvm/Support/AutoConvert.h +++ b/llvm/include/llvm/Support/AutoConvert.h @@ -18,6 +18,7 @@ #include <_Ccsid.h> #endif #ifdef __cplusplus +#include "llvm/ADT/Twine.h" #include "llvm/Support/Error.h" #include #endif /* __cplusplus */ @@ -47,12 +48,12 @@ namespace llvm { std::error_code setzOSFileTag(int FD, int CCSID, bool Text); /** \brief Get the the tag ccsid for a file name or a file descriptor. */ -ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1); +ErrorOr<__ccsid_t> getzOSFileTag(const Twine &FileName, const int FD = -1); /** \brief Query the file tag to determine if it needs conversion to UTF-8 * codepage. 
*/ -ErrorOr needzOSConversion(const char *FileName, const int FD = -1); +ErrorOr needzOSConversion(const Twine &FileName, const int FD = -1); #endif /* __MVS__*/ @@ -87,7 +88,7 @@ inline std::error_code setFileTag(int FD, int CCSID, bool Text) { return std::error_code(); } -inline ErrorOr needConversion(const char *FileName, const int FD = -1) { +inline ErrorOr needConversion(const Twine &FileName, const int FD = -1) { #ifdef __MVS__ return needzOSConversion(FileName, FD); #endif diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp index 0b6928e10ef5a..741bb7bd2c5b0 100644 --- a/llvm/lib/Support/AutoConvert.cpp +++ b/llvm/lib/Support/AutoConvert.cpp @@ -96,7 +96,7 @@ std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) { return std::error_code(); } -ErrorOr<__ccsid_t> llvm::getzOSFileTag(const char *FileName, const int FD) { +ErrorOr<__ccsid_t> llvm::getzOSFileTag(const Twine &FileName, const int FD) { // If we have a file descriptor, use it to find out file tagging. Otherwise we // need to use stat() with the file path. 
if (FD != -1) { @@ -110,12 +110,12 @@ ErrorOr<__ccsid_t> llvm::getzOSFileTag(const char *FileName, const int FD) { return Query.fccsid; } struct stat Attr; - if (stat(FileName, &Attr) == -1) + if (stat(FileName.str().c_str(), &Attr) == -1) return std::error_code(errno, std::generic_category()); return Attr.st_tag.ft_ccsid; } -ErrorOr llvm::needzOSConversion(const char *FileName, const int FD) { +ErrorOr llvm::needzOSConversion(const Twine &FileName, const int FD) { ErrorOr<__ccsid_t> Ccsid = getzOSFileTag(FileName, FD); if (std::error_code EC = Ccsid.getError()) return EC; diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index 1c4645ad83641..23b9f8c5790d2 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -512,7 +512,7 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize, } #ifdef __MVS__ - ErrorOr NeedsConversion = needConversion(Filename.str().c_str(), FD); + ErrorOr NeedsConversion = needConversion(Filename, FD); if (std::error_code EC = NeedsConversion.getError()) return EC; // File size may increase due to EBCDIC -> UTF-8 conversion, therefore we From f9eb70e80e0c233f01d5ecd9451ce531676f73e1 Mon Sep 17 00:00:00 2001 From: Sirui Mu Date: Wed, 29 Oct 2025 23:04:26 +0800 Subject: [PATCH 111/539] [clang-tidy] Allow thread-local variables in avoid-non-const-global-variables (#164442) This patch adds an option named `AllowThreadLocal` to the `cppcoreguidelines-avoid-non-const-global-variables` check. When set to true, the option suppresses warnings generated for non-const global variables with thread-local storage duration. By default, the option is set to false. 
--- .../AvoidNonConstGlobalVariablesCheck.cpp | 5 ++++- .../AvoidNonConstGlobalVariablesCheck.h | 1 + clang-tools-extra/docs/ReleaseNotes.rst | 5 +++++ .../avoid-non-const-global-variables.rst | 5 +++++ .../avoid-non-const-global-variables.cpp | 14 ++++++++++++++ 5 files changed, 29 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.cpp index f0e66e44690b2..2c0baa5716954 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.cpp @@ -17,7 +17,8 @@ namespace clang::tidy::cppcoreguidelines { AvoidNonConstGlobalVariablesCheck::AvoidNonConstGlobalVariablesCheck( StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - AllowInternalLinkage(Options.get("AllowInternalLinkage", false)) {} + AllowInternalLinkage(Options.get("AllowInternalLinkage", false)), + AllowThreadLocal(Options.get("AllowThreadLocal", false)) {} void AvoidNonConstGlobalVariablesCheck::registerMatchers(MatchFinder *Finder) { auto NamespaceMatcher = AllowInternalLinkage @@ -31,6 +32,8 @@ void AvoidNonConstGlobalVariablesCheck::registerMatchers(MatchFinder *Finder) { GlobalContext, AllowInternalLinkage ? varDecl(unless(isStaticStorageClass())) : varDecl(), + AllowThreadLocal ? 
varDecl(unless(hasThreadStorageDuration())) + : varDecl(), unless(anyOf( isConstexpr(), hasType(isConstQualified()), hasType(referenceType())))); // References can't be changed, only the diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.h index 5e7c968b12f97..d8f2a733e3b01 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.h @@ -27,6 +27,7 @@ class AvoidNonConstGlobalVariablesCheck : public ClangTidyCheck { private: const bool AllowInternalLinkage; + const bool AllowThreadLocal; }; } // namespace clang::tidy::cppcoreguidelines diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 8f4be0d1cb259..6701bf25df166 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -331,6 +331,11 @@ Changes in existing checks an additional matcher that generalizes the copy-and-swap idiom pattern detection. +- Improved :doc:`cppcoreguidelines-avoid-non-const-global-variables + ` check + by adding a new option `AllowThreadLocal` that suppresses warnings on + non-const global variables with thread-local storage duration. + - Improved :doc:`cppcoreguidelines-init-variables ` check by fixing the insertion location for function pointers with multiple parameters. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-non-const-global-variables.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-non-const-global-variables.rst index 8da284ca13e3d..3d5fef3a07dca 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-non-const-global-variables.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-non-const-global-variables.rst @@ -49,3 +49,8 @@ Options When set to `true`, static non-const variables and variables in anonymous namespaces will not generate a warning. The default value is `false`. + +.. option:: AllowThreadLocal + + When set to `true`, non-const global variables with thread-local storage + duration will not generate a warning. The default value is `false`. diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-non-const-global-variables.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-non-const-global-variables.cpp index 334332def216f..30bdd68a21b84 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-non-const-global-variables.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-non-const-global-variables.cpp @@ -1,6 +1,8 @@ // RUN: %check_clang_tidy %s -check-suffixes=,DEFAULT cppcoreguidelines-avoid-non-const-global-variables %t // RUN: %check_clang_tidy %s -check-suffixes=,INTERNAL-LINKAGE cppcoreguidelines-avoid-non-const-global-variables %t -- \ // RUN: -config="{CheckOptions: {cppcoreguidelines-avoid-non-const-global-variables.AllowInternalLinkage : 'true'}}" +// RUN: %check_clang_tidy %s -check-suffixes=,THREAD-LOCAL cppcoreguidelines-avoid-non-const-global-variables %t -- \ +// RUN: -config="{CheckOptions: {cppcoreguidelines-avoid-non-const-global-variables.AllowThreadLocal : 'true'}}" int nonConstInt = 0; // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: variable 'nonConstInt' is non-const and globally accessible, consider 
making it const [cppcoreguidelines-avoid-non-const-global-variables] @@ -42,14 +44,23 @@ namespace { int nonConstAnonymousNamespaceInt = 0; // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:5: warning: variable 'nonConstAnonymousNamespaceInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] // CHECK-MESSAGES-INTERNAL-LINKAGE-NOT: :[[@LINE-2]]:5: warning: variable 'nonConstAnonymousNamespaceInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] +// CHECK-MESSAGES-THREAD-LOCAL: :[[@LINE-3]]:5: warning: variable 'nonConstAnonymousNamespaceInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] } // namespace static int nonConstStaticInt = 0; // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:12: warning: variable 'nonConstStaticInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] // CHECK-MESSAGES-INTERNAL-LINKAGE-NOT: :[[@LINE-2]]:12: warning: variable 'nonConstStaticInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] +// CHECK-MESSAGES-THREAD-LOCAL: :[[@LINE-3]]:12: warning: variable 'nonConstStaticInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] static const int constStaticInt = 0; +thread_local int threadLocalInt = 0; +// CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:18: warning: variable 'threadLocalInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] +// CHECK-MESSAGES-INTERNAL-LINKAGE: :[[@LINE-2]]:18: warning: variable 'threadLocalInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] +// CHECK-MESSAGES-THREAD-LOCAL-NOT: :[[@LINE-3]]:18: warning: variable 
'threadLocalInt' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] + +thread_local const int threadLocalConstInt = 0; + class DummyClass { public: int nonConstPublicMemberVariable = 0; @@ -137,6 +148,7 @@ DummyEnum nonConstAnonymousNamespaceEnumInstance = DummyEnum::first; } // CHECK-MESSAGES-DEFAULT: :[[@LINE-2]]:11: warning: variable 'nonConstAnonymousNamespaceEnumInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] // CHECK-MESSAGES-INTERNAL-LINKAGE-NOT: :[[@LINE-2]]:11: warning: variable 'nonConstAnonymousNamespaceEnumInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] +// CHECK-MESSAGES-THREAD-LOCAL: :[[@LINE-4]]:11: warning: variable 'nonConstAnonymousNamespaceEnumInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] // CHECKING FOR NON-CONST GLOBAL STRUCT /////////////////////////////////////// struct DummyStruct { @@ -181,6 +193,7 @@ DummyStruct nonConstAnonymousNamespaceStructInstance; } // CHECK-MESSAGES-DEFAULT: :[[@LINE-2]]:13: warning: variable 'nonConstAnonymousNamespaceStructInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] // CHECK-MESSAGES-INTERNAL-LINKAGE-NOT: :[[@LINE-2]]:11: warning: variable 'nonConstAnonymousNamespaceEnumInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] +// CHECK-MESSAGES-THREAD-LOCAL: :[[@LINE-4]]:13: warning: variable 'nonConstAnonymousNamespaceStructInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] // CHECKING FOR NON-CONST GLOBAL UNION //////////////////////////////////////// union DummyUnion { @@ -222,6 +235,7 @@ 
DummyUnion nonConstAnonymousNamespaceUnionInstance = {0x0}; } // CHECK-MESSAGES-DEFAULT: :[[@LINE-2]]:12: warning: variable 'nonConstAnonymousNamespaceUnionInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] // CHECK-MESSAGES-INTERNAL-LINKAGE-NOT: :[[@LINE-3]]:12: warning: variable 'nonConstAnonymousNamespaceUnionInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] +// CHECK-MESSAGES-THREAD-LOCAL: :[[@LINE-4]]:12: warning: variable 'nonConstAnonymousNamespaceUnionInstance' is non-const and globally accessible, consider making it const [cppcoreguidelines-avoid-non-const-global-variables] // CHECKING FOR NON-CONST GLOBAL FUNCTION POINTER ///////////////////////////// int dummyFunction() { From c3d489a91d7cbcc9ab6ab4e4a68a1300abb6ae67 Mon Sep 17 00:00:00 2001 From: Tarun Prabhu Date: Wed, 29 Oct 2025 09:13:17 -0600 Subject: [PATCH 112/539] [flang][driver] Use -Xflang in diagnostics When an option that is only available in `flang -fc1` is provided to `flang`, emit a diagnostic with a suggestion containing "did you mean -Xflang '-foo'". Partially addresses #163550. 
--- clang/lib/Driver/Driver.cpp | 15 ++++++++++++--- flang/test/Driver/flang-f-opts.f90 | 21 +++++++++++++++++++-- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 40ea513e85427..71c52807091ba 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -308,9 +308,18 @@ InputArgList Driver::ParseArgStrings(ArrayRef ArgStrings, auto ArgString = A->getAsString(Args); std::string Nearest; if (getOpts().findNearest(ArgString, Nearest, VisibilityMask) > 1) { - if (!IsCLMode() && - getOpts().findExact(ArgString, Nearest, - llvm::opt::Visibility(options::CC1Option))) { + if (IsFlangMode()) { + if (getOpts().findExact(ArgString, Nearest, + llvm::opt::Visibility(options::FC1Option))) { + DiagID = diag::err_drv_unknown_argument_with_suggestion; + Diags.Report(DiagID) << ArgString << "-Xflang " + Nearest; + } else { + DiagID = diag::err_drv_unknown_argument; + Diags.Report(DiagID) << ArgString; + } + } else if (!IsCLMode() && getOpts().findExact(ArgString, Nearest, + llvm::opt::Visibility( + options::CC1Option))) { DiagID = diag::err_drv_unknown_argument_with_suggestion; Diags.Report(DiagID) << ArgString << "-Xclang " + Nearest; } else { diff --git a/flang/test/Driver/flang-f-opts.f90 b/flang/test/Driver/flang-f-opts.f90 index 77bb4d7aa8a91..9ef0abaa176f0 100644 --- a/flang/test/Driver/flang-f-opts.f90 +++ b/flang/test/Driver/flang-f-opts.f90 @@ -1,5 +1,5 @@ -! Test for warnings generated when parsing driver options. You can use this file for relatively small tests and to avoid creating -! new test files. +! Test for errors and warnings generated when parsing driver options. You can +! use this file for relatively small tests and to avoid creating new test files. ! RUN: %flang -### -S -O4 -ffp-contract=on %s 2>&1 | FileCheck %s @@ -26,3 +26,20 @@ ! RUN: | FileCheck %s -check-prefix=WARN-BUILTIN-MULTIPLE ! WARN-BUILTIN-MULTIPLE: warning: '-fbuiltin' is not valid for Fortran ! 
WARN-BUILTIN-MULTIPLE: warning: '-fno-builtin' is not valid for Fortran + +! When emitting an error with a suggestion, ensure that the diagnostic message +! uses '-Xflang' instead of '-Xclang'. This is typically emitted when an option +! that is available for `flang -fc1` is passed to `flang`. We use -complex-range +! since it is only available for fc1. If this option is ever exposed to `flang`, +! a different option will have to be used in the test below. +! +! RUN: not %flang -### -complex-range=full %s 2>&1 \ +! RUN: | FileCheck %s -check-prefix UNKNOWN-SUGGEST +! +! UNKNOWN-SUGGEST: error: unknown argument '-complex-range=full'; +! UNKNOWN-SUGGEST-SAME: did you mean '-Xflang -complex-range=full' +! +! RUN: not %flang -### -not-an-option %s 2>&1 \ +! RUN: | FileCheck %s -check-prefix UNKNOWN-NO-SUGGEST +! +! UNKNOWN-NO-SUGGEST: error: unknown argument: '-not-an-option'{{$}} From 58f89fda6991019654d5d220ce27591ce86343f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E5=AD=90=E6=98=82?= <121872494+Michael-Chen-NJU@users.noreply.github.com> Date: Wed, 29 Oct 2025 23:17:38 +0800 Subject: [PATCH 113/539] [DAG] Add generic m_TernaryOp() / m_c_TernaryOp() matchers (#165520) Similar to the m_BinOp/m_c_BinOp matchers, this patch introduces generic matchers for SelectionDAG nodes with three operands. This includes: - Adding m_TernaryOp() and m_c_TernaryOp() templates in SDPatternMatch.h. - Adding comprehensive test coverage in SelectionDAGPatternMatchTest.cpp. 
Fixes #165378 --- llvm/include/llvm/CodeGen/SDPatternMatch.h | 12 ++++ .../CodeGen/SelectionDAGPatternMatchTest.cpp | 70 +++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 0dcf400962393..9a6bf5ffdd227 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -583,6 +583,18 @@ m_InsertSubvector(const LHS &Base, const RHS &Sub, const IDX &Idx) { return TernaryOpc_match(ISD::INSERT_SUBVECTOR, Base, Sub, Idx); } +template +inline TernaryOpc_match +m_TernaryOp(unsigned Opc, const T0_P &Op0, const T1_P &Op1, const T2_P &Op2) { + return TernaryOpc_match(Opc, Op0, Op1, Op2); +} + +template +inline TernaryOpc_match +m_c_TernaryOp(unsigned Opc, const T0_P &Op0, const T1_P &Op1, const T2_P &Op2) { + return TernaryOpc_match(Opc, Op0, Op1, Op2); +} + template inline auto m_SelectCC(const LTy &L, const RTy &R, const TTy &T, const FTy &F, const CCTy &CC) { diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp index aa56aafa2812c..ceaee52a3948b 100644 --- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp +++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp @@ -354,6 +354,76 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) { sd_match(InsertELT, m_InsertElt(m_Value(), m_Value(), m_SpecificInt(1)))); } +TEST_F(SelectionDAGPatternMatchTest, matchGenericTernaryOp) { + SDLoc DL; + auto Float32VT = EVT::getFloatingPointVT(32); + + SDValue Op0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Float32VT); + SDValue Op1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, Float32VT); + SDValue Op2 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 3, Float32VT); + + SDValue FMA = DAG->getNode(ISD::FMA, DL, Float32VT, Op0, Op1, Op2); + SDValue FAdd = DAG->getNode(ISD::FADD, DL, Float32VT, Op0, Op1); + + using namespace SDPatternMatch; + SDValue A, 
B, C; + + EXPECT_TRUE(sd_match(FMA, m_TernaryOp(ISD::FMA, m_Specific(Op0), + m_Specific(Op1), m_Specific(Op2)))); + EXPECT_FALSE(sd_match(FMA, m_TernaryOp(ISD::FADD, m_Specific(Op0), + m_Specific(Op1), m_Specific(Op2)))); + EXPECT_FALSE( + sd_match(FAdd, m_TernaryOp(ISD::FMA, m_Value(), m_Value(), m_Value()))); + EXPECT_FALSE(sd_match(FMA, m_TernaryOp(ISD::FMA, m_Specific(Op1), + m_Specific(Op0), m_Specific(Op2)))); + + EXPECT_TRUE( + sd_match(FMA, m_TernaryOp(ISD::FMA, m_Value(A), m_Value(B), m_Value(C)))); + EXPECT_EQ(A, Op0); + EXPECT_EQ(B, Op1); + EXPECT_EQ(C, Op2); + + A = B = C = SDValue(); + + EXPECT_TRUE(sd_match(FMA, m_c_TernaryOp(ISD::FMA, m_Specific(Op0), + m_Specific(Op1), m_Specific(Op2)))); + EXPECT_TRUE(sd_match(FMA, m_c_TernaryOp(ISD::FMA, m_Specific(Op1), + m_Specific(Op0), m_Specific(Op2)))); + + EXPECT_FALSE(sd_match(FMA, m_c_TernaryOp(ISD::FMA, m_Specific(Op2), + m_Specific(Op1), m_Specific(Op0)))); + EXPECT_FALSE(sd_match(FMA, m_c_TernaryOp(ISD::FMA, m_Specific(Op2), + m_Specific(Op0), m_Specific(Op1)))); + + EXPECT_FALSE(sd_match(FMA, m_c_TernaryOp(ISD::FMA, m_Specific(Op0), + m_Specific(Op2), m_Specific(Op1)))); + EXPECT_FALSE(sd_match(FMA, m_c_TernaryOp(ISD::FMA, m_Specific(Op1), + m_Specific(Op2), m_Specific(Op0)))); + + EXPECT_TRUE(sd_match( + FMA, m_c_TernaryOp(ISD::FMA, m_Value(A), m_Value(B), m_Value(C)))); + EXPECT_EQ(A, Op0); + EXPECT_EQ(B, Op1); + EXPECT_EQ(C, Op2); + + A = B = C = SDValue(); + EXPECT_TRUE(sd_match( + FMA, m_c_TernaryOp(ISD::FMA, m_Value(B), m_Value(A), m_Value(C)))); + EXPECT_EQ(A, Op1); + EXPECT_EQ(B, Op0); + EXPECT_EQ(C, Op2); + + A = B = C = SDValue(); + EXPECT_TRUE(sd_match( + FMA, m_c_TernaryOp(ISD::FMA, m_Value(A), m_Value(B), m_Value(C)))); + EXPECT_EQ(A, Op0); + EXPECT_EQ(B, Op1); + EXPECT_EQ(C, Op2); + + EXPECT_FALSE( + sd_match(FAdd, m_c_TernaryOp(ISD::FMA, m_Value(), m_Value(), m_Value()))); +} + TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { SDLoc DL; auto Int32VT = EVT::getIntegerVT(Context, 32); 
From 891ee65c5678137fd9115feb08286d0ac43cf7fc Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Wed, 29 Oct 2025 15:23:46 +0000 Subject: [PATCH 114/539] [DebugInfo] Add dataSize to DIBasicType to add DW_AT_bit_size to _BitInt types (#164372) DW_TAG_base_type DIEs are permitted to have both byte_size and bit_size attributes "If the value of an object of the given type does not fully occupy the storage described by a byte size attribute" * Add DataSizeInBits to DIBasicType (`DIBasicType(... dataSize: n ...)` in IR). * Change Clang to add DataSizeInBits to _BitInt type metadata. * Change LLVM to add DW_AT_bit_size to base_type DIEs that have non-zero DataSizeInBits. TODO: Do we need to emit DW_AT_data_bit_offset for big endian targets? See discussion on the PR. Fixes [#61952](https://github.com/llvm/llvm-project/issues/61952) --------- Co-authored-by: David Stenberg --- clang/lib/CodeGen/CGDebugInfo.cpp | 5 +- clang/test/DebugInfo/Generic/bit-int.c | 8 ++ llvm/include/llvm/IR/DIBuilder.h | 7 +- llvm/include/llvm/IR/DebugInfoMetadata.h | 78 ++++++++++++------- llvm/lib/AsmParser/LLParser.cpp | 9 ++- llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 8 +- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 1 + .../CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 6 +- llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 15 +++- llvm/lib/IR/AsmWriter.cpp | 1 + llvm/lib/IR/DIBuilder.cpp | 6 +- llvm/lib/IR/DebugInfoMetadata.cpp | 15 ++-- llvm/lib/IR/LLVMContextImpl.h | 11 ++- llvm/test/Bitcode/dbg-data-size-roundtrip.ll | 19 +++++ llvm/test/DebugInfo/X86/base-type-size.ll | 3 + llvm/test/DebugInfo/bit-int-size.ll | 38 +++++++++ 16 files changed, 173 insertions(+), 57 deletions(-) create mode 100644 clang/test/DebugInfo/Generic/bit-int.c create mode 100644 llvm/test/Bitcode/dbg-data-size-roundtrip.ll create mode 100644 llvm/test/DebugInfo/bit-int-size.ll diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 6af806686a3b9..07a2cfb21bef2 100644 --- 
a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -1174,14 +1174,13 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) { } llvm::DIType *CGDebugInfo::CreateType(const BitIntType *Ty) { - StringRef Name = Ty->isUnsigned() ? "unsigned _BitInt" : "_BitInt"; llvm::dwarf::TypeKind Encoding = Ty->isUnsigned() ? llvm::dwarf::DW_ATE_unsigned : llvm::dwarf::DW_ATE_signed; - return DBuilder.createBasicType(Name, CGM.getContext().getTypeSize(Ty), - Encoding); + Encoding, llvm::DINode::FlagZero, 0, + Ty->getNumBits()); } llvm::DIType *CGDebugInfo::CreateType(const ComplexType *Ty) { diff --git a/clang/test/DebugInfo/Generic/bit-int.c b/clang/test/DebugInfo/Generic/bit-int.c new file mode 100644 index 0000000000000..94b93013e3b46 --- /dev/null +++ b/clang/test/DebugInfo/Generic/bit-int.c @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -x c++ %s -debug-info-kind=standalone -gno-column-info -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -x c %s -debug-info-kind=standalone -gno-column-info -emit-llvm -o - | FileCheck %s + +unsigned _BitInt(17) a; +_BitInt(2) b; + +// CHECK: !DIBasicType(name: "_BitInt", size: 8, dataSize: 2, encoding: DW_ATE_signed) +// CHECK: !DIBasicType(name: "unsigned _BitInt", size: 32, dataSize: 17, encoding: DW_ATE_unsigned) diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h index f3839c9694f34..4228ec9c3ef7a 100644 --- a/llvm/include/llvm/IR/DIBuilder.h +++ b/llvm/include/llvm/IR/DIBuilder.h @@ -209,10 +209,15 @@ namespace llvm { /// \param NumExtraInhabitants The number of extra inhabitants of the type. /// An extra inhabitant is a bit pattern that does not represent a valid /// value for instances of a given type. This is used by the Swift language. + /// \param DataSizeInBits Optionally describes the number of bits used by + /// the value of the object when this is less than the storage size of + /// SizeInBits. 
Default value of zero indicates the object value and storage + /// sizes are equal. LLVM_ABI DIBasicType * createBasicType(StringRef Name, uint64_t SizeInBits, unsigned Encoding, DINode::DIFlags Flags = DINode::FlagZero, - uint32_t NumExtraInhabitants = 0); + uint32_t NumExtraInhabitants = 0, + uint32_t DataSizeInBits = 0); /// Create debugging information entry for a binary fixed-point type. /// \param Name Type name. diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h index c626efc9daaa4..7ade6b8e13308 100644 --- a/llvm/include/llvm/IR/DebugInfoMetadata.h +++ b/llvm/include/llvm/IR/DebugInfoMetadata.h @@ -891,96 +891,114 @@ class DIBasicType : public DIType { friend class MDNode; unsigned Encoding; + /// Describes the number of bits used by the value of the object. Non-zero + /// when the value of an object does not fully occupy the storage size + /// specified by SizeInBits. + uint32_t DataSizeInBits; protected: DIBasicType(LLVMContext &C, StorageType Storage, unsigned Tag, uint32_t AlignInBits, unsigned Encoding, - uint32_t NumExtraInhabitants, DIFlags Flags, - ArrayRef Ops) + uint32_t NumExtraInhabitants, uint32_t DataSizeInBits, + DIFlags Flags, ArrayRef Ops) : DIType(C, DIBasicTypeKind, Storage, Tag, 0, AlignInBits, NumExtraInhabitants, Flags, Ops), - Encoding(Encoding) {} + Encoding(Encoding), DataSizeInBits(DataSizeInBits) {} DIBasicType(LLVMContext &C, unsigned ID, StorageType Storage, unsigned Tag, uint32_t AlignInBits, unsigned Encoding, - uint32_t NumExtraInhabitants, DIFlags Flags, - ArrayRef Ops) + uint32_t NumExtraInhabitants, uint32_t DataSizeInBits, + DIFlags Flags, ArrayRef Ops) : DIType(C, ID, Storage, Tag, 0, AlignInBits, NumExtraInhabitants, Flags, Ops), - Encoding(Encoding) {} + Encoding(Encoding), DataSizeInBits(DataSizeInBits) {} ~DIBasicType() = default; static DIBasicType *getImpl(LLVMContext &Context, unsigned Tag, StringRef Name, uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, - 
uint32_t NumExtraInhabitants, DIFlags Flags, + uint32_t NumExtraInhabitants, + uint32_t DataSizeInBits, DIFlags Flags, StorageType Storage, bool ShouldCreate = true) { return getImpl(Context, Tag, getCanonicalMDString(Context, Name), SizeInBits, AlignInBits, Encoding, NumExtraInhabitants, - Flags, Storage, ShouldCreate); + DataSizeInBits, Flags, Storage, ShouldCreate); } static DIBasicType *getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, - uint32_t NumExtraInhabitants, DIFlags Flags, + uint32_t NumExtraInhabitants, + uint32_t DataSizeInBits, DIFlags Flags, StorageType Storage, bool ShouldCreate = true) { auto *SizeInBitsNode = ConstantAsMetadata::get( ConstantInt::get(Type::getInt64Ty(Context), SizeInBits)); return getImpl(Context, Tag, Name, SizeInBitsNode, AlignInBits, Encoding, - NumExtraInhabitants, Flags, Storage, ShouldCreate); + NumExtraInhabitants, DataSizeInBits, Flags, Storage, + ShouldCreate); } - LLVM_ABI static DIBasicType *getImpl(LLVMContext &Context, unsigned Tag, - MDString *Name, Metadata *SizeInBits, - uint32_t AlignInBits, unsigned Encoding, - uint32_t NumExtraInhabitants, - DIFlags Flags, StorageType Storage, - bool ShouldCreate = true); + LLVM_ABI static DIBasicType * + getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, + Metadata *SizeInBits, uint32_t AlignInBits, unsigned Encoding, + uint32_t NumExtraInhabitants, uint32_t DataSizeInBits, DIFlags Flags, + StorageType Storage, bool ShouldCreate = true); TempDIBasicType cloneImpl() const { return getTemporary(getContext(), getTag(), getRawName(), getRawSizeInBits(), getAlignInBits(), getEncoding(), - getNumExtraInhabitants(), getFlags()); + getNumExtraInhabitants(), getDataSizeInBits(), + getFlags()); } public: DEFINE_MDNODE_GET(DIBasicType, (unsigned Tag, StringRef Name), - (Tag, Name, 0, 0, 0, 0, FlagZero)) + (Tag, Name, 0, 0, 0, 0, 0, FlagZero)) DEFINE_MDNODE_GET(DIBasicType, (unsigned Tag, StringRef Name, 
uint64_t SizeInBits), - (Tag, Name, SizeInBits, 0, 0, 0, FlagZero)) + (Tag, Name, SizeInBits, 0, 0, 0, 0, FlagZero)) DEFINE_MDNODE_GET(DIBasicType, (unsigned Tag, MDString *Name, uint64_t SizeInBits), - (Tag, Name, SizeInBits, 0, 0, 0, FlagZero)) + (Tag, Name, SizeInBits, 0, 0, 0, 0, FlagZero)) DEFINE_MDNODE_GET(DIBasicType, (unsigned Tag, StringRef Name, uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, DIFlags Flags), - (Tag, Name, SizeInBits, AlignInBits, Encoding, 0, Flags)) + (Tag, Name, SizeInBits, AlignInBits, Encoding, 0, 0, Flags)) DEFINE_MDNODE_GET(DIBasicType, (unsigned Tag, MDString *Name, uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, DIFlags Flags), - (Tag, Name, SizeInBits, AlignInBits, Encoding, 0, Flags)) + (Tag, Name, SizeInBits, AlignInBits, Encoding, 0, 0, Flags)) DEFINE_MDNODE_GET(DIBasicType, (unsigned Tag, StringRef Name, uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, uint32_t NumExtraInhabitants, DIFlags Flags), (Tag, Name, SizeInBits, AlignInBits, Encoding, - NumExtraInhabitants, Flags)) + NumExtraInhabitants, 0, Flags)) + DEFINE_MDNODE_GET(DIBasicType, + (unsigned Tag, StringRef Name, uint64_t SizeInBits, + uint32_t AlignInBits, unsigned Encoding, + uint32_t NumExtraInhabitants, uint32_t DataSizeInBits, + DIFlags Flags), + (Tag, Name, SizeInBits, AlignInBits, Encoding, + NumExtraInhabitants, DataSizeInBits, Flags)) DEFINE_MDNODE_GET(DIBasicType, (unsigned Tag, MDString *Name, uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, - uint32_t NumExtraInhabitants, DIFlags Flags), + uint32_t NumExtraInhabitants, uint32_t DataSizeInBits, + DIFlags Flags), (Tag, Name, SizeInBits, AlignInBits, Encoding, - NumExtraInhabitants, Flags)) + NumExtraInhabitants, DataSizeInBits, Flags)) DEFINE_MDNODE_GET(DIBasicType, (unsigned Tag, MDString *Name, Metadata *SizeInBits, uint32_t AlignInBits, unsigned Encoding, - uint32_t NumExtraInhabitants, DIFlags Flags), + uint32_t NumExtraInhabitants, uint32_t 
DataSizeInBits, + DIFlags Flags), (Tag, Name, SizeInBits, AlignInBits, Encoding, - NumExtraInhabitants, Flags)) + NumExtraInhabitants, DataSizeInBits, Flags)) TempDIBasicType clone() const { return cloneImpl(); } unsigned getEncoding() const { return Encoding; } + uint32_t getDataSizeInBits() const { return DataSizeInBits; } + enum class Signedness { Signed, Unsigned }; /// Return the signedness of this type, or std::nullopt if this type is @@ -1010,7 +1028,7 @@ class DIFixedPointType : public DIBasicType { uint32_t AlignInBits, unsigned Encoding, DIFlags Flags, unsigned Kind, int Factor, ArrayRef Ops) : DIBasicType(C, DIFixedPointTypeKind, Storage, Tag, AlignInBits, - Encoding, 0, Flags, Ops), + Encoding, 0, 0, Flags, Ops), Kind(Kind), Factor(Factor) { assert(Kind == FixedPointBinary || Kind == FixedPointDecimal); } @@ -1019,7 +1037,7 @@ class DIFixedPointType : public DIBasicType { unsigned Kind, APInt Numerator, APInt Denominator, ArrayRef Ops) : DIBasicType(C, DIFixedPointTypeKind, Storage, Tag, AlignInBits, - Encoding, 0, Flags, Ops), + Encoding, 0, 0, Flags, Ops), Kind(Kind), Factor(0), Numerator(Numerator), Denominator(Denominator) { assert(Kind == FixedPointRational); } @@ -1028,7 +1046,7 @@ class DIFixedPointType : public DIBasicType { unsigned Kind, int Factor, APInt Numerator, APInt Denominator, ArrayRef Ops) : DIBasicType(C, DIFixedPointTypeKind, Storage, Tag, AlignInBits, - Encoding, 0, Flags, Ops), + Encoding, 0, 0, Flags, Ops), Kind(Kind), Factor(Factor), Numerator(Numerator), Denominator(Denominator) {} ~DIFixedPointType() = default; diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 4cc47c0d0260e..8e3ce4990f437 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -5642,16 +5642,17 @@ bool LLParser::parseDIBasicType(MDNode *&Result, bool IsDistinct) { OPTIONAL(name, MDStringField, ); \ OPTIONAL(size, MDUnsignedOrMDField, (0, UINT64_MAX)); \ OPTIONAL(align, MDUnsignedField, (0, 
UINT32_MAX)); \ + OPTIONAL(dataSize, MDUnsignedField, (0, UINT32_MAX)); \ OPTIONAL(encoding, DwarfAttEncodingField, ); \ OPTIONAL(num_extra_inhabitants, MDUnsignedField, (0, UINT32_MAX)); \ OPTIONAL(flags, DIFlagField, ); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS - Result = GET_OR_DISTINCT(DIBasicType, (Context, tag.Val, name.Val, - size.getValueAsMetadata(Context), - align.Val, encoding.Val, - num_extra_inhabitants.Val, flags.Val)); + Result = GET_OR_DISTINCT( + DIBasicType, + (Context, tag.Val, name.Val, size.getValueAsMetadata(Context), align.Val, + encoding.Val, num_extra_inhabitants.Val, dataSize.Val, flags.Val)); return false; } diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index 4df500b948abf..c63dc8f00785e 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -1531,7 +1531,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( break; } case bitc::METADATA_BASIC_TYPE: { - if (Record.size() < 6 || Record.size() > 8) + if (Record.size() < 6 || Record.size() > 9) return error("Invalid record"); IsDistinct = Record[0] & 1; @@ -1540,13 +1540,13 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( ? static_cast(Record[6]) : DINode::FlagZero; uint32_t NumExtraInhabitants = (Record.size() > 7) ? Record[7] : 0; - + uint32_t DataSizeInBits = (Record.size() > 8) ? 
Record[8] : 0; Metadata *SizeInBits = getMetadataOrConstant(SizeIsMetadata, Record[3]); - MetadataList.assignValue( GET_OR_DISTINCT(DIBasicType, (Context, Record[1], getMDString(Record[2]), SizeInBits, - Record[4], Record[5], NumExtraInhabitants, Flags)), + Record[4], Record[5], NumExtraInhabitants, + DataSizeInBits, Flags)), NextMetadataNo); NextMetadataNo++; break; diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 61aa7c2f5af53..f17656c7c3b03 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1925,6 +1925,7 @@ void ModuleBitcodeWriter::writeDIBasicType(const DIBasicType *N, Record.push_back(N->getEncoding()); Record.push_back(N->getFlags()); Record.push_back(N->getNumExtraInhabitants()); + Record.push_back(N->getDataSizeInBits()); Stream.EmitRecord(bitc::METADATA_BASIC_TYPE, Record, Abbrev); Record.clear(); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 518121e200190..751d3735d3b2b 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -1793,9 +1793,13 @@ void DwarfCompileUnit::createBaseTypeDIEs() { "_" + Twine(Btr.BitSize)).toStringRef(Str)); addUInt(Die, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, Btr.Encoding); // Round up to smallest number of bytes that contains this number of bits. + // ExprRefedBaseTypes is populated with types referenced by + // DW_OP_LLVM_convert operations in location expressions. These are often + // byte-sized, but one common counter-example is 1-bit sized conversions + // from `i1` types. TODO: Should these use DW_AT_bit_size? See + // DwarfUnit::constructTypeDIE. 
addUInt(Die, dwarf::DW_AT_byte_size, std::nullopt, divideCeil(Btr.BitSize, 8)); - Btr.Die = &Die; } } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index e40fb768027b8..555c56fd322bb 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -766,8 +766,19 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIBasicType *BTy) { addUInt(Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, BTy->getEncoding()); - uint64_t Size = BTy->getSizeInBits() >> 3; - addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, Size); + uint64_t SizeInBytes = divideCeil(BTy->getSizeInBits(), 8); + addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, SizeInBytes); + if (BTy->getTag() == dwarf::Tag::DW_TAG_base_type) { + // DW_TAG_base_type: + // If the value of an object of the given type does not fully occupy the + // storage described by a byte size attribute, the base type entry may also + // have a DW_AT_bit_size [...] attribute. + // TODO: Do big endian targets need DW_AT_data_bit_offset? See discussion in + // pull request #164372. 
+ if (uint64_t DataSizeInBits = BTy->getDataSizeInBits(); + DataSizeInBits && DataSizeInBits != SizeInBytes * 8) + addUInt(Buffer, dwarf::DW_AT_bit_size, std::nullopt, DataSizeInBits); + } if (BTy->isBigEndian()) addUInt(Buffer, dwarf::DW_AT_endianity, std::nullopt, dwarf::DW_END_big); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 3c222f54fd406..95d954f6b8174 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -2199,6 +2199,7 @@ static void writeDIBasicType(raw_ostream &Out, const DIBasicType *N, Printer.printString("name", N->getName()); Printer.printMetadataOrInt("size", N->getRawSizeInBits(), true); Printer.printInt("align", N->getAlignInBits()); + Printer.printInt("dataSize", N->getDataSizeInBits()); Printer.printDwarfEnum("encoding", N->getEncoding(), dwarf::AttributeEncodingString); Printer.printInt("num_extra_inhabitants", N->getNumExtraInhabitants()); diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 07a870f0630a5..ca11ecf2f473e 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -261,10 +261,12 @@ DIBasicType *DIBuilder::createNullPtrType() { DIBasicType *DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits, unsigned Encoding, DINode::DIFlags Flags, - uint32_t NumExtraInhabitants) { + uint32_t NumExtraInhabitants, + uint32_t DataSizeInBits) { assert(!Name.empty() && "Unable to create type without name"); return DIBasicType::get(VMContext, dwarf::DW_TAG_base_type, Name, SizeInBits, - 0, Encoding, NumExtraInhabitants, Flags); + 0, Encoding, NumExtraInhabitants, DataSizeInBits, + Flags); } DIFixedPointType * diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index e30df88e6b56b..fafc3254120de 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -872,15 +872,18 @@ DIEnumerator *DIEnumerator::getImpl(LLVMContext &Context, const APInt &Value, DIBasicType *DIBasicType::getImpl(LLVMContext 
&Context, unsigned Tag, MDString *Name, Metadata *SizeInBits, uint32_t AlignInBits, unsigned Encoding, - uint32_t NumExtraInhabitants, DIFlags Flags, + uint32_t NumExtraInhabitants, + uint32_t DataSizeInBits, DIFlags Flags, StorageType Storage, bool ShouldCreate) { assert(isCanonical(Name) && "Expected canonical MDString"); - DEFINE_GETIMPL_LOOKUP(DIBasicType, (Tag, Name, SizeInBits, AlignInBits, - Encoding, NumExtraInhabitants, Flags)); + DEFINE_GETIMPL_LOOKUP(DIBasicType, + (Tag, Name, SizeInBits, AlignInBits, Encoding, + NumExtraInhabitants, DataSizeInBits, Flags)); Metadata *Ops[] = {nullptr, nullptr, Name, SizeInBits, nullptr}; - DEFINE_GETIMPL_STORE(DIBasicType, - (Tag, AlignInBits, Encoding, NumExtraInhabitants, Flags), - Ops); + DEFINE_GETIMPL_STORE( + DIBasicType, + (Tag, AlignInBits, Encoding, NumExtraInhabitants, DataSizeInBits, Flags), + Ops); } std::optional DIBasicType::getSignedness() const { diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index e03f993297e54..2c9921df0422e 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -480,20 +480,22 @@ template <> struct MDNodeKeyImpl { uint32_t AlignInBits; unsigned Encoding; uint32_t NumExtraInhabitants; + uint32_t DataSizeInBits; unsigned Flags; MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *SizeInBits, uint32_t AlignInBits, unsigned Encoding, - uint32_t NumExtraInhabitants, unsigned Flags) + uint32_t NumExtraInhabitants, uint32_t DataSizeInBits, + unsigned Flags) : Tag(Tag), Name(Name), SizeInBits(SizeInBits), AlignInBits(AlignInBits), Encoding(Encoding), NumExtraInhabitants(NumExtraInhabitants), - Flags(Flags) {} + DataSizeInBits(DataSizeInBits), Flags(Flags) {} MDNodeKeyImpl(const DIBasicType *N) : Tag(N->getTag()), Name(N->getRawName()), SizeInBits(N->getRawSizeInBits()), AlignInBits(N->getAlignInBits()), Encoding(N->getEncoding()), - NumExtraInhabitants(N->getNumExtraInhabitants()), Flags(N->getFlags()) { - } + 
NumExtraInhabitants(N->getNumExtraInhabitants()), + DataSizeInBits(N->getDataSizeInBits()), Flags(N->getFlags()) {} bool isKeyOf(const DIBasicType *RHS) const { return Tag == RHS->getTag() && Name == RHS->getRawName() && @@ -501,6 +503,7 @@ template <> struct MDNodeKeyImpl { AlignInBits == RHS->getAlignInBits() && Encoding == RHS->getEncoding() && NumExtraInhabitants == RHS->getNumExtraInhabitants() && + DataSizeInBits == RHS->getDataSizeInBits() && Flags == RHS->getFlags(); } diff --git a/llvm/test/Bitcode/dbg-data-size-roundtrip.ll b/llvm/test/Bitcode/dbg-data-size-roundtrip.ll new file mode 100644 index 0000000000000..36a92538b8b7c --- /dev/null +++ b/llvm/test/Bitcode/dbg-data-size-roundtrip.ll @@ -0,0 +1,19 @@ +; RUN: opt %s -o - -S | llvm-as - | llvm-dis - | FileCheck %s + +; CHECK: !DIBasicType(name: "unsigned _BitInt", size: 32, dataSize: 17, encoding: DW_ATE_unsigned) + +@a = global i8 0, align 1, !dbg !0 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!6, !7} +!llvm.ident = !{!8} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "a", scope: !2, file: !3, line: 4, type: !5, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 22.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "bit-int.c", directory: "/") +!4 = !{!0} +!5 = !DIBasicType(name: "unsigned _BitInt", size: 32, dataSize: 17, encoding: DW_ATE_unsigned) +!6 = !{i32 2, !"Debug Info Version", i32 3} +!7 = !{i32 1, !"wchar_size", i32 4} +!8 = !{!"clang version 22.0.0git"} diff --git a/llvm/test/DebugInfo/X86/base-type-size.ll b/llvm/test/DebugInfo/X86/base-type-size.ll index 3a8dc37bdc65f..2f0ff2f60e95f 100644 --- a/llvm/test/DebugInfo/X86/base-type-size.ll +++ b/llvm/test/DebugInfo/X86/base-type-size.ll @@ -11,7 +11,10 @@ ; CHECK: DW_TAG_base_type ; 
CHECK-NEXT: DW_AT_name ("DW_ATE_unsigned_1") ; CHECK-NEXT: DW_AT_encoding (DW_ATE_unsigned) +;; TODO: Should this type use bit_size? +; CHECK-NOT: DW_AT_bit_size ; CHECK-NEXT: DW_AT_byte_size (0x01) +; CHECK-NOT: DW_AT_bit_size target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/DebugInfo/bit-int-size.ll b/llvm/test/DebugInfo/bit-int-size.ll new file mode 100644 index 0000000000000..e28921dc83db3 --- /dev/null +++ b/llvm/test/DebugInfo/bit-int-size.ll @@ -0,0 +1,38 @@ +; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s +; REQUIRES: object-emission + +;; Check base types with bit-sizes that don't fully fit within a byte +;; multiple get both a byte_size and bit_size attribute. + +; CHECK: DW_TAG_base_type +; CHECK-NEXT: DW_AT_name ("unsigned _BitInt") +; CHECK-NEXT: DW_AT_encoding (DW_ATE_unsigned) +; CHECK-NEXT: DW_AT_byte_size (0x04) +; CHECK-NEXT: DW_AT_bit_size (0x11) + +; CHECK: DW_TAG_base_type +; CHECK-NEXT: DW_AT_name ("_BitInt") +; CHECK-NEXT: DW_AT_encoding (DW_ATE_signed) +; CHECK-NEXT: DW_AT_byte_size (0x01) +; CHECK-NEXT: DW_AT_bit_size (0x02) + +@a = global i8 0, align 1, !dbg !0 +@b = global i8 0, align 1, !dbg !5 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!10, !11} +!llvm.ident = !{!12} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "a", scope: !2, file: !7, line: 4, type: !9, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 22.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "bit-int.c", directory: "/") +!4 = !{!0, !5} +!5 = !DIGlobalVariableExpression(var: !6, expr: !DIExpression()) +!6 = distinct !DIGlobalVariable(name: "b", scope: !2, file: !7, line: 5, 
type: !8, isLocal: false, isDefinition: true) +!7 = !DIFile(filename: "bit-int.c", directory: "/") +!8 = !DIBasicType(name: "_BitInt", size: 8, dataSize: 2, encoding: DW_ATE_signed) +!9 = !DIBasicType(name: "unsigned _BitInt", size: 32, dataSize: 17, encoding: DW_ATE_unsigned) +!10 = !{i32 2, !"Debug Info Version", i32 3} +!11 = !{i32 1, !"wchar_size", i32 4} +!12 = !{!"clang version 22.0.0git"} From a4fa29928e1a4cd493f0ae4841e01b6763bb1c92 Mon Sep 17 00:00:00 2001 From: Tarun Prabhu Date: Wed, 29 Oct 2025 09:26:35 -0600 Subject: [PATCH 115/539] [flang][Driver] Enable -pie and -no-pie in flang's driver Passing -pie to flang will pass the flag on to the linker. Passing -no-pie will ensure that -pie is *not* passed to the linker. This behavior is consistent with both clang and gfortran. Fixes #159970 --- clang/include/clang/Driver/Options.td | 4 +- flang/test/Driver/linker-options.f90 | 106 ++++++++++++++++++++++++++ flang/test/Driver/misc-flags.f90 | 15 ---- 3 files changed, 108 insertions(+), 17 deletions(-) create mode 100644 flang/test/Driver/linker-options.f90 delete mode 100644 flang/test/Driver/misc-flags.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 8784c9d7d206d..cb5cb888c6da7 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5999,7 +5999,6 @@ def nofixprebinding : Flag<["-"], "nofixprebinding">; def nolibc : Flag<["-"], "nolibc">; def nomultidefs : Flag<["-"], "nomultidefs">; def nopie : Flag<["-"], "nopie">, Visibility<[ClangOption, FlangOption]>, Flags<[TargetSpecific]>; // OpenBSD -def no_pie : Flag<["-"], "no-pie">, Visibility<[ClangOption, FlangOption]>; def noprebind : Flag<["-"], "noprebind">; def noprofilelib : Flag<["-"], "noprofilelib">; def noseglinkedit : Flag<["-"], "noseglinkedit">; @@ -6113,7 +6112,6 @@ defm pthread : BoolOption<"", "pthread", PosFlag, NegFlag, BothFlags<[], [ClangOption, CC1Option, FlangOption, FC1Option]>>; -def pie : 
Flag<["-"], "pie">, Group; def static_pie : Flag<["-"], "static-pie">, Group; def read__only__relocs : Separate<["-"], "read_only_relocs">; def remap : Flag<["-"], "remap">; @@ -6508,6 +6506,8 @@ def fpic : Flag<["-"], "fpic">, Group; def fno_pic : Flag<["-"], "fno-pic">, Group; def fpie : Flag<["-"], "fpie">, Group; def fno_pie : Flag<["-"], "fno-pie">, Group; +def pie : Flag<["-"], "pie">, Group; +def no_pie : Flag<["-"], "no-pie">, Group; } // let Vis = [Default, FlangOption] diff --git a/flang/test/Driver/linker-options.f90 b/flang/test/Driver/linker-options.f90 new file mode 100644 index 0000000000000..07f967b4bac5d --- /dev/null +++ b/flang/test/Driver/linker-options.f90 @@ -0,0 +1,106 @@ +! Make sure that `-l` is "visible" to Flang's driver +! RUN: %flang -lpgmath -### %s + +! Make sure that `-Wl` is "visible" to Flang's driver +! RUN: %flang -Wl,abs -### %s + +! Make sure that `-fuse-ld' is "visible" to Flang's driver +! RUN: %flang -fuse-ld= -### %s + +! Make sure that `-L' is "visible" to Flang's driver +! RUN: %flang -L/ -### %s + +! ------------------------------------------------------------------------------ +! Check that '-pie' and '-no-pie' are "visible" to Flang's driver. Check that +! the correct option is added to the link line. +! +! Last match "wins" +! RUN: %flang -target x86_64-pc-linux-gnu -pie -no-pie -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=NO-PIE +! RUN: %flang -target x86_64-pc-linux-gnu -no-pie -pie -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=PIE +! RUN: %flang -target x86_64-pc-linux-gnu -pie -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=PIE +! RUN: %flang -target x86_64-pc-linux-gnu -no-pie -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=NO-PIE +! +! Ensure that "-pie" is passed to the linker. +! RUN: %flang -target i386-unknown-freebsd -pie -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=PIE +! RUN: %flang -target aarch64-pc-linux-gnu -pie -### %s 2>&1 \ +! 
RUN: | FileCheck %s --check-prefix=PIE +! +! On Musl Linux, PIE is enabled by default, but can be disabled. +! RUN: %flang -target x86_64-linux-musl -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=PIE +! RUN: %flang -target i686-linux-musl -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=PIE +! RUN: %flang -target armv6-linux-musleabihf %s -### 2>&1 \ +! RUN: | FileCheck %s --check-prefix=PIE +! RUN: %flang -target armv7-linux-musleabihf %s -### 2>&1 \ +! RUN: | FileCheck %s --check-prefix=PIE +! RUN: %flang --target=x86_64-linux-musl -no-pie -### 2>&1 \ +! RUN: | FileCheck %s --check-prefix=NO-PIE +! +! On OpenBSD, -pie is not passed to the linker, but can be forced. +! RUN: %flang -target amd64-pc-openbsd -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=NO-PIE +! RUN: %flang -target i386-pc-openbsd -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=NO-PIE +! RUN: %flang -target aarch64-unknown-openbsd -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=NO-PIE +! RUN: %flang -target arm-unknown-openbsd -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=NO-PIE +! RUN: %flang -target powerpc-unknown-openbsd -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=NO-PIE +! RUN: %flang -target sparc64-unknown-openbsd -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=NO-PIE +! RUN: %flang -target i386-pc-openbsd -pie -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=PIE +! +! On FreeBSD, -pie is not passed to the linker, but can be forced. +! RUN: %flang -target amd64-pc-freebsd -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=NO-PIE +! RUN: %flang -target i386-pc-freebsd -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=NO-PIE +! RUN: %flang -target aarch64-unknown-freebsd -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=NO-PIE +! RUN: %flang -target arm-unknown-freebsd -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=NO-PIE +! RUN: %flang -target powerpc-unknown-freebsd -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=NO-PIE +! 
RUN: %flang -target sparc64-unknown-freebsd -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=NO-PIE +! RUN: %flang -target i386-pc-freebsd -pie -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=PIE +! +! On AIX, -pie is never passed to the linker. +! RUN: %flang -target powerpc64-unknown-aix -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=NO-PIE +! RUN: %flang -target powerpc64-unknown-aix -pie -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=NO-PIE,UNUSED +! RUN: %flang -target powerpc64-unknown-aix -no-pie -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=NO-PIE,UNUSED +! +! On MinGW and Windows, -pie may be specified, but it is ignored. +! RUN: %flang -target aarch64-pc-windows-gnu -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=NO-PIE +! RUN: %flang -target x86_64-pc-windows-gnu -pie -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=NO-PIE,UNUSED +! RUN: %flang -target i686-pc-windows-gnu -no-pie -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=NO-PIE,UNUSED +! RUN: %flang -target aarch64-windows-msvc -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=NO-PIE +! RUN: %flang -target aarch64-windows-msvc -pie -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=NO-PIE,UNUSED +! RUN: %flang -target aarch64-windows-msvc -no-pie -### %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=NO-PIE,UNUSED +! +! PIE: "-pie" +! NO-PIE-NOT: "-pie" +! UNUSED: warning: argument unused during compilation: '{{(-no)?}}-pie' +! ------------------------------------------------------------------------------ + +program hello + write(*,*), "Hello world!" +end program hello diff --git a/flang/test/Driver/misc-flags.f90 b/flang/test/Driver/misc-flags.f90 deleted file mode 100644 index 61d763c5b64dd..0000000000000 --- a/flang/test/Driver/misc-flags.f90 +++ /dev/null @@ -1,15 +0,0 @@ -! Make sure that `-l` is "visible" to Flang's driver -! RUN: %flang -lpgmath -### %s - -! Make sure that `-Wl` is "visible" to Flang's driver -! 
RUN: %flang -Wl,abs -### %s - -! Make sure that `-fuse-ld' is "visible" to Flang's driver -! RUN: %flang -fuse-ld= -### %s - -! Make sure that `-L' is "visible" to Flang's driver -! RUN: %flang -L/ -### %s - -program hello - write(*,*), "Hello world!" -end program hello From 98c0359aac61d1e81a9c6937f564f52a4f6778fe Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Wed, 29 Oct 2025 15:43:47 +0000 Subject: [PATCH 116/539] [DebugInfo] Propagate DebugLoc from switch in simplifySwitchOfPowersOfTwo (#165335) A recent commit 00f5a1e30b modified simplifySwitchOfPowersOfTwo to generate a branch to handle the non-power-of-2 case when appropriate, but does not set a DebugLoc on the new branch instruction; this patch propagates the switch's DebugLoc to the new branch, as for the other instructions generated in the same block. Found using #107279. --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 4 +- .../X86/debugloc-switch-powers-of-two.ll | 81 +++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SimplifyCFG/X86/debugloc-switch-powers-of-two.ll diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 4fac5d36ddb3f..90423d30aadb2 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -7623,7 +7623,9 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder, auto *DefaultCaseBB = SI->getDefaultDest(); BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU); auto It = OrigBB->getTerminator()->getIterator(); - BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It); + auto *BI = BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It); + // BI is handling the default case for SI, and so should share its DebugLoc. 
+ BI->setDebugLoc(SI->getDebugLoc()); It->eraseFromParent(); addPredecessorToBlock(DefaultCaseBB, OrigBB, SplitBB); diff --git a/llvm/test/Transforms/SimplifyCFG/X86/debugloc-switch-powers-of-two.ll b/llvm/test/Transforms/SimplifyCFG/X86/debugloc-switch-powers-of-two.ll new file mode 100644 index 0000000000000..a276067530669 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/X86/debugloc-switch-powers-of-two.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool ./build/bin/opt --version 6 +; RUN: opt -passes='simplifycfg' -simplifycfg-require-and-preserve-domtree=1 -S < %s | FileCheck %s +;; As we replace the switch statement with a set of instructions that may more +;; efficiently perform the conditional check, the DILocation of the switch +;; should be propagated to all of its replacing instructions. + +target triple = "x86_64-unknown-linux-gnu" + +define i32 @switch_of_powers_two_default_reachable(i32 %arg) !dbg !5 { +; CHECK-LABEL: define i32 @switch_of_powers_two_default_reachable( +; CHECK-SAME: i32 [[ARG:%.*]]) !dbg [[DBG5:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[ARG]]), !dbg [[DBG8:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 1, !dbg [[DBG8]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[ENTRY_SPLIT:.*]], label %[[RETURN:.*]], !dbg [[DBG8]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[ARG]], i1 true), !dbg [[DBG8]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 7, !dbg [[DBG8]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[SWITCH_LOOKUP:.*]], label %[[RETURN]], !dbg [[DBG8]] +; CHECK: [[SWITCH_LOOKUP]]: +; CHECK-NEXT: [[TMP4:%.*]] = zext nneg i32 [[TMP2]] to i64, !dbg [[DBG8]] +; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [7 x i32], ptr @switch.table.switch_of_powers_two_default_reachable, i64 0, i64 [[TMP4]], !dbg [[DBG8]] +; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i32, ptr 
[[SWITCH_GEP]], align 4, !dbg [[DBG8]] +; CHECK-NEXT: br label %[[RETURN]], !dbg [[DBG8]] +; CHECK: [[RETURN]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ 5, %[[ENTRY_SPLIT]] ], [ [[SWITCH_LOAD]], %[[SWITCH_LOOKUP]] ] +; CHECK-NEXT: ret i32 [[PHI]] +; +entry: + switch i32 %arg, label %default_case [ + i32 1, label %bb1 + i32 8, label %bb2 + i32 16, label %bb3 + i32 32, label %bb4 + i32 64, label %bb5 + ], !dbg !8 + +default_case: ; preds = %entry + br label %return + +bb1: ; preds = %entry + br label %return + +bb2: ; preds = %entry + br label %return + +bb3: ; preds = %entry + br label %return + +bb4: ; preds = %entry + br label %return + +bb5: ; preds = %entry + br label %return + +return: ; preds = %bb5, %bb4, %bb3, %bb2, %bb1, %default_case + %phi = phi i32 [ 3, %bb1 ], [ 2, %bb2 ], [ 1, %bb3 ], [ 0, %bb4 ], [ 42, %bb5 ], [ 5, %default_case ] + ret i32 %phi +} + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!2, !3} +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "debugloc-switch-powers-of-two.ll", directory: "/") +!2 = !{i32 9} +!3 = !{i32 1} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "switch_of_powers_two_default_reachable", linkageName: "switch_of_powers_two_default_reachable", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !7) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !DILocation(line: 1, column: 1, scope: !5) +;. 
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +; CHECK: [[META1]] = !DIFile(filename: "{{.*}}debugloc-switch-powers-of-two.ll", directory: {{.*}}) +; CHECK: [[DBG5]] = distinct !DISubprogram(name: "switch_of_powers_two_default_reachable", linkageName: "switch_of_powers_two_default_reachable", scope: null, file: [[META1]], line: 1, type: [[META6:![0-9]+]], scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META7:![0-9]+]]) +; CHECK: [[META6]] = !DISubroutineType(types: [[META7]]) +; CHECK: [[META7]] = !{} +; CHECK: [[DBG8]] = !DILocation(line: 1, column: 1, scope: [[DBG5]]) +;. From 2c4428307758107dcaab14b2ff70319c0c9bb89e Mon Sep 17 00:00:00 2001 From: nerix Date: Wed, 29 Oct 2025 16:51:38 +0100 Subject: [PATCH 117/539] [LLDB] Use native PDB reader by default (#165363) All PDB tests now pass when compiled without DIA on Windows, so they pass with the native reader. With this PR, the default reader changes to the native reader. The plan is to eventually remove the DIA reader (see https://discourse.llvm.org/t/rfc-removing-the-dia-pdb-plugin-from-lldb/87827 and #114906). For now, DIA can be used by setting `plugin.symbol-file.pdb.reader` to `dia` or by setting `LLDB_USE_NATIVE_PDB_READER=0` (mostly undocumented, but used in tests). 
--- .../Plugins/SymbolFile/PDB/SymbolFilePDB.cpp | 18 ++++++------------ .../SymbolFile/NativePDB/native-setting.cpp | 6 +++--- .../Shell/SymbolFile/PDB/native-setting.cpp | 8 ++++---- llvm/docs/ReleaseNotes.md | 4 ++++ 4 files changed, 17 insertions(+), 19 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp index 3b936c06b1072..0ccb1804bb13a 100644 --- a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp +++ b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp @@ -83,8 +83,8 @@ constexpr OptionEnumValueElement g_pdb_reader_enums[] = { { ePDBReaderDefault, "default", - "Use DIA PDB reader unless LLDB_USE_NATIVE_PDB_READER environment " - "variable is set", + "Use native PDB reader unless LLDB_USE_NATIVE_PDB_READER environment " + "is set to 0", }, { ePDBReaderDIA, @@ -109,16 +109,10 @@ enum { static const bool g_should_use_native_reader_by_default = [] { llvm::StringRef env_value = ::getenv("LLDB_USE_NATIVE_PDB_READER"); -#if !LLVM_ENABLE_DIA_SDK || !defined(_WIN32) - // if the environment value is unset, the native reader is requested - if (env_value.empty()) - return true; -#endif - - return env_value.equals_insensitive("on") || - env_value.equals_insensitive("yes") || - env_value.equals_insensitive("1") || - env_value.equals_insensitive("true"); + return !env_value.equals_insensitive("off") && + !env_value.equals_insensitive("no") && + !env_value.equals_insensitive("0") && + !env_value.equals_insensitive("false"); }(); class PluginProperties : public Properties { diff --git a/lldb/test/Shell/SymbolFile/NativePDB/native-setting.cpp b/lldb/test/Shell/SymbolFile/NativePDB/native-setting.cpp index dc26ec8d30cb4..91f451fd0dadc 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/native-setting.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/native-setting.cpp @@ -8,9 +8,9 @@ // RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck 
--check-prefix=ENV0 %s // RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=foo %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=42 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=-1 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=foo %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=42 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=-1 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s // RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb \ // RUN: -o 'settings set plugin.symbol-file.pdb.reader dia' \ diff --git a/lldb/test/Shell/SymbolFile/PDB/native-setting.cpp b/lldb/test/Shell/SymbolFile/PDB/native-setting.cpp index f5e54592b0b31..54b7f28a71259 100644 --- a/lldb/test/Shell/SymbolFile/PDB/native-setting.cpp +++ b/lldb/test/Shell/SymbolFile/PDB/native-setting.cpp @@ -8,9 +8,9 @@ // RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s // RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=foo %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=42 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=-1 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s +// RUN: env 
LLDB_USE_NATIVE_PDB_READER=foo %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=42 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=-1 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s // RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb \ // RUN: -o 'settings set plugin.symbol-file.pdb.reader dia' \ @@ -36,7 +36,7 @@ // NO-ENV-NOT: warning: // NO-ENV: (lldb) target modules dump symfile // NO-ENV: Dumping debug symbols for 1 modules. -// NO-ENV: SymbolFile pdb +// NO-ENV: SymbolFile native-pdb // ENV0-NOT: warning: // ENV0: (lldb) target modules dump symfile diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 36383b12788f9..49158fb4217b6 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -191,6 +191,10 @@ Changes to LLDB * The `show-progress` setting, which became a NOOP with the introduction of the statusline, now defaults to off and controls using OSC escape codes to show a native progress bar in supporting terminals like Ghostty and ConEmu. +* The default PDB reader on Windows was changed from DIA to native, which uses + LLVM's PDB and CodeView support. You can switch back to the DIA reader with + `settings set plugin.symbol-file.pdb.reader dia`. Note that support for the + DIA reader will be removed in a future version of LLDB. Changes to BOLT --------------------------------- From afe36d7666fb279651e929f00844888113ab0583 Mon Sep 17 00:00:00 2001 From: Hongyu Chen Date: Thu, 30 Oct 2025 00:34:48 +0800 Subject: [PATCH 118/539] [DFAJumpThreading] Add MaxOuterUseBlocks threshold (#163428) For every threadable path `B1 -> B2 -> ... -> Bn`, we need to insert phi nodes into every unduplicated successor of `Bi` if there are outer uses of duplicated definitions in `B_i`. 
To prevent the booming of phi nodes, this patch adds a threshold for the maximum number of unduplicated successors that may contain outer uses. This threshold makes sense especially when multi-target branches like switch/indirectbr/callbr are duplicated. Note that the O3 statistics in llvm-test-suite are not influenced. --- .../Transforms/Scalar/DFAJumpThreading.cpp | 38 +- .../DFAJumpThreading/max-outer-uses.ll | 326 ++++++++++++++++++ 2 files changed, 359 insertions(+), 5 deletions(-) create mode 100644 llvm/test/Transforms/DFAJumpThreading/max-outer-uses.ll diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 66e45ecbde7df..e84ca819b93d8 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -122,16 +122,22 @@ static cl::opt cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50)); -extern cl::opt ProfcheckDisableMetadataFixes; - -} // namespace llvm - static cl::opt MaxClonedRate( "dfa-max-cloned-rate", cl::desc( "Maximum cloned instructions rate accepted for the transformation"), cl::Hidden, cl::init(7.5)); +static cl::opt + MaxOuterUseBlocks("dfa-max-out-use-blocks", + cl::desc("Maximum unduplicated blocks with outer uses " + "accepted for the transformation"), + cl::Hidden, cl::init(40)); + +extern cl::opt ProfcheckDisableMetadataFixes; + +} // namespace llvm + namespace { class SelectInstToUnfold { SelectInst *SI; @@ -965,8 +971,16 @@ struct TransformDFA { // SLPVectorizer. // TODO: Thread the switch partially before reaching the threshold. uint64_t NumOrigInst = 0; - for (auto *BB : DuplicateMap.keys()) + uint64_t NumOuterUseBlock = 0; + for (auto *BB : DuplicateMap.keys()) { NumOrigInst += BB->sizeWithoutDebug(); + // Only unduplicated blocks with single predecessor require new phi + // nodes. 
+ for (auto *Succ : successors(BB)) + if (!DuplicateMap.count(Succ) && Succ->getSinglePredecessor()) + NumOuterUseBlock++; + } + if (double(NumClonedInst) / double(NumOrigInst) > MaxClonedRate) { LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too much " "instructions wll be cloned\n"); @@ -977,6 +991,20 @@ struct TransformDFA { return false; } + // Too much unduplicated blocks with outer uses may cause too much + // insertions of phi nodes for duplicated definitions. TODO: Drop this + // threshold if we come up with another way to reduce the number of inserted + // phi nodes. + if (NumOuterUseBlock > MaxOuterUseBlocks) { + LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too much " + "blocks with outer uses\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable", Switch) + << "Too much blocks with outer uses."; + }); + return false; + } + InstructionCost DuplicationCost = 0; unsigned JumpTableSize = 0; diff --git a/llvm/test/Transforms/DFAJumpThreading/max-outer-uses.ll b/llvm/test/Transforms/DFAJumpThreading/max-outer-uses.ll new file mode 100644 index 0000000000000..dfcc5b1a5c3fe --- /dev/null +++ b/llvm/test/Transforms/DFAJumpThreading/max-outer-uses.ll @@ -0,0 +1,326 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=dfa-jump-threading -dfa-max-out-use-blocks=5 %s | FileCheck %s + +declare void @use(i32) + +define void @max_outer_uses_by_switch(i32 %cond, ptr %p) { +; CHECK-LABEL: define void @max_outer_uses_by_switch( +; CHECK-SAME: i32 [[COND:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[SWITCH_BB:.*]] +; CHECK: [[SWITCH_BB]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[DETERMINE:%.*]], %[[SUB_SWITCH_BB:.*]] ], [ 2, %[[CASE2:.*]] ] +; CHECK-NEXT: switch i32 [[PHI]], label %[[DEFAULT_DEST:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE1:.*]] +; CHECK-NEXT: i32 1, label %[[CASE2]] +; 
CHECK-NEXT: i32 2, label %[[CASE3:.*]] +; CHECK-NEXT: ] +; CHECK: [[CASE1]]: +; CHECK-NEXT: br label %[[SUB_SWITCH_BB]] +; CHECK: [[CASE3]]: +; CHECK-NEXT: br label %[[SUB_SWITCH_BB]] +; CHECK: [[SUB_SWITCH_BB]]: +; CHECK-NEXT: [[DETERMINE]] = phi i32 [ 1, %[[CASE1]] ], [ 3, %[[CASE3]] ] +; CHECK-NEXT: [[DEF:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: switch i32 [[COND]], label %[[SWITCH_BB]] [ +; CHECK-NEXT: i32 0, label %[[OUTER1:.*]] +; CHECK-NEXT: i32 1, label %[[OUTER2:.*]] +; CHECK-NEXT: i32 2, label %[[OUTER3:.*]] +; CHECK-NEXT: i32 3, label %[[OUTER4:.*]] +; CHECK-NEXT: ] +; CHECK: [[CASE2]]: +; CHECK-NEXT: br label %[[SWITCH_BB]] +; CHECK: [[OUTER1]]: +; CHECK-NEXT: call void @use(i32 [[DEF]]) +; CHECK-NEXT: ret void +; CHECK: [[OUTER2]]: +; CHECK-NEXT: call void @use(i32 [[DEF]]) +; CHECK-NEXT: ret void +; CHECK: [[OUTER3]]: +; CHECK-NEXT: call void @use(i32 [[DEF]]) +; CHECK-NEXT: ret void +; CHECK: [[OUTER4]]: +; CHECK-NEXT: call void @use(i32 [[DEF]]) +; CHECK-NEXT: ret void +; CHECK: [[DEFAULT_DEST]]: +; CHECK-NEXT: ret void +; +entry: + br label %switch_bb + +switch_bb: + %phi = phi i32 [ 0, %entry ], [ %determine, %sub_switch_bb ], [ 2, %case2 ] + switch i32 %phi, label %default_dest [ + i32 0, label %case1 + i32 1, label %case2 + i32 2, label %case3 + ] + +case1: + br label %sub_switch_bb + +case3: + br label %sub_switch_bb + +sub_switch_bb: + %determine = phi i32 [ 1, %case1 ], [ 3, %case3 ] + %def = load i32, ptr %p + switch i32 %cond, label %switch_bb [ + i32 0, label %outer1 + i32 1, label %outer2 + i32 2, label %outer3 + i32 3, label %outer4 + ] + +case2: + br label %switch_bb + +outer1: + call void @use(i32 %def) + ret void + +outer2: + call void @use(i32 %def) + ret void + +outer3: + call void @use(i32 %def) + ret void + +outer4: + call void @use(i32 %def) + ret void + +default_dest: + ret void +} + +define void @less_outer_uses_by_switch(i32 %cond, ptr %p) { +; CHECK-LABEL: define void @less_outer_uses_by_switch( +; CHECK-SAME: i32 
[[COND:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[SWITCH_BB:.*]] +; CHECK: [[SWITCH_BB]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ poison, %[[SUB_SWITCH_BB:.*]] ] +; CHECK-NEXT: switch i32 [[PHI]], label %[[DEFAULT_DEST:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE1:.*]] +; CHECK-NEXT: i32 1, label %[[CASE2:.*]] +; CHECK-NEXT: i32 2, label %[[CASE3:.*]] +; CHECK-NEXT: ] +; CHECK: [[SWITCH_BB_JT2:.*]]: +; CHECK-NEXT: [[PHI_JT2:%.*]] = phi i32 [ 2, %[[CASE2]] ] +; CHECK-NEXT: br label %[[CASE3]] +; CHECK: [[SWITCH_BB_JT3:.*]]: +; CHECK-NEXT: [[PHI_JT3:%.*]] = phi i32 [ [[DETERMINE_JT3:%.*]], %[[SUB_SWITCH_BB_JT3:.*]] ] +; CHECK-NEXT: br label %[[DEFAULT_DEST]] +; CHECK: [[SWITCH_BB_JT1:.*]]: +; CHECK-NEXT: [[PHI_JT1:%.*]] = phi i32 [ [[DETERMINE_JT1:%.*]], %[[SUB_SWITCH_BB_JT1:.*]] ] +; CHECK-NEXT: br label %[[CASE2]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: br label %[[SUB_SWITCH_BB_JT1]] +; CHECK: [[CASE3]]: +; CHECK-NEXT: br label %[[SUB_SWITCH_BB_JT3]] +; CHECK: [[SUB_SWITCH_BB]]: +; CHECK-NEXT: [[DEF:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: switch i32 [[COND]], label %[[SWITCH_BB]] [ +; CHECK-NEXT: i32 0, label %[[OUTER1:.*]] +; CHECK-NEXT: ] +; CHECK: [[SUB_SWITCH_BB_JT3]]: +; CHECK-NEXT: [[DETERMINE_JT3]] = phi i32 [ 3, %[[CASE3]] ] +; CHECK-NEXT: [[DEF_JT3:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: switch i32 [[COND]], label %[[SWITCH_BB_JT3]] [ +; CHECK-NEXT: i32 0, label %[[OUTER1]] +; CHECK-NEXT: ] +; CHECK: [[SUB_SWITCH_BB_JT1]]: +; CHECK-NEXT: [[DETERMINE_JT1]] = phi i32 [ 1, %[[CASE1]] ] +; CHECK-NEXT: [[DEF_JT1:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: switch i32 [[COND]], label %[[SWITCH_BB_JT1]] [ +; CHECK-NEXT: i32 0, label %[[OUTER1]] +; CHECK-NEXT: ] +; CHECK: [[CASE2]]: +; CHECK-NEXT: br label %[[SWITCH_BB_JT2]] +; CHECK: [[OUTER1]]: +; CHECK-NEXT: [[DEF1:%.*]] = phi i32 [ [[DEF_JT3]], %[[SUB_SWITCH_BB_JT3]] ], [ [[DEF_JT1]], %[[SUB_SWITCH_BB_JT1]] ], [ [[DEF]], 
%[[SUB_SWITCH_BB]] ] +; CHECK-NEXT: call void @use(i32 [[DEF1]]) +; CHECK-NEXT: ret void +; CHECK: [[DEFAULT_DEST]]: +; CHECK-NEXT: ret void +; +entry: + br label %switch_bb + +switch_bb: + %phi = phi i32 [ 0, %entry ], [ %determine, %sub_switch_bb ], [ 2, %case2 ] + switch i32 %phi, label %default_dest [ + i32 0, label %case1 + i32 1, label %case2 + i32 2, label %case3 + ] + +case1: + br label %sub_switch_bb + +case3: + br label %sub_switch_bb + +sub_switch_bb: + %determine = phi i32 [ 1, %case1 ], [ 3, %case3 ] + %def = load i32, ptr %p + switch i32 %cond, label %switch_bb [ + i32 0, label %outer1 + ] + +case2: + br label %switch_bb + +outer1: + call void @use(i32 %def) + ret void + +default_dest: + ret void +} + + +define void @max_outer_uses_multi_preds(i32 %cond, ptr %p) { +; CHECK-LABEL: define void @max_outer_uses_multi_preds( +; CHECK-SAME: i32 [[COND:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[SWITCH_BB:.*]] +; CHECK: [[SWITCH_BB]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ poison, %[[SUB_SWITCH_BB:.*]] ] +; CHECK-NEXT: switch i32 [[PHI]], label %[[DEFAULT_DEST:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE1:.*]] +; CHECK-NEXT: i32 1, label %[[CASE2:.*]] +; CHECK-NEXT: i32 2, label %[[CASE3:.*]] +; CHECK-NEXT: i32 3, label %[[CASE4:.*]] +; CHECK-NEXT: ] +; CHECK: [[SWITCH_BB_JT2:.*]]: +; CHECK-NEXT: [[PHI_JT2:%.*]] = phi i32 [ 2, %[[CASE2]] ] +; CHECK-NEXT: br label %[[CASE3]] +; CHECK: [[SWITCH_BB_JT3:.*]]: +; CHECK-NEXT: [[PHI_JT3:%.*]] = phi i32 [ [[DETERMINE_JT3:%.*]], %[[SUB_SWITCH_BB_JT3:.*]] ] +; CHECK-NEXT: br label %[[CASE4]] +; CHECK: [[SWITCH_BB_JT1:.*]]: +; CHECK-NEXT: [[PHI_JT1:%.*]] = phi i32 [ [[DETERMINE_JT1:%.*]], %[[SUB_SWITCH_BB_JT1:.*]] ] +; CHECK-NEXT: br label %[[CASE2]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: br label %[[SUB_SWITCH_BB_JT1]] +; CHECK: [[CASE3]]: +; CHECK-NEXT: br label %[[SUB_SWITCH_BB_JT3]] +; CHECK: [[SUB_SWITCH_BB]]: +; CHECK-NEXT: [[DEF:%.*]] = load i32, ptr [[P]], align 
4 +; CHECK-NEXT: switch i32 [[COND]], label %[[SWITCH_BB]] [ +; CHECK-NEXT: i32 0, label %[[OUTER1:.*]] +; CHECK-NEXT: i32 1, label %[[OUTER2:.*]] +; CHECK-NEXT: i32 2, label %[[OUTER3:.*]] +; CHECK-NEXT: i32 3, label %[[OUTER4:.*]] +; CHECK-NEXT: ] +; CHECK: [[SUB_SWITCH_BB_JT3]]: +; CHECK-NEXT: [[DETERMINE_JT3]] = phi i32 [ 3, %[[CASE3]] ] +; CHECK-NEXT: [[DEF_JT3:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: switch i32 [[COND]], label %[[SWITCH_BB_JT3]] [ +; CHECK-NEXT: i32 0, label %[[OUTER1]] +; CHECK-NEXT: i32 1, label %[[OUTER2]] +; CHECK-NEXT: i32 2, label %[[OUTER3]] +; CHECK-NEXT: i32 3, label %[[OUTER4]] +; CHECK-NEXT: ] +; CHECK: [[SUB_SWITCH_BB_JT1]]: +; CHECK-NEXT: [[DETERMINE_JT1]] = phi i32 [ 1, %[[CASE1]] ] +; CHECK-NEXT: [[DEF_JT1:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: switch i32 [[COND]], label %[[SWITCH_BB_JT1]] [ +; CHECK-NEXT: i32 0, label %[[OUTER1]] +; CHECK-NEXT: i32 1, label %[[OUTER2]] +; CHECK-NEXT: i32 2, label %[[OUTER3]] +; CHECK-NEXT: i32 3, label %[[OUTER4]] +; CHECK-NEXT: ] +; CHECK: [[CASE4]]: +; CHECK-NEXT: [[DEF1:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: switch i32 [[COND]], label %[[OUTER4]] [ +; CHECK-NEXT: i32 0, label %[[OUTER1]] +; CHECK-NEXT: i32 1, label %[[OUTER2]] +; CHECK-NEXT: i32 2, label %[[OUTER3]] +; CHECK-NEXT: ] +; CHECK: [[CASE2]]: +; CHECK-NEXT: br label %[[SWITCH_BB_JT2]] +; CHECK: [[OUTER1]]: +; CHECK-NEXT: [[PHI1:%.*]] = phi i32 [ [[DEF]], %[[SUB_SWITCH_BB]] ], [ [[DEF1]], %[[CASE4]] ], [ [[DEF_JT1]], %[[SUB_SWITCH_BB_JT1]] ], [ [[DEF_JT3]], %[[SUB_SWITCH_BB_JT3]] ] +; CHECK-NEXT: call void @use(i32 [[PHI1]]) +; CHECK-NEXT: ret void +; CHECK: [[OUTER2]]: +; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ [[DEF]], %[[SUB_SWITCH_BB]] ], [ [[DEF1]], %[[CASE4]] ], [ [[DEF_JT1]], %[[SUB_SWITCH_BB_JT1]] ], [ [[DEF_JT3]], %[[SUB_SWITCH_BB_JT3]] ] +; CHECK-NEXT: call void @use(i32 [[PHI2]]) +; CHECK-NEXT: ret void +; CHECK: [[OUTER3]]: +; CHECK-NEXT: [[PHI3:%.*]] = phi i32 [ [[DEF]], 
%[[SUB_SWITCH_BB]] ], [ [[DEF1]], %[[CASE4]] ], [ [[DEF_JT1]], %[[SUB_SWITCH_BB_JT1]] ], [ [[DEF_JT3]], %[[SUB_SWITCH_BB_JT3]] ] +; CHECK-NEXT: call void @use(i32 [[PHI3]]) +; CHECK-NEXT: ret void +; CHECK: [[OUTER4]]: +; CHECK-NEXT: [[PHI4:%.*]] = phi i32 [ [[DEF]], %[[SUB_SWITCH_BB]] ], [ [[DEF1]], %[[CASE4]] ], [ [[DEF_JT1]], %[[SUB_SWITCH_BB_JT1]] ], [ [[DEF_JT3]], %[[SUB_SWITCH_BB_JT3]] ] +; CHECK-NEXT: call void @use(i32 [[PHI4]]) +; CHECK-NEXT: ret void +; CHECK: [[DEFAULT_DEST]]: +; CHECK-NEXT: ret void +; +entry: + br label %switch_bb + +switch_bb: + %phi = phi i32 [ 0, %entry ], [ %determine, %sub_switch_bb ], [ 2, %case2 ] + switch i32 %phi, label %default_dest [ + i32 0, label %case1 + i32 1, label %case2 + i32 2, label %case3 + i32 3, label %case4 + ] + +case1: + br label %sub_switch_bb + +case3: + br label %sub_switch_bb + +sub_switch_bb: + %determine = phi i32 [ 1, %case1 ], [ 3, %case3 ] + %def = load i32, ptr %p + switch i32 %cond, label %switch_bb [ + i32 0, label %outer1 + i32 1, label %outer2 + i32 2, label %outer3 + i32 3, label %outer4 + ] + +case4: + %def1 = load i32, ptr %p + switch i32 %cond, label %outer4 [ + i32 0, label %outer1 + i32 1, label %outer2 + i32 2, label %outer3 + ] + +case2: + br label %switch_bb + +outer1: + %phi1 = phi i32 [ %def, %sub_switch_bb ], [ %def1, %case4 ] + call void @use(i32 %phi1) + ret void + +outer2: + %phi2 = phi i32 [ %def, %sub_switch_bb ], [ %def1, %case4 ] + call void @use(i32 %phi2) + ret void + +outer3: + %phi3 = phi i32 [ %def, %sub_switch_bb ], [ %def1, %case4 ] + call void @use(i32 %phi3) + ret void + +outer4: + %phi4 = phi i32 [ %def, %sub_switch_bb ], [ %def1, %case4 ] + call void @use(i32 %phi4) + ret void + +default_dest: + ret void +} From f17ef6fa3239ba2e8edbe36f8bac61f5159cb7ba Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Wed, 29 Oct 2025 09:46:01 -0700 Subject: [PATCH 119/539] [DAGCombiner] Lower dynamic insertelt chain more efficiently (#162368) For an insertelt with a dynamic 
index, the default handling in DAGTypeLegalizer and LegalizeDAG will reserve a stack slot for the vector, lower the insertelt to a store, then load the modified vector back into temporaries. The vector store and load may be legalized into a sequence of smaller operations depending on the target. Let V = the vector size and L = the length of a chain of insertelts with dynamic indices. In the worse case, this chain will lower to O(VL) operations, which can increase code size dramatically. Instead, identify such chains, reserve one stack slot for the vector, and lower all of the insertelts to stores at once. This requires only O(V + L) operations. This change only affects the default lowering behavior. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 87 +++ llvm/test/CodeGen/NVPTX/insertelt-dynamic.ll | 738 ++++++++++++++++++ llvm/test/CodeGen/PowerPC/vec_insert_elt.ll | 58 +- 3 files changed, 848 insertions(+), 35 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/insertelt-dynamic.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index cf221bba1e3a3..1ef5dc2863eb6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -23506,6 +23506,93 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... > if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) return DAG.getSplat(VT, DL, InVal); + + // Extend this type to be byte-addressable + EVT OldVT = VT; + EVT EltVT = VT.getVectorElementType(); + bool IsByteSized = EltVT.isByteSized(); + if (!IsByteSized) { + EltVT = + EltVT.changeTypeToInteger().getRoundIntegerType(*DAG.getContext()); + VT = VT.changeElementType(EltVT); + } + + // Check if this operation will be handled the default way for its type. 
+ auto IsTypeDefaultHandled = [this](EVT VT) { + return TLI.getTypeAction(*DAG.getContext(), VT) == + TargetLowering::TypeSplitVector || + TLI.isOperationExpand(ISD::INSERT_VECTOR_ELT, VT); + }; + + // Check if this operation is illegal and will be handled the default way, + // even after extending the type to be byte-addressable. + if (IsTypeDefaultHandled(OldVT) && IsTypeDefaultHandled(VT)) { + // For each dynamic insertelt, the default way will save the vector to + // the stack, store at an offset, and load the modified vector. This can + // dramatically increase code size if we have a chain of insertelts on a + // large vector: requiring O(V*C) stores/loads where V = length of + // vector and C is length of chain. If each insertelt is only fed into the + // next, the vector is write-only across this chain, and we can just + // save once before the chain and load after in O(V + C) operations. + SmallVector Seq{N}; + unsigned NumDynamic = 1; + while (true) { + SDValue InVec = Seq.back()->getOperand(0); + if (InVec.getOpcode() != ISD::INSERT_VECTOR_ELT) + break; + Seq.push_back(InVec.getNode()); + NumDynamic += !isa(InVec.getOperand(2)); + } + + // It always and only makes sense to lower this sequence when we have more + // than one dynamic insertelt, since we will not have more than V constant + // insertelts, so we will be reducing the total number of stores+loads. + if (NumDynamic > 1) { + // In cases where the vector is illegal it will be broken down into + // parts and stored in parts - we should use the alignment for the + // smallest part. 
+ Align SmallestAlign = DAG.getReducedAlign(VT, /*UseABI=*/false); + SDValue StackPtr = + DAG.CreateStackTemporary(VT.getStoreSize(), SmallestAlign); + auto &MF = DAG.getMachineFunction(); + int FrameIndex = cast(StackPtr.getNode())->getIndex(); + auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex); + + // Save the vector to the stack + SDValue InVec = Seq.back()->getOperand(0); + if (!IsByteSized) + InVec = DAG.getNode(ISD::ANY_EXTEND, DL, VT, InVec); + SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, InVec, StackPtr, + PtrInfo, SmallestAlign); + + // Lower each dynamic insertelt to a store + for (SDNode *N : reverse(Seq)) { + SDValue Elmnt = N->getOperand(1); + SDValue Index = N->getOperand(2); + + // Check if we have to extend the element type + if (!IsByteSized && Elmnt.getValueType().bitsLT(EltVT)) + Elmnt = DAG.getNode(ISD::ANY_EXTEND, DL, EltVT, Elmnt); + + // Store the new element. This may be larger than the vector element + // type, so use a truncating store. + SDValue EltPtr = + TLI.getVectorElementPointer(DAG, StackPtr, VT, Index); + EVT EltVT = Elmnt.getValueType(); + Store = DAG.getTruncStore( + Store, DL, Elmnt, EltPtr, MachinePointerInfo::getUnknownStack(MF), + EltVT, + commonAlignment(SmallestAlign, EltVT.getFixedSizeInBits() / 8)); + } + + // Load the saved vector from the stack + SDValue Load = + DAG.getLoad(VT, DL, Store, StackPtr, PtrInfo, SmallestAlign); + SDValue LoadV = Load.getValue(0); + return IsByteSized ? 
LoadV : DAG.getAnyExtOrTrunc(LoadV, DL, OldVT); + } + } + return SDValue(); } diff --git a/llvm/test/CodeGen/NVPTX/insertelt-dynamic.ll b/llvm/test/CodeGen/NVPTX/insertelt-dynamic.ll new file mode 100644 index 0000000000000..f2ccf3ed65c02 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/insertelt-dynamic.ll @@ -0,0 +1,738 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %} +target triple = "nvptx64-nvidia-cuda" + +; Test dynamic insertelt at the beginning of a chain +define <4 x i32> @dynamic_at_beginning(i32 %idx) { +; CHECK-LABEL: dynamic_at_beginning( +; CHECK: { +; CHECK-NEXT: .local .align 4 .b8 __local_depot0[16]; +; CHECK-NEXT: .reg .b64 %SP; +; CHECK-NEXT: .reg .b64 %SPL; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %SPL, __local_depot0; +; CHECK-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_at_beginning_param_0]; +; CHECK-NEXT: and.b64 %rd2, %rd1, 3; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; +; CHECK-NEXT: add.u64 %rd4, %SP, 0; +; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; +; CHECK-NEXT: st.b32 [%rd5], 10; +; CHECK-NEXT: ld.b32 %r1, [%SP+12]; +; CHECK-NEXT: ld.b32 %r2, [%SP]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r2, 20, 30, %r1}; +; CHECK-NEXT: ret; + %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx + %v1 = insertelement <4 x i32> %v0, i32 20, i32 1 + %v2 = insertelement <4 x i32> %v1, i32 30, i32 2 + ret <4 x i32> %v2 +} + +; Test dynamic insertelt at the end of a chain +define <4 x i32> @dynamic_at_end(i32 %idx) { +; CHECK-LABEL: dynamic_at_end( +; CHECK: { +; CHECK-NEXT: .local .align 4 .b8 __local_depot1[16]; +; CHECK-NEXT: .reg .b64 %SP; +; CHECK-NEXT: .reg .b64 %SPL; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; 
CHECK-NEXT: mov.b64 %SPL, __local_depot1; +; CHECK-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_at_end_param_0]; +; CHECK-NEXT: and.b64 %rd2, %rd1, 3; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; +; CHECK-NEXT: add.u64 %rd4, %SP, 0; +; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; +; CHECK-NEXT: st.b32 [%SP+4], 20; +; CHECK-NEXT: st.b32 [%SP], 10; +; CHECK-NEXT: st.b32 [%rd5], 30; +; CHECK-NEXT: ld.b32 %r1, [%SP+12]; +; CHECK-NEXT: ld.b32 %r2, [%SP+8]; +; CHECK-NEXT: ld.b32 %r3, [%SP+4]; +; CHECK-NEXT: ld.b32 %r4, [%SP]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1}; +; CHECK-NEXT: ret; + %v0 = insertelement <4 x i32> poison, i32 10, i32 0 + %v1 = insertelement <4 x i32> %v0, i32 20, i32 1 + %v2 = insertelement <4 x i32> %v1, i32 30, i32 %idx + ret <4 x i32> %v2 +} + +; Test dynamic insertelt in the middle of a chain +define <4 x i32> @dynamic_in_middle(i32 %idx) { +; CHECK-LABEL: dynamic_in_middle( +; CHECK: { +; CHECK-NEXT: .local .align 4 .b8 __local_depot2[16]; +; CHECK-NEXT: .reg .b64 %SP; +; CHECK-NEXT: .reg .b64 %SPL; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %SPL, __local_depot2; +; CHECK-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_in_middle_param_0]; +; CHECK-NEXT: and.b64 %rd2, %rd1, 3; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; +; CHECK-NEXT: add.u64 %rd4, %SP, 0; +; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; +; CHECK-NEXT: st.b32 [%SP], 10; +; CHECK-NEXT: st.b32 [%rd5], 20; +; CHECK-NEXT: ld.b32 %r1, [%SP+12]; +; CHECK-NEXT: ld.b32 %r2, [%SP+4]; +; CHECK-NEXT: ld.b32 %r3, [%SP]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r3, %r2, 30, %r1}; +; CHECK-NEXT: ret; + %v0 = insertelement <4 x i32> poison, i32 10, i32 0 + %v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx + %v2 = insertelement <4 x i32> %v1, i32 30, i32 2 + ret <4 x i32> %v2 +} + +; Test repeated dynamic insertelt with the same index +define <4 
x i32> @repeated_same_index(i32 %idx) { +; CHECK-LABEL: repeated_same_index( +; CHECK: { +; CHECK-NEXT: .local .align 4 .b8 __local_depot3[16]; +; CHECK-NEXT: .reg .b64 %SP; +; CHECK-NEXT: .reg .b64 %SPL; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %SPL, __local_depot3; +; CHECK-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NEXT: ld.param.b32 %rd1, [repeated_same_index_param_0]; +; CHECK-NEXT: and.b64 %rd2, %rd1, 3; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; +; CHECK-NEXT: add.u64 %rd4, %SP, 0; +; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; +; CHECK-NEXT: st.b32 [%rd5], 20; +; CHECK-NEXT: ld.b32 %r1, [%SP+12]; +; CHECK-NEXT: ld.b32 %r2, [%SP+8]; +; CHECK-NEXT: ld.b32 %r3, [%SP+4]; +; CHECK-NEXT: ld.b32 %r4, [%SP]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1}; +; CHECK-NEXT: ret; + %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx + %v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx + ret <4 x i32> %v1 +} + +; Test multiple dynamic insertelts +define <4 x i32> @multiple_dynamic(i32 %idx0, i32 %idx1) { +; CHECK-LABEL: multiple_dynamic( +; CHECK: { +; CHECK-NEXT: .local .align 4 .b8 __local_depot4[16]; +; CHECK-NEXT: .reg .b64 %SP; +; CHECK-NEXT: .reg .b64 %SPL; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<10>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %SPL, __local_depot4; +; CHECK-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NEXT: ld.param.b32 %rd1, [multiple_dynamic_param_0]; +; CHECK-NEXT: and.b64 %rd2, %rd1, 3; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; +; CHECK-NEXT: add.u64 %rd4, %SP, 0; +; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; +; CHECK-NEXT: st.b32 [%rd5], 10; +; CHECK-NEXT: ld.param.b32 %rd6, [multiple_dynamic_param_1]; +; CHECK-NEXT: and.b64 %rd7, %rd6, 3; +; CHECK-NEXT: shl.b64 %rd8, %rd7, 2; +; CHECK-NEXT: add.s64 %rd9, %rd4, %rd8; +; CHECK-NEXT: st.b32 [%rd9], 20; +; CHECK-NEXT: ld.b32 %r1, [%SP+12]; +; CHECK-NEXT: ld.b32 %r2, 
[%SP+8]; +; CHECK-NEXT: ld.b32 %r3, [%SP+4]; +; CHECK-NEXT: ld.b32 %r4, [%SP]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1}; +; CHECK-NEXT: ret; + %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0 + %v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx1 + ret <4 x i32> %v1 +} + +; Test chain with all dynamic insertelts +define <4 x i32> @all_dynamic(i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) { +; CHECK-LABEL: all_dynamic( +; CHECK: { +; CHECK-NEXT: .local .align 4 .b8 __local_depot5[16]; +; CHECK-NEXT: .reg .b64 %SP; +; CHECK-NEXT: .reg .b64 %SPL; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<18>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %SPL, __local_depot5; +; CHECK-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NEXT: ld.param.b32 %rd1, [all_dynamic_param_0]; +; CHECK-NEXT: and.b64 %rd2, %rd1, 3; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; +; CHECK-NEXT: add.u64 %rd4, %SP, 0; +; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; +; CHECK-NEXT: ld.param.b32 %rd6, [all_dynamic_param_1]; +; CHECK-NEXT: and.b64 %rd7, %rd6, 3; +; CHECK-NEXT: shl.b64 %rd8, %rd7, 2; +; CHECK-NEXT: add.s64 %rd9, %rd4, %rd8; +; CHECK-NEXT: ld.param.b32 %rd10, [all_dynamic_param_2]; +; CHECK-NEXT: and.b64 %rd11, %rd10, 3; +; CHECK-NEXT: shl.b64 %rd12, %rd11, 2; +; CHECK-NEXT: add.s64 %rd13, %rd4, %rd12; +; CHECK-NEXT: st.b32 [%rd5], 10; +; CHECK-NEXT: st.b32 [%rd9], 20; +; CHECK-NEXT: st.b32 [%rd13], 30; +; CHECK-NEXT: ld.param.b32 %rd14, [all_dynamic_param_3]; +; CHECK-NEXT: and.b64 %rd15, %rd14, 3; +; CHECK-NEXT: shl.b64 %rd16, %rd15, 2; +; CHECK-NEXT: add.s64 %rd17, %rd4, %rd16; +; CHECK-NEXT: st.b32 [%rd17], 40; +; CHECK-NEXT: ld.b32 %r1, [%SP+12]; +; CHECK-NEXT: ld.b32 %r2, [%SP+8]; +; CHECK-NEXT: ld.b32 %r3, [%SP+4]; +; CHECK-NEXT: ld.b32 %r4, [%SP]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1}; +; CHECK-NEXT: ret; + %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0 + %v1 = insertelement <4 x i32> %v0, i32 20, i32 
%idx1 + %v2 = insertelement <4 x i32> %v1, i32 30, i32 %idx2 + %v3 = insertelement <4 x i32> %v2, i32 40, i32 %idx3 + ret <4 x i32> %v3 +} + +; Test mixed constant and dynamic insertelts with high ratio of dynamic ones. +; Should lower all insertelts to stores. +define <4 x i32> @mix_dynamic_constant(i32 %idx0, i32 %idx1) { +; CHECK-LABEL: mix_dynamic_constant( +; CHECK: { +; CHECK-NEXT: .local .align 4 .b8 __local_depot6[16]; +; CHECK-NEXT: .reg .b64 %SP; +; CHECK-NEXT: .reg .b64 %SPL; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<10>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %SPL, __local_depot6; +; CHECK-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NEXT: ld.param.b32 %rd1, [mix_dynamic_constant_param_0]; +; CHECK-NEXT: and.b64 %rd2, %rd1, 3; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; +; CHECK-NEXT: add.u64 %rd4, %SP, 0; +; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; +; CHECK-NEXT: st.b32 [%rd5], 10; +; CHECK-NEXT: ld.param.b32 %rd6, [mix_dynamic_constant_param_1]; +; CHECK-NEXT: and.b64 %rd7, %rd6, 3; +; CHECK-NEXT: shl.b64 %rd8, %rd7, 2; +; CHECK-NEXT: add.s64 %rd9, %rd4, %rd8; +; CHECK-NEXT: st.b32 [%SP+4], 20; +; CHECK-NEXT: st.b32 [%rd9], 30; +; CHECK-NEXT: ld.b32 %r1, [%SP+12]; +; CHECK-NEXT: ld.b32 %r2, [%SP+8]; +; CHECK-NEXT: ld.b32 %r3, [%SP+4]; +; CHECK-NEXT: ld.b32 %r4, [%SP]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1}; +; CHECK-NEXT: ret; + %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0 + %v1 = insertelement <4 x i32> %v0, i32 20, i32 1 + %v2 = insertelement <4 x i32> %v1, i32 30, i32 %idx1 + ret <4 x i32> %v2 +} + +; Test two separate chains that don't interfere +define void @two_separate_chains(i32 %idx0, i32 %idx1, ptr %out0, ptr %out1) { +; CHECK-LABEL: two_separate_chains( +; CHECK: { +; CHECK-NEXT: .local .align 4 .b8 __local_depot7[32]; +; CHECK-NEXT: .reg .b64 %SP; +; CHECK-NEXT: .reg .b64 %SPL; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-EMPTY: +; 
CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %SPL, __local_depot7; +; CHECK-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NEXT: ld.param.b32 %rd1, [two_separate_chains_param_0]; +; CHECK-NEXT: and.b64 %rd2, %rd1, 3; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; +; CHECK-NEXT: add.u64 %rd4, %SP, 16; +; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; +; CHECK-NEXT: st.b32 [%rd5], 10; +; CHECK-NEXT: ld.param.b32 %rd6, [two_separate_chains_param_1]; +; CHECK-NEXT: and.b64 %rd7, %rd6, 3; +; CHECK-NEXT: shl.b64 %rd8, %rd7, 2; +; CHECK-NEXT: add.u64 %rd9, %SP, 0; +; CHECK-NEXT: add.s64 %rd10, %rd9, %rd8; +; CHECK-NEXT: ld.b32 %r1, [%SP+28]; +; CHECK-NEXT: ld.b32 %r2, [%SP+24]; +; CHECK-NEXT: ld.b32 %r3, [%SP+16]; +; CHECK-NEXT: ld.param.b64 %rd11, [two_separate_chains_param_2]; +; CHECK-NEXT: st.b32 [%rd10], 30; +; CHECK-NEXT: ld.param.b64 %rd12, [two_separate_chains_param_3]; +; CHECK-NEXT: ld.b32 %r4, [%SP+12]; +; CHECK-NEXT: ld.b32 %r5, [%SP+4]; +; CHECK-NEXT: ld.b32 %r6, [%SP]; +; CHECK-NEXT: st.v4.b32 [%rd11], {%r3, 20, %r2, %r1}; +; CHECK-NEXT: st.v4.b32 [%rd12], {%r6, %r5, 40, %r4}; +; CHECK-NEXT: ret; + ; Chain 1 + %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0 + %v1 = insertelement <4 x i32> %v0, i32 20, i32 1 + + ; Chain 2 + %w0 = insertelement <4 x i32> poison, i32 30, i32 %idx1 + %w1 = insertelement <4 x i32> %w0, i32 40, i32 2 + + store <4 x i32> %v1, ptr %out0 + store <4 x i32> %w1, ptr %out1 + ret void +} + +; Test overlapping chains (chain 2 starts from middle of chain 1) +define void @overlapping_chains(i32 %idx0, i32 %idx1, ptr %out0, ptr %out1) { +; CHECK-LABEL: overlapping_chains( +; CHECK: { +; CHECK-NEXT: .local .align 4 .b8 __local_depot8[32]; +; CHECK-NEXT: .reg .b64 %SP; +; CHECK-NEXT: .reg .b64 %SPL; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<14>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %SPL, __local_depot8; +; CHECK-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NEXT: ld.param.b32 %rd1, [overlapping_chains_param_0]; +; 
CHECK-NEXT: and.b64 %rd2, %rd1, 3; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; +; CHECK-NEXT: add.u64 %rd4, %SP, 16; +; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; +; CHECK-NEXT: st.b32 [%rd5], 10; +; CHECK-NEXT: add.u64 %rd6, %SP, 0; +; CHECK-NEXT: add.s64 %rd7, %rd6, %rd3; +; CHECK-NEXT: ld.b32 %r1, [%SP+28]; +; CHECK-NEXT: ld.b32 %r2, [%SP+16]; +; CHECK-NEXT: ld.param.b64 %rd8, [overlapping_chains_param_2]; +; CHECK-NEXT: st.b32 [%rd7], 10; +; CHECK-NEXT: ld.param.b32 %rd9, [overlapping_chains_param_1]; +; CHECK-NEXT: and.b64 %rd10, %rd9, 3; +; CHECK-NEXT: shl.b64 %rd11, %rd10, 2; +; CHECK-NEXT: add.s64 %rd12, %rd6, %rd11; +; CHECK-NEXT: st.b32 [%SP+4], 20; +; CHECK-NEXT: st.b32 [%rd12], 30; +; CHECK-NEXT: ld.param.b64 %rd13, [overlapping_chains_param_3]; +; CHECK-NEXT: ld.b32 %r3, [%SP+12]; +; CHECK-NEXT: ld.b32 %r4, [%SP+8]; +; CHECK-NEXT: ld.b32 %r5, [%SP+4]; +; CHECK-NEXT: ld.b32 %r6, [%SP]; +; CHECK-NEXT: st.v4.b32 [%rd8], {%r2, 20, 40, %r1}; +; CHECK-NEXT: st.v4.b32 [%rd13], {%r6, %r5, %r4, %r3}; +; CHECK-NEXT: ret; + %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0 + %v1 = insertelement <4 x i32> %v0, i32 20, i32 1 + + ; Chain 2 starts from v1 + %w0 = insertelement <4 x i32> %v1, i32 30, i32 %idx1 + + ; Continue chain 1 + %v2 = insertelement <4 x i32> %v1, i32 40, i32 2 + + store <4 x i32> %v2, ptr %out0 + store <4 x i32> %w0, ptr %out1 + ret void +} + +; Test with i1 elements (1-bit, non-byte-aligned) +define <8 x i1> @dynamic_i1(i32 %idx) { +; CHECK-LABEL: dynamic_i1( +; CHECK: { +; CHECK-NEXT: .local .align 8 .b8 __local_depot9[8]; +; CHECK-NEXT: .reg .b64 %SP; +; CHECK-NEXT: .reg .b64 %SPL; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %SPL, __local_depot9; +; CHECK-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i1_param_0]; +; CHECK-NEXT: and.b64 %rd2, %rd1, 7; +; CHECK-NEXT: add.u64 %rd3, %SP, 0; +; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; +; 
CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2}; +; CHECK-NEXT: st.b8 [%rd4], 1; +; CHECK-NEXT: ld.b32 %r3, [%SP]; +; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x7773U; +; CHECK-NEXT: ld.b32 %r5, [%SP+4]; +; CHECK-NEXT: prmt.b32 %r6, %r5, 0, 0x7771U; +; CHECK-NEXT: prmt.b32 %r7, %r5, 0, 0x7772U; +; CHECK-NEXT: prmt.b32 %r8, %r5, 0, 0x7773U; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %r5; +; CHECK-NEXT: st.param.b8 [func_retval0], %r3; +; CHECK-NEXT: st.param.b8 [func_retval0+7], %r8; +; CHECK-NEXT: st.param.b8 [func_retval0+6], %r7; +; CHECK-NEXT: st.param.b8 [func_retval0+5], %r6; +; CHECK-NEXT: st.param.b8 [func_retval0+3], %r4; +; CHECK-NEXT: st.param.b8 [func_retval0+2], 1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], 0; +; CHECK-NEXT: ret; + %v0 = insertelement <8 x i1> poison, i1 1, i32 %idx + %v1 = insertelement <8 x i1> %v0, i1 0, i32 1 + %v2 = insertelement <8 x i1> %v1, i1 1, i32 2 + ret <8 x i1> %v2 +} + +; Test with i2 elements (2-bit, non-byte-aligned) +define <8 x i2> @dynamic_i2(i32 %idx) { +; CHECK-LABEL: dynamic_i2( +; CHECK: { +; CHECK-NEXT: .local .align 8 .b8 __local_depot10[16]; +; CHECK-NEXT: .reg .b64 %SP; +; CHECK-NEXT: .reg .b64 %SPL; +; CHECK-NEXT: .reg .b16 %rs<24>; +; CHECK-NEXT: .reg .b32 %r<10>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %SPL, __local_depot10; +; CHECK-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i2_param_0]; +; CHECK-NEXT: and.b64 %rd2, %rd1, 7; +; CHECK-NEXT: add.u64 %rd3, %SP, 0; +; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; +; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2}; +; CHECK-NEXT: st.b8 [%rd4], 1; +; CHECK-NEXT: ld.b32 %r3, [%SP+4]; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; +; CHECK-NEXT: and.b16 %rs2, %rs1, 3; +; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x7771U; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: and.b16 %rs4, %rs3, 3; +; CHECK-NEXT: shl.b16 %rs5, %rs4, 2; +; CHECK-NEXT: or.b16 %rs6, %rs2, %rs5; +; CHECK-NEXT: prmt.b32 %r5, %r3, 0, 0x7772U; +; 
CHECK-NEXT: cvt.u16.u32 %rs7, %r5; +; CHECK-NEXT: and.b16 %rs8, %rs7, 3; +; CHECK-NEXT: shl.b16 %rs9, %rs8, 4; +; CHECK-NEXT: or.b16 %rs10, %rs6, %rs9; +; CHECK-NEXT: prmt.b32 %r6, %r3, 0, 0x7773U; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r6; +; CHECK-NEXT: shl.b16 %rs12, %rs11, 6; +; CHECK-NEXT: or.b16 %rs13, %rs10, %rs12; +; CHECK-NEXT: st.b8 [%SP+8], %rs13; +; CHECK-NEXT: ld.b32 %r7, [%SP]; +; CHECK-NEXT: prmt.b32 %r8, %r7, 0, 0x7773U; +; CHECK-NEXT: cvt.u16.u32 %rs14, %r8; +; CHECK-NEXT: shl.b16 %rs15, %rs14, 6; +; CHECK-NEXT: and.b16 %rs16, %rs15, 192; +; CHECK-NEXT: ld.s8 %rs17, [%SP+8]; +; CHECK-NEXT: shl.b16 %rs18, %rs17, 8; +; CHECK-NEXT: or.b16 %rs19, %rs16, %rs18; +; CHECK-NEXT: prmt.b32 %r9, %r7, 0, 0x7770U; +; CHECK-NEXT: st.param.b16 [func_retval0], %r9; +; CHECK-NEXT: st.param.b16 [func_retval0+8], %rs17; +; CHECK-NEXT: shr.s16 %rs20, %rs18, 14; +; CHECK-NEXT: st.param.b16 [func_retval0+14], %rs20; +; CHECK-NEXT: shr.s16 %rs21, %rs18, 12; +; CHECK-NEXT: st.param.b16 [func_retval0+12], %rs21; +; CHECK-NEXT: shr.s16 %rs22, %rs18, 10; +; CHECK-NEXT: st.param.b16 [func_retval0+10], %rs22; +; CHECK-NEXT: shr.s16 %rs23, %rs19, 6; +; CHECK-NEXT: st.param.b16 [func_retval0+6], %rs23; +; CHECK-NEXT: st.param.b16 [func_retval0+4], 3; +; CHECK-NEXT: st.param.b16 [func_retval0+2], 2; +; CHECK-NEXT: ret; + %v0 = insertelement <8 x i2> poison, i2 1, i32 %idx + %v1 = insertelement <8 x i2> %v0, i2 2, i32 1 + %v2 = insertelement <8 x i2> %v1, i2 3, i32 2 + ret <8 x i2> %v2 +} + +; Test with i3 elements (3-bit, non-byte-aligned) +define <8 x i3> @dynamic_i3(i32 %idx) { +; CHECK-LABEL: dynamic_i3( +; CHECK: { +; CHECK-NEXT: .local .align 8 .b8 __local_depot11[8]; +; CHECK-NEXT: .reg .b64 %SP; +; CHECK-NEXT: .reg .b64 %SPL; +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %SPL, __local_depot11; +; CHECK-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NEXT: ld.param.b32 
%rd1, [dynamic_i3_param_0]; +; CHECK-NEXT: and.b64 %rd2, %rd1, 7; +; CHECK-NEXT: add.u64 %rd3, %SP, 0; +; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; +; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2}; +; CHECK-NEXT: st.b8 [%rd4], 1; +; CHECK-NEXT: ld.b32 %r3, [%SP]; +; CHECK-NEXT: ld.b32 %r4, [%SP+4]; +; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U; +; CHECK-NEXT: prmt.b32 %r6, %r4, 0, 0x7772U; +; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 0x5410U; +; CHECK-NEXT: st.param.b32 [func_retval0+12], %r7; +; CHECK-NEXT: prmt.b32 %r8, %r4, 0, 0x7771U; +; CHECK-NEXT: prmt.b32 %r9, %r4, 0, 0x7770U; +; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x5410U; +; CHECK-NEXT: st.param.b32 [func_retval0+8], %r10; +; CHECK-NEXT: prmt.b32 %r11, %r3, 0, 0x7773U; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r11; +; CHECK-NEXT: mov.b16 %rs2, 3; +; CHECK-NEXT: mov.b32 %r12, {%rs2, %rs1}; +; CHECK-NEXT: st.param.b32 [func_retval0+4], %r12; +; CHECK-NEXT: prmt.b32 %r13, %r3, 0, 0x7770U; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r13; +; CHECK-NEXT: mov.b16 %rs4, 2; +; CHECK-NEXT: mov.b32 %r14, {%rs3, %rs4}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r14; +; CHECK-NEXT: ret; + %v0 = insertelement <8 x i3> poison, i3 1, i32 %idx + %v1 = insertelement <8 x i3> %v0, i3 2, i32 1 + %v2 = insertelement <8 x i3> %v1, i3 3, i32 2 + ret <8 x i3> %v2 +} + +; Test with i4 elements (4-bit, non-byte-aligned) +define <8 x i4> @dynamic_i4(i32 %idx) { +; CHECK-LABEL: dynamic_i4( +; CHECK: { +; CHECK-NEXT: .local .align 8 .b8 __local_depot12[16]; +; CHECK-NEXT: .reg .b64 %SP; +; CHECK-NEXT: .reg .b64 %SPL; +; CHECK-NEXT: .reg .b16 %rs<30>; +; CHECK-NEXT: .reg .b32 %r<22>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %SPL, __local_depot12; +; CHECK-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i4_param_0]; +; CHECK-NEXT: and.b64 %rd2, %rd1, 7; +; CHECK-NEXT: add.u64 %rd3, %SP, 0; +; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; +; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2}; +; CHECK-NEXT: 
st.b8 [%rd4], 1; +; CHECK-NEXT: ld.b32 %r3, [%SP]; +; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x7770U; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r4; +; CHECK-NEXT: and.b16 %rs2, %rs1, 15; +; CHECK-NEXT: prmt.b32 %r5, %r3, 0, 0x7771U; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; +; CHECK-NEXT: and.b16 %rs4, %rs3, 15; +; CHECK-NEXT: shl.b16 %rs5, %rs4, 4; +; CHECK-NEXT: or.b16 %rs6, %rs2, %rs5; +; CHECK-NEXT: prmt.b32 %r6, %r3, 0, 0x7772U; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r6; +; CHECK-NEXT: and.b16 %rs8, %rs7, 15; +; CHECK-NEXT: shl.b16 %rs9, %rs8, 8; +; CHECK-NEXT: or.b16 %rs10, %rs6, %rs9; +; CHECK-NEXT: prmt.b32 %r7, %r3, 0, 0x7773U; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r7; +; CHECK-NEXT: shl.b16 %rs12, %rs11, 12; +; CHECK-NEXT: or.b16 %rs13, %rs10, %rs12; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs13; +; CHECK-NEXT: ld.b32 %r9, [%SP+4]; +; CHECK-NEXT: prmt.b32 %r10, %r9, 0, 0x7770U; +; CHECK-NEXT: cvt.u16.u32 %rs14, %r10; +; CHECK-NEXT: and.b16 %rs15, %rs14, 15; +; CHECK-NEXT: prmt.b32 %r11, %r9, 0, 0x7771U; +; CHECK-NEXT: cvt.u16.u32 %rs16, %r11; +; CHECK-NEXT: and.b16 %rs17, %rs16, 15; +; CHECK-NEXT: shl.b16 %rs18, %rs17, 4; +; CHECK-NEXT: or.b16 %rs19, %rs15, %rs18; +; CHECK-NEXT: prmt.b32 %r12, %r9, 0, 0x7772U; +; CHECK-NEXT: cvt.u16.u32 %rs20, %r12; +; CHECK-NEXT: and.b16 %rs21, %rs20, 15; +; CHECK-NEXT: shl.b16 %rs22, %rs21, 8; +; CHECK-NEXT: or.b16 %rs23, %rs19, %rs22; +; CHECK-NEXT: prmt.b32 %r13, %r9, 0, 0x7773U; +; CHECK-NEXT: cvt.u16.u32 %rs24, %r13; +; CHECK-NEXT: shl.b16 %rs25, %rs24, 12; +; CHECK-NEXT: or.b16 %rs26, %rs23, %rs25; +; CHECK-NEXT: cvt.u32.u16 %r14, %rs26; +; CHECK-NEXT: shl.b32 %r15, %r14, 16; +; CHECK-NEXT: or.b32 %r16, %r8, %r15; +; CHECK-NEXT: mov.b32 %r17, {%rs20, %rs24}; +; CHECK-NEXT: st.param.b32 [func_retval0+12], %r17; +; CHECK-NEXT: mov.b32 %r18, {%rs14, %rs16}; +; CHECK-NEXT: st.param.b32 [func_retval0+8], %r18; +; CHECK-NEXT: mov.b16 %rs27, 2; +; CHECK-NEXT: mov.b32 %r19, {%rs1, %rs27}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r19; +; CHECK-NEXT: 
shr.u32 %r20, %r16, 12; +; CHECK-NEXT: cvt.u16.u32 %rs28, %r20; +; CHECK-NEXT: mov.b16 %rs29, 3; +; CHECK-NEXT: mov.b32 %r21, {%rs29, %rs28}; +; CHECK-NEXT: st.param.b32 [func_retval0+4], %r21; +; CHECK-NEXT: ret; + %v0 = insertelement <8 x i4> poison, i4 1, i32 %idx + %v1 = insertelement <8 x i4> %v0, i4 2, i32 1 + %v2 = insertelement <8 x i4> %v1, i4 3, i32 2 + ret <8 x i4> %v2 +} + +; Test with i5 elements (5-bit, non-byte-aligned) +define <8 x i5> @dynamic_i5(i32 %idx) { +; CHECK-LABEL: dynamic_i5( +; CHECK: { +; CHECK-NEXT: .local .align 8 .b8 __local_depot13[8]; +; CHECK-NEXT: .reg .b64 %SP; +; CHECK-NEXT: .reg .b64 %SPL; +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %SPL, __local_depot13; +; CHECK-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i5_param_0]; +; CHECK-NEXT: and.b64 %rd2, %rd1, 7; +; CHECK-NEXT: add.u64 %rd3, %SP, 0; +; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; +; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2}; +; CHECK-NEXT: st.b8 [%rd4], 1; +; CHECK-NEXT: ld.b32 %r3, [%SP]; +; CHECK-NEXT: ld.b32 %r4, [%SP+4]; +; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U; +; CHECK-NEXT: prmt.b32 %r6, %r4, 0, 0x7772U; +; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 0x5410U; +; CHECK-NEXT: prmt.b32 %r8, %r4, 0, 0x7771U; +; CHECK-NEXT: prmt.b32 %r9, %r4, 0, 0x7770U; +; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x5410U; +; CHECK-NEXT: st.param.v2.b32 [func_retval0+8], {%r10, %r7}; +; CHECK-NEXT: prmt.b32 %r11, %r3, 0, 0x7773U; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r11; +; CHECK-NEXT: mov.b16 %rs2, 3; +; CHECK-NEXT: mov.b32 %r12, {%rs2, %rs1}; +; CHECK-NEXT: prmt.b32 %r13, %r3, 0, 0x7770U; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r13; +; CHECK-NEXT: mov.b16 %rs4, 2; +; CHECK-NEXT: mov.b32 %r14, {%rs3, %rs4}; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r12}; +; CHECK-NEXT: ret; + %v0 = insertelement <8 x i5> poison, i5 1, i32 %idx + %v1 = 
insertelement <8 x i5> %v0, i5 2, i32 1 + %v2 = insertelement <8 x i5> %v1, i5 3, i32 2 + ret <8 x i5> %v2 +} + +; Test with i7 elements (7-bit, non-byte-aligned) +define <8 x i7> @dynamic_i7(i32 %idx) { +; CHECK-LABEL: dynamic_i7( +; CHECK: { +; CHECK-NEXT: .local .align 8 .b8 __local_depot14[8]; +; CHECK-NEXT: .reg .b64 %SP; +; CHECK-NEXT: .reg .b64 %SPL; +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %SPL, __local_depot14; +; CHECK-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i7_param_0]; +; CHECK-NEXT: and.b64 %rd2, %rd1, 7; +; CHECK-NEXT: add.u64 %rd3, %SP, 0; +; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; +; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2}; +; CHECK-NEXT: st.b8 [%rd4], 1; +; CHECK-NEXT: ld.b32 %r3, [%SP]; +; CHECK-NEXT: ld.b32 %r4, [%SP+4]; +; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U; +; CHECK-NEXT: prmt.b32 %r6, %r4, 0, 0x7772U; +; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 0x5410U; +; CHECK-NEXT: prmt.b32 %r8, %r4, 0, 0x7771U; +; CHECK-NEXT: prmt.b32 %r9, %r4, 0, 0x7770U; +; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x5410U; +; CHECK-NEXT: st.param.v2.b32 [func_retval0+8], {%r10, %r7}; +; CHECK-NEXT: prmt.b32 %r11, %r3, 0, 0x7773U; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r11; +; CHECK-NEXT: mov.b16 %rs2, 3; +; CHECK-NEXT: mov.b32 %r12, {%rs2, %rs1}; +; CHECK-NEXT: prmt.b32 %r13, %r3, 0, 0x7770U; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r13; +; CHECK-NEXT: mov.b16 %rs4, 2; +; CHECK-NEXT: mov.b32 %r14, {%rs3, %rs4}; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r12}; +; CHECK-NEXT: ret; + %v0 = insertelement <8 x i7> poison, i7 1, i32 %idx + %v1 = insertelement <8 x i7> %v0, i7 2, i32 1 + %v2 = insertelement <8 x i7> %v1, i7 3, i32 2 + ret <8 x i7> %v2 +} + +; Test with i6 elements (6-bit, non-byte-aligned) +define <8 x i6> @dynamic_i6(i32 %idx) { +; CHECK-LABEL: dynamic_i6( +; CHECK: { +; CHECK-NEXT: .local .align 8 .b8 
__local_depot15[8]; +; CHECK-NEXT: .reg .b64 %SP; +; CHECK-NEXT: .reg .b64 %SPL; +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %SPL, __local_depot15; +; CHECK-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i6_param_0]; +; CHECK-NEXT: and.b64 %rd2, %rd1, 7; +; CHECK-NEXT: add.u64 %rd3, %SP, 0; +; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; +; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2}; +; CHECK-NEXT: st.b8 [%rd4], 1; +; CHECK-NEXT: ld.b32 %r3, [%SP]; +; CHECK-NEXT: ld.b32 %r4, [%SP+4]; +; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U; +; CHECK-NEXT: prmt.b32 %r6, %r4, 0, 0x7772U; +; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 0x5410U; +; CHECK-NEXT: prmt.b32 %r8, %r4, 0, 0x7771U; +; CHECK-NEXT: prmt.b32 %r9, %r4, 0, 0x7770U; +; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x5410U; +; CHECK-NEXT: st.param.v2.b32 [func_retval0+8], {%r10, %r7}; +; CHECK-NEXT: prmt.b32 %r11, %r3, 0, 0x7773U; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r11; +; CHECK-NEXT: mov.b16 %rs2, 3; +; CHECK-NEXT: mov.b32 %r12, {%rs2, %rs1}; +; CHECK-NEXT: prmt.b32 %r13, %r3, 0, 0x7770U; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r13; +; CHECK-NEXT: mov.b16 %rs4, 2; +; CHECK-NEXT: mov.b32 %r14, {%rs3, %rs4}; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r12}; +; CHECK-NEXT: ret; + %v0 = insertelement <8 x i6> poison, i6 1, i32 %idx + %v1 = insertelement <8 x i6> %v0, i6 2, i32 1 + %v2 = insertelement <8 x i6> %v1, i6 3, i32 2 + ret <8 x i6> %v2 +} + +; Test with multiple dynamic insertions on i3 elements +define <4 x i3> @multiple_dynamic_i3(i32 %idx0, i32 %idx1) { +; CHECK-LABEL: multiple_dynamic_i3( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [multiple_dynamic_i3_param_0]; +; CHECK-NEXT: shl.b32 %r2, %r1, 3; +; CHECK-NEXT: bfi.b32 %r3, 1, %r4, %r2, 8; +; CHECK-NEXT: ld.param.b32 %r5, [multiple_dynamic_i3_param_1]; +; 
CHECK-NEXT: shl.b32 %r6, %r5, 3; +; CHECK-NEXT: bfi.b32 %r7, 2, %r3, %r6, 8; +; CHECK-NEXT: st.param.b16 [func_retval0], %r7; +; CHECK-NEXT: shr.u32 %r8, %r7, 16; +; CHECK-NEXT: st.param.b16 [func_retval0+2], %r8; +; CHECK-NEXT: ret; + %v0 = insertelement <4 x i3> poison, i3 1, i32 %idx0 + %v1 = insertelement <4 x i3> %v0, i3 2, i32 %idx1 + ret <4 x i3> %v1 +} diff --git a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll index 291a9c1f978da..b006c78604648 100644 --- a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll +++ b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll @@ -242,17 +242,14 @@ define <2 x i64> @testDoubleword(<2 x i64> %a, i64 %b, i64 %idx) { ; AIX-P8-32-LABEL: testDoubleword: ; AIX-P8-32: # %bb.0: # %entry ; AIX-P8-32-NEXT: add r6, r6, r6 -; AIX-P8-32-NEXT: addi r5, r1, -32 +; AIX-P8-32-NEXT: addi r5, r1, -16 ; AIX-P8-32-NEXT: rlwinm r7, r6, 2, 28, 29 -; AIX-P8-32-NEXT: stxvw4x v2, 0, r5 +; AIX-P8-32-NEXT: stxvd2x v2, 0, r5 ; AIX-P8-32-NEXT: stwx r3, r5, r7 -; AIX-P8-32-NEXT: addi r3, r1, -16 -; AIX-P8-32-NEXT: lxvw4x vs0, 0, r5 -; AIX-P8-32-NEXT: addi r5, r6, 1 -; AIX-P8-32-NEXT: rlwinm r5, r5, 2, 28, 29 -; AIX-P8-32-NEXT: stxvw4x vs0, 0, r3 -; AIX-P8-32-NEXT: stwx r4, r3, r5 -; AIX-P8-32-NEXT: lxvw4x v2, 0, r3 +; AIX-P8-32-NEXT: addi r3, r6, 1 +; AIX-P8-32-NEXT: rlwinm r3, r3, 2, 28, 29 +; AIX-P8-32-NEXT: stwx r4, r5, r3 +; AIX-P8-32-NEXT: lxvd2x v2, 0, r5 ; AIX-P8-32-NEXT: blr entry: %vecins = insertelement <2 x i64> %a, i64 %b, i64 %idx @@ -426,17 +423,14 @@ define <4 x float> @testFloat2(<4 x float> %a, ptr %b, i32 zeroext %idx1, i32 ze ; AIX-P8-LABEL: testFloat2: ; AIX-P8: # %bb.0: # %entry ; AIX-P8-NEXT: lwz r6, 0(r3) -; AIX-P8-NEXT: rlwinm r4, r4, 2, 28, 29 -; AIX-P8-NEXT: addi r7, r1, -32 +; AIX-P8-NEXT: lwz r3, 1(r3) +; AIX-P8-NEXT: addi r7, r1, -16 ; AIX-P8-NEXT: stxvw4x v2, 0, r7 ; AIX-P8-NEXT: rlwinm r5, r5, 2, 28, 29 +; AIX-P8-NEXT: rlwinm r4, r4, 2, 28, 29 ; AIX-P8-NEXT: stwx r6, r7, r4 -; AIX-P8-NEXT: 
addi r4, r1, -16 -; AIX-P8-NEXT: lxvw4x vs0, 0, r7 -; AIX-P8-NEXT: lwz r3, 1(r3) -; AIX-P8-NEXT: stxvw4x vs0, 0, r4 -; AIX-P8-NEXT: stwx r3, r4, r5 -; AIX-P8-NEXT: lxvw4x v2, 0, r4 +; AIX-P8-NEXT: stwx r3, r7, r5 +; AIX-P8-NEXT: lxvw4x v2, 0, r7 ; AIX-P8-NEXT: blr entry: %add.ptr1 = getelementptr inbounds i8, ptr %b, i64 1 @@ -493,38 +487,32 @@ define <4 x float> @testFloat3(<4 x float> %a, ptr %b, i32 zeroext %idx1, i32 ze ; ; AIX-P8-64-LABEL: testFloat3: ; AIX-P8-64: # %bb.0: # %entry +; AIX-P8-64-NEXT: li r7, 1 ; AIX-P8-64-NEXT: lis r6, 1 -; AIX-P8-64-NEXT: rlwinm r4, r4, 2, 28, 29 -; AIX-P8-64-NEXT: addi r7, r1, -32 ; AIX-P8-64-NEXT: rlwinm r5, r5, 2, 28, 29 +; AIX-P8-64-NEXT: rlwinm r4, r4, 2, 28, 29 +; AIX-P8-64-NEXT: rldic r7, r7, 36, 27 ; AIX-P8-64-NEXT: lwzx r6, r3, r6 +; AIX-P8-64-NEXT: lwzx r3, r3, r7 +; AIX-P8-64-NEXT: addi r7, r1, -16 ; AIX-P8-64-NEXT: stxvw4x v2, 0, r7 ; AIX-P8-64-NEXT: stwx r6, r7, r4 -; AIX-P8-64-NEXT: li r4, 1 -; AIX-P8-64-NEXT: lxvw4x vs0, 0, r7 -; AIX-P8-64-NEXT: rldic r4, r4, 36, 27 -; AIX-P8-64-NEXT: lwzx r3, r3, r4 -; AIX-P8-64-NEXT: addi r4, r1, -16 -; AIX-P8-64-NEXT: stxvw4x vs0, 0, r4 -; AIX-P8-64-NEXT: stwx r3, r4, r5 -; AIX-P8-64-NEXT: lxvw4x v2, 0, r4 +; AIX-P8-64-NEXT: stwx r3, r7, r5 +; AIX-P8-64-NEXT: lxvw4x v2, 0, r7 ; AIX-P8-64-NEXT: blr ; ; AIX-P8-32-LABEL: testFloat3: ; AIX-P8-32: # %bb.0: # %entry ; AIX-P8-32-NEXT: lis r6, 1 -; AIX-P8-32-NEXT: rlwinm r4, r4, 2, 28, 29 -; AIX-P8-32-NEXT: addi r7, r1, -32 ; AIX-P8-32-NEXT: rlwinm r5, r5, 2, 28, 29 +; AIX-P8-32-NEXT: rlwinm r4, r4, 2, 28, 29 +; AIX-P8-32-NEXT: addi r7, r1, -16 ; AIX-P8-32-NEXT: lwzx r6, r3, r6 +; AIX-P8-32-NEXT: lwz r3, 0(r3) ; AIX-P8-32-NEXT: stxvw4x v2, 0, r7 ; AIX-P8-32-NEXT: stwx r6, r7, r4 -; AIX-P8-32-NEXT: addi r4, r1, -16 -; AIX-P8-32-NEXT: lxvw4x vs0, 0, r7 -; AIX-P8-32-NEXT: lwz r3, 0(r3) -; AIX-P8-32-NEXT: stxvw4x vs0, 0, r4 -; AIX-P8-32-NEXT: stwx r3, r4, r5 -; AIX-P8-32-NEXT: lxvw4x v2, 0, r4 +; AIX-P8-32-NEXT: stwx r3, r7, r5 +; 
AIX-P8-32-NEXT: lxvw4x v2, 0, r7 ; AIX-P8-32-NEXT: blr entry: %add.ptr = getelementptr inbounds i8, ptr %b, i64 65536 From 635074e91688157b3e6d3619d20dc97432694a32 Mon Sep 17 00:00:00 2001 From: Sang Ik Lee Date: Wed, 29 Oct 2025 09:55:50 -0700 Subject: [PATCH 120/539] [MLIR][Conversion] XeGPU to XeVM: Lower ranked dynamic base memory for create_nd_tdesc. (#164283) Current lowering pattern for create_nd_tdesc restricts source memref to static shape. In case of a dynamic ranked memref, create_nd_tdesc already provides shape as an argument. Lowering can use those values instead of returning a mismatch error. --- .../Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp | 4 +-- .../XeGPUToXeVM/create_nd_tdesc.mlir | 25 ++++++++++++++++++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp index fcbf66dbe9e45..33e8f2ed1f6ed 100644 --- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp +++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp @@ -194,8 +194,8 @@ class CreateNdDescToXeVMPattern // If source is a memref, we need to extract the aligned pointer as index. // Pointer type is passed as i32 or i64 by type converter. 
if (sourceMemrefTy) { - if (!sourceMemrefTy.hasStaticShape()) { - return rewriter.notifyMatchFailure(op, "Expected static memref shape."); + if (!sourceMemrefTy.hasRank()) { + return rewriter.notifyMatchFailure(op, "Expected ranked Memref."); } baseAddr = memref::ExtractAlignedPointerAsIndexOp::create(rewriter, loc, source); diff --git a/mlir/test/Conversion/XeGPUToXeVM/create_nd_tdesc.mlir b/mlir/test/Conversion/XeGPUToXeVM/create_nd_tdesc.mlir index d6e36fa73bf04..09ef76c9d1740 100644 --- a/mlir/test/Conversion/XeGPUToXeVM/create_nd_tdesc.mlir +++ b/mlir/test/Conversion/XeGPUToXeVM/create_nd_tdesc.mlir @@ -4,8 +4,9 @@ gpu.module @create_nd_tdesc { // CHECK-LABEL: gpu.func @create_nd_tdesc // CHECK-SAME: %[[ARG0:.*]]: memref<16x32xf32, 1>, %[[ARG1:.*]]: ui64, // CHECK-SAME: %[[ARG2:.*]]: index, %[[ARG3:.*]]: index, %[[ARG4:.*]]: index, %[[ARG5:.*]]: index, %[[ARG6:.*]]: index, %[[ARG7:.*]]: index + // CHECK-SAME: %[[DYN:.*]]: memref) kernel { gpu.func @create_nd_tdesc(%src: memref<16x32xf32, 1>, %ptr: ui64, %shape1: index, %shape2: index, - %stride1: index, %stride2: index, %offset1: index, %offset2: index) kernel { + %stride1: index, %stride2: index, %offset1: index, %offset2: index, %dyn: memref) kernel { // CHECK: %[[VAR0:.*]] = index.castu %[[ARG1]] : ui64 to index // CHECK: %[[BASE_ADDR:.*]] = arith.index_castui %[[VAR0]] : index to i64 // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xi32> @@ -43,6 +44,28 @@ gpu.module @create_nd_tdesc { // CHECK: %[[VAR19:.*]] = vector.insert %[[OFFSET_W2]], %[[VAR18]] [4] : i32 into vector<8xi32> // CHECK: %[[PAYLOAD:.*]] = vector.insert %[[OFFSET_H2]], %[[VAR19]] [5] : i32 into vector<8xi32> %src_tdesc = xegpu.create_nd_tdesc %srcce : memref<16x32xf32> -> !xegpu.tensor_desc<8x16xf32> + + // CHECK: %[[C1:.*]] = arith.constant 1 : index + %c1 = arith.constant 1 : index + // CHECK: %[[C64:.*]] = arith.constant 64 : index + %size_x = arith.constant 64 : index + // CHECK: %[[C16:.*]] = arith.constant 16 : index + 
%BLOCK_DMODEL = arith.constant 16 : index + // CHECK: %[[CST_4:.*]] = arith.constant dense<0> : vector<8xi32> + // CHECK: %[[INTPTR_5:.*]] = memref.extract_aligned_pointer_as_index %[[DYN]] : memref -> index + // CHECK: %[[C0_I32_6:.*]] = arith.constant 0 : i32 + // CHECK: %[[C0_I32_7:.*]] = arith.constant 0 : i32 + // CHECK: %[[VAR21:.*]] = arith.index_cast %[[C16]] : index to i32 + // CHECK: %[[VAR22:.*]] = arith.index_cast %[[C64]] : index to i32 + // CHECK: %[[VAR23:.*]] = arith.index_castui %[[INTPTR_5]] : index to i64 + // CHECK: %[[VAR24:.*]] = vector.bitcast %[[CST_4]] : vector<8xi32> to vector<4xi64> + // CHECK: %[[VAR25:.*]] = vector.insert %[[VAR23]], %[[VAR24]] [0] : i64 into vector<4xi64> + // CHECK: %[[VAR26:.*]] = vector.bitcast %[[VAR25]] : vector<4xi64> to vector<8xi32> + // CHECK: %[[VAR27:.*]] = vector.insert %[[VAR21]], %[[VAR26]] [2] : i32 into vector<8xi32> + // CHECK: %[[VAR28:.*]] = vector.insert %[[VAR22]], %[[VAR27]] [3] : i32 into vector<8xi32> + // CHECK: %[[VAR29:.*]] = vector.insert %[[C0_I32_6]], %[[VAR28]] [4] : i32 into vector<8xi32> + // CHECK: %[[VAR30:.*]] = vector.insert %[[C0_I32_7]], %[[VAR29]] [5] : i32 into vector<8xi32> + %dyn_tdesc = xegpu.create_nd_tdesc %dyn, shape: [%size_x, %BLOCK_DMODEL], strides: [%BLOCK_DMODEL, %c1] : memref -> !xegpu.tensor_desc<16x16xf16> gpu.return } } From 5907c2b26dee40da2a2aa8f5959e0d23cc367dea Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 29 Oct 2025 17:01:29 +0000 Subject: [PATCH 121/539] [X86] vector-reduce-or-cmp.ll - add v4i64 signbit test coverage (#165588) Missing fold to make use of VTESTPD --- llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 69 +++++++++++++++---- 1 file changed, 57 insertions(+), 12 deletions(-) diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index ab1feba98b008..9816fa7c83560 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -992,6 
+992,51 @@ define i1 @signtest_v8i32(<8 x i32> %a0) { ret i1 %2 } +define i1 @signtest_v4i64(<4 x i64> %a0) { +; SSE2-LABEL: signtest_v4i64: +; SSE2: # %bb.0: +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: setns %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: signtest_v4i64: +; SSE41: # %bb.0: +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: sete %al +; SSE41-NEXT: retq +; +; AVX1-LABEL: signtest_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: sete %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: signtest_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: sete %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: signtest_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0) + %2 = icmp sgt i64 %1, -1 + ret i1 %2 +} + define i1 @trunc_v16i16(<16 x i16> %a0) { ; SSE2-LABEL: trunc_v16i16: ; SSE2: # %bb.0: @@ -1162,11 +1207,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) { ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: orl %ecx, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: je .LBB29_2 +; SSE2-NEXT: je .LBB30_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: retq -; SSE2-NEXT: .LBB29_2: +; SSE2-NEXT: .LBB30_2: ; SSE2-NEXT: movl $1, %eax ; SSE2-NEXT: retq ; @@ -1181,11 +1226,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) { ; SSE41-NEXT: pextrd $2, %xmm1, %eax ; 
SSE41-NEXT: orl %ecx, %eax ; SSE41-NEXT: testb $1, %al -; SSE41-NEXT: je .LBB29_2 +; SSE41-NEXT: je .LBB30_2 ; SSE41-NEXT: # %bb.1: ; SSE41-NEXT: xorl %eax, %eax ; SSE41-NEXT: retq -; SSE41-NEXT: .LBB29_2: +; SSE41-NEXT: .LBB30_2: ; SSE41-NEXT: movl $1, %eax ; SSE41-NEXT: retq ; @@ -1200,11 +1245,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) { ; AVX1OR2-NEXT: vpextrd $2, %xmm0, %eax ; AVX1OR2-NEXT: orl %ecx, %eax ; AVX1OR2-NEXT: testb $1, %al -; AVX1OR2-NEXT: je .LBB29_2 +; AVX1OR2-NEXT: je .LBB30_2 ; AVX1OR2-NEXT: # %bb.1: ; AVX1OR2-NEXT: xorl %eax, %eax ; AVX1OR2-NEXT: retq -; AVX1OR2-NEXT: .LBB29_2: +; AVX1OR2-NEXT: .LBB30_2: ; AVX1OR2-NEXT: movl $1, %eax ; AVX1OR2-NEXT: retq ; @@ -1219,12 +1264,12 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) { ; AVX512F-NEXT: korw %k0, %k1, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al -; AVX512F-NEXT: je .LBB29_2 +; AVX512F-NEXT: je .LBB30_2 ; AVX512F-NEXT: # %bb.1: ; AVX512F-NEXT: xorl %eax, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq -; AVX512F-NEXT: .LBB29_2: +; AVX512F-NEXT: .LBB30_2: ; AVX512F-NEXT: movl $1, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1240,12 +1285,12 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) { ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb $1, %al -; AVX512BW-NEXT: je .LBB29_2 +; AVX512BW-NEXT: je .LBB30_2 ; AVX512BW-NEXT: # %bb.1: ; AVX512BW-NEXT: xorl %eax, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: .LBB29_2: +; AVX512BW-NEXT: .LBB30_2: ; AVX512BW-NEXT: movl $1, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1259,11 +1304,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) { ; AVX512BWVL-NEXT: korw %k0, %k1, %k0 ; AVX512BWVL-NEXT: kmovd %k0, %eax ; AVX512BWVL-NEXT: testb $1, %al -; AVX512BWVL-NEXT: je .LBB29_2 +; AVX512BWVL-NEXT: je .LBB30_2 ; AVX512BWVL-NEXT: # %bb.1: ; AVX512BWVL-NEXT: xorl %eax, %eax ; AVX512BWVL-NEXT: retq -; 
AVX512BWVL-NEXT: .LBB29_2: +; AVX512BWVL-NEXT: .LBB30_2: ; AVX512BWVL-NEXT: movl $1, %eax ; AVX512BWVL-NEXT: retq %1 = icmp ne <3 x i32> %a, %b From 29059cfe4569661478590523a48acbf40924eea3 Mon Sep 17 00:00:00 2001 From: nerix Date: Wed, 29 Oct 2025 18:38:34 +0100 Subject: [PATCH 122/539] [LLDB][PDB] Explicitly set DIA plugin in unit test (#165592) Fixes the failing DIA unit test (https://lab.llvm.org/buildbot/#/builders/197/builds/10342) after #165363. Now that the native plugin is the default, we need to set the symbol file plugin for DIA via the settings. --- lldb/unittests/SymbolFile/PDB/CMakeLists.txt | 1 + lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/lldb/unittests/SymbolFile/PDB/CMakeLists.txt b/lldb/unittests/SymbolFile/PDB/CMakeLists.txt index 8edb352e5a3e1..0bd90fe90d88b 100644 --- a/lldb/unittests/SymbolFile/PDB/CMakeLists.txt +++ b/lldb/unittests/SymbolFile/PDB/CMakeLists.txt @@ -9,6 +9,7 @@ add_lldb_unittest(SymbolFilePDBTests lldbHost lldbSymbol lldbPluginObjectFilePECOFF + lldbPluginPlatformWindows lldbPluginSymbolFileDWARF lldbPluginSymbolFilePDB lldbPluginTypeSystemClang diff --git a/lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp b/lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp index 858aecd1b9798..90cd4d568f524 100644 --- a/lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp +++ b/lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp @@ -16,11 +16,13 @@ #include "llvm/Testing/Support/Error.h" #include "Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h" +#include "Plugins/Platform/Windows/PlatformWindows.h" #include "Plugins/SymbolFile/DWARF/SymbolFileDWARF.h" #include "Plugins/SymbolFile/PDB/SymbolFilePDB.h" #include "Plugins/TypeSystem/Clang/TypeSystemClang.h" #include "TestingSupport/TestUtilities.h" #include "lldb/Core/Address.h" +#include "lldb/Core/Debugger.h" #include "lldb/Core/Module.h" #include "lldb/Core/ModuleSpec.h" #include "lldb/Host/FileSystem.h" @@ -59,6 
+61,13 @@ class SymbolFilePDBTests : public testing::Test { m_pdb_test_exe = GetInputFilePath("test-pdb.exe"); m_types_test_exe = GetInputFilePath("test-pdb-types.exe"); + + ArchSpec arch("x86_64-pc-windows-msvc"); + Platform::SetHostPlatform(PlatformWindows::CreateInstance(true, &arch)); + m_debugger_sp = Debugger::CreateInstance(); + m_debugger_sp->SetPropertyValue(nullptr, + lldb_private::eVarSetOperationAssign, + "plugin.symbol-file.pdb.reader", "dia"); } void TearDown() override { @@ -77,6 +86,7 @@ class SymbolFilePDBTests : public testing::Test { protected: std::string m_pdb_test_exe; std::string m_types_test_exe; + lldb::DebuggerSP m_debugger_sp; bool FileSpecMatchesAsBaseOrFull(const FileSpec &left, const FileSpec &right) const { From 448c0f0329f6dc5c77196a13c226e4d37f8027b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 29 Oct 2025 07:55:47 -1000 Subject: [PATCH 123/539] [flang][cuda][NFC] Enhance test for tma_bulk_g2s lowering (#165603) --- flang/test/Lower/CUDA/cuda-device-proc.cuf | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 8f355217899b3..d8c78887ff924 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -468,7 +468,18 @@ attributes(global) subroutine test_bulk_g2s(a) end subroutine ! CHECK-LABEL: func.func @_QPtest_bulk_g2s -! CHECK: nvvm.cp.async.bulk.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : <7>, <1> +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %4 {data_attr = #cuf.cuda, uniq_name = "_QFtest_bulk_g2sEbarrier1"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! 
CHECK: %[[DST:.*]]:2 = hlfir.declare %16(%17) {data_attr = #cuf.cuda, uniq_name = "_QFtest_bulk_g2sEtmpa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) +! CHECK: %[[COUNT:.*]]:2 = hlfir.declare %19 {data_attr = #cuf.cuda, uniq_name = "_QFtest_bulk_g2sEtx_count"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[SRC:.*]] = hlfir.designate %{{.*}} (%{{.*}}) : (!fir.box>, i64) -> !fir.ref +! CHECK: %[[COUNT_LOAD:.*]] = fir.load %20#0 : !fir.ref +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr +! CHECK: %[[BARRIER_3:.*]] = llvm.addrspacecast %[[BARRIER_PTR]] : !llvm.ptr to !llvm.ptr<3> +! CHECK: %[[DST_PTR:.*]] = fir.convert %[[DST]]#0 : (!fir.ref>) -> !llvm.ptr +! CHECK: %[[DST_7:.*]] = llvm.addrspacecast %[[DST_PTR]] : !llvm.ptr to !llvm.ptr<7> +! CHECK: %[[SRC_PTR:.*]] = fir.convert %[[SRC]] : (!fir.ref) -> !llvm.ptr +! CHECK: %[[SRC_3:.*]] = llvm.addrspacecast %[[SRC_PTR]] : !llvm.ptr to !llvm.ptr<1> +! CHECK: nvvm.cp.async.bulk.shared.cluster.global %[[DST_7]], %[[SRC_3]], %[[BARRIER_3]], %[[COUNT_LOAD]] : <7>, <1> attributes(global) subroutine test_bulk_s2g(a) real(8), device :: a(*) From c7d161c2ab59611a0f55ad89c4b600b1291a6883 Mon Sep 17 00:00:00 2001 From: lonely eagle <2020382038@qq.com> Date: Thu, 30 Oct 2025 02:03:10 +0800 Subject: [PATCH 124/539] [mlir][bufferize] Use the flag of skipRegions to print op (NFC) (#165516) --- .../Bufferization/Transforms/OneShotAnalysis.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp index fb7f2bb5f01d8..9ccbfd363b1df 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp @@ -620,7 +620,8 @@ hasReadAfterWriteInterference(const DenseSet &usesRead, LDBG() << "\n- bufferizes out-of-place due to parallel region:\n" << " unConflictingWrite = 
operand " << uConflictingWrite->getOperandNumber() << " of " - << *uConflictingWrite->getOwner(); + << OpWithFlags(uConflictingWrite->getOwner(), + OpPrintingFlags().skipRegions()); return true; } } @@ -631,7 +632,7 @@ hasReadAfterWriteInterference(const DenseSet &usesRead, Operation *readingOp = uRead->getOwner(); LDBG() << "\n- check conflict:\n" << " uRead = operand " << uRead->getOperandNumber() << " of " - << *readingOp; + << OpWithFlags(readingOp, OpPrintingFlags().skipRegions()); // Find the definition of uRead by following the SSA use-def chain. // E.g.: @@ -655,7 +656,8 @@ hasReadAfterWriteInterference(const DenseSet &usesRead, for (OpOperand *uConflictingWrite : usesWrite) { LDBG() << " unConflictingWrite = operand " << uConflictingWrite->getOperandNumber() << " of " - << *uConflictingWrite->getOwner(); + << OpWithFlags(uConflictingWrite->getOwner(), + OpPrintingFlags().skipRegions()); // Check if op dominance can be used to rule out read-after-write // conflicts. @@ -975,7 +977,7 @@ bufferizableInPlaceAnalysisImpl(OpOperand &operand, OneShotAnalysisState &state, const DominanceInfo &domInfo) { LDBG() << "//===-------------------------------------------===//\n" << "Analyzing operand #" << operand.getOperandNumber() << " of " - << *operand.getOwner(); + << OpWithFlags(operand.getOwner(), OpPrintingFlags().skipRegions()); bool foundInterference = wouldCreateWriteToNonWritableBuffer(operand, state) || From ba17c13928875bbc7cacfbf755acf1ac9ad7aa3c Mon Sep 17 00:00:00 2001 From: lonely eagle <2020382038@qq.com> Date: Thu, 30 Oct 2025 02:03:39 +0800 Subject: [PATCH 125/539] [mlir][affine] Add fold logic when the affine.yield has IV as operand in the AffineForEmptyLoopFolder (#164064) Co-authored-by: Jakub Kuderski --- mlir/lib/Dialect/Affine/IR/AffineOps.cpp | 29 ++++++++++++++++++---- mlir/test/Dialect/Affine/canonicalize.mlir | 13 ++++++++++ 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp 
b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index 0c3592124cdec..002f1f60bb58e 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -2610,6 +2610,21 @@ static std::optional getTrivialConstantTripCount(AffineForOp forOp) { return ub - lb <= 0 ? 0 : (ub - lb + step - 1) / step; } +/// Calculate the constant value of the loop's induction variable for its last +/// trip. +static std::optional +getConstantInductionVarForLastTrip(AffineForOp forOp) { + std::optional tripCount = getTrivialConstantTripCount(forOp); + if (!tripCount.has_value()) + return std::nullopt; + if (tripCount.value() == 0) + return std::nullopt; + int64_t lb = forOp.getConstantLowerBound(); + int64_t step = forOp.getStepAsInt(); + int64_t lastTripIv = lb + (tripCount.value() - 1) * step; + return lastTripIv; +} + /// Fold the empty loop. static SmallVector AffineForEmptyLoopFolder(AffineForOp forOp) { if (!llvm::hasSingleElement(*forOp.getBody())) @@ -2622,7 +2637,7 @@ static SmallVector AffineForEmptyLoopFolder(AffineForOp forOp) { // results. return forOp.getInits(); } - SmallVector replacements; + SmallVector replacements; auto yieldOp = cast(forOp.getBody()->getTerminator()); auto iterArgs = forOp.getRegionIterArgs(); bool hasValDefinedOutsideLoop = false; @@ -2630,10 +2645,14 @@ static SmallVector AffineForEmptyLoopFolder(AffineForOp forOp) { for (unsigned i = 0, e = yieldOp->getNumOperands(); i < e; ++i) { Value val = yieldOp.getOperand(i); BlockArgument *iterArgIt = llvm::find(iterArgs, val); - // TODO: It should be possible to perform a replacement by computing the - // last value of the IV based on the bounds and the step. 
- if (val == forOp.getInductionVar()) + if (val == forOp.getInductionVar()) { + if (auto lastTripIv = getConstantInductionVarForLastTrip(forOp)) { + replacements.push_back(IntegerAttr::get( + IndexType::get(forOp.getContext()), lastTripIv.value())); + continue; + } return {}; + } if (iterArgIt == iterArgs.end()) { // `val` is defined outside of the loop. assert(forOp.isDefinedOutsideOfLoop(val) && @@ -2656,7 +2675,7 @@ static SmallVector AffineForEmptyLoopFolder(AffineForOp forOp) { // out of order. if (tripCount.has_value() && tripCount.value() >= 2 && iterArgsNotInOrder) return {}; - return llvm::to_vector_of(replacements); + return replacements; } /// Canonicalize the bounds of the given loop. diff --git a/mlir/test/Dialect/Affine/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir index 1169cd1c29d74..997f23b4bd669 100644 --- a/mlir/test/Dialect/Affine/canonicalize.mlir +++ b/mlir/test/Dialect/Affine/canonicalize.mlir @@ -609,6 +609,19 @@ func.func @fold_zero_iter_loops(%in : index) -> index { // ----- +// CHECK-LABEL: func @fold_empty_loop_iv +// CHECK-SAME: %[[INIT:.*]]: index +func.func @fold_empty_loop_iv(%init: index) -> (index, index) { + %res:2 = affine.for %i = 0 to 10 step 1 iter_args(%arg0 = %init, %arg1 = %init) -> (index, index) { + affine.yield %i, %arg1 : index, index + } + // CHECK: %[[C9:.*]] = arith.constant 9 : index + // CHECK: return %[[C9]], %[[INIT]] : index, index + return %res#0, %res#1 : index, index +} + +// ----- + // CHECK-DAG: #[[$SET:.*]] = affine_set<(d0, d1)[s0] : (d0 >= 0, -d0 + 1022 >= 0, d1 >= 0, -d1 + s0 - 2 >= 0)> // CHECK-LABEL: func @canonicalize_affine_if From ef9a687d463d2b1d2d25951131fcd6e999c09ca1 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 29 Oct 2025 18:16:36 +0000 Subject: [PATCH 126/539] [X86] combinePTESTCC - canonicalize constants to the RHS if the PTEST/TESTP node just uses the ZF flag (#165601) If we're just comparing against zero then move the constant to the RHS to reduce duplicated 
folds. Noticed while triaging #156233 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 89b42da9a40f0..624cff24ddf03 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48787,6 +48787,11 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, } if (CC == X86::COND_E || CC == X86::COND_NE) { + // Canonicalize constant to RHS if we're just using ZF. + if (Op0 != Op1 && DAG.isConstantIntBuildVectorOrConstantInt(Op0) && + !DAG.isConstantIntBuildVectorOrConstantInt(Op1)) + return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op0); + // TESTZ(X,~Y) == TESTC(Y,X) if (SDValue NotOp1 = IsNOT(Op1, DAG)) { CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE); @@ -48850,10 +48855,6 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, } } - // TESTZ(-1,X) == TESTZ(X,X) - if (ISD::isBuildVectorAllOnes(Op0.getNode())) - return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1); - // TESTZ(X,-1) == TESTZ(X,X) if (ISD::isBuildVectorAllOnes(Op1.getNode())) return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0); From 96637726da70d45e7d725ac4bc001b7f95f06c5a Mon Sep 17 00:00:00 2001 From: Ebuka Ezike Date: Wed, 29 Oct 2025 18:22:54 +0000 Subject: [PATCH 127/539] [lldb] Do not narrow `GetIndexOfChildWithName` return type to int (#165453) Modify the python wrapper to return uint32_t, which prevents incorrect child name-to-index mapping and avoids performing redundant operations on non-existent SBValues. 
--- lldb/bindings/python/python-wrapper.swig | 2 +- lldb/include/lldb/Interpreter/ScriptInterpreter.h | 2 +- .../Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h | 5 +++-- .../ScriptInterpreter/Python/ScriptInterpreterPython.cpp | 6 +++--- .../ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h | 2 +- lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp | 3 ++- 6 files changed, 11 insertions(+), 9 deletions(-) diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig index 64b7dc8381073..e7acba5b95d89 100644 --- a/lldb/bindings/python/python-wrapper.swig +++ b/lldb/bindings/python/python-wrapper.swig @@ -312,7 +312,7 @@ PyObject *lldb_private::python::SWIGBridge::LLDBSwigPython_GetChildAtIndex(PyObj return result.release(); } -int lldb_private::python::SWIGBridge::LLDBSwigPython_GetIndexOfChildWithName( +uint32_t lldb_private::python::SWIGBridge::LLDBSwigPython_GetIndexOfChildWithName( PyObject * implementor, const char *child_name) { PyErr_Cleaner py_err_cleaner(true); diff --git a/lldb/include/lldb/Interpreter/ScriptInterpreter.h b/lldb/include/lldb/Interpreter/ScriptInterpreter.h index 6c0054a1ec1d1..edb80dc66aca7 100644 --- a/lldb/include/lldb/Interpreter/ScriptInterpreter.h +++ b/lldb/include/lldb/Interpreter/ScriptInterpreter.h @@ -352,7 +352,7 @@ class ScriptInterpreter : public PluginInterface { return lldb::ValueObjectSP(); } - virtual llvm::Expected + virtual llvm::Expected GetIndexOfChildWithName(const StructuredData::ObjectSP &implementor, const char *child_name) { return llvm::createStringError("Type has no child named '%s'", child_name); diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h index 7b39d29ba2b20..27f5d2ee471c0 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h @@ -158,8 +158,9 @@ class SWIGBridge { 
static PyObject *LLDBSwigPython_GetChildAtIndex(PyObject *implementor, uint32_t idx); - static int LLDBSwigPython_GetIndexOfChildWithName(PyObject *implementor, - const char *child_name); + static uint32_t + LLDBSwigPython_GetIndexOfChildWithName(PyObject *implementor, + const char *child_name); static lldb::ValueObjectSP LLDBSWIGPython_GetValueObjectSPFromSBValue(void *data); diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index 73c5c72932ff1..d257a08a2c62c 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -1939,7 +1939,7 @@ lldb::ValueObjectSP ScriptInterpreterPythonImpl::GetChildAtIndex( return ret_val; } -llvm::Expected ScriptInterpreterPythonImpl::GetIndexOfChildWithName( +llvm::Expected ScriptInterpreterPythonImpl::GetIndexOfChildWithName( const StructuredData::ObjectSP &implementor_sp, const char *child_name) { if (!implementor_sp) return llvm::createStringError("Type has no child named '%s'", child_name); @@ -1951,7 +1951,7 @@ llvm::Expected ScriptInterpreterPythonImpl::GetIndexOfChildWithName( if (!implementor) return llvm::createStringError("Type has no child named '%s'", child_name); - int ret_val = INT32_MAX; + uint32_t ret_val = UINT32_MAX; { Locker py_lock(this, @@ -1960,7 +1960,7 @@ llvm::Expected ScriptInterpreterPythonImpl::GetIndexOfChildWithName( child_name); } - if (ret_val == INT32_MAX) + if (ret_val == UINT32_MAX) return llvm::createStringError("Type has no child named '%s'", child_name); return ret_val; } diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h index dedac280788f4..00ae59c1c4241 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h +++ 
b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h @@ -122,7 +122,7 @@ class ScriptInterpreterPythonImpl : public ScriptInterpreterPython { GetChildAtIndex(const StructuredData::ObjectSP &implementor, uint32_t idx) override; - llvm::Expected + llvm::Expected GetIndexOfChildWithName(const StructuredData::ObjectSP &implementor, const char *child_name) override; diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp index 6f5d9fd97ee28..3d0e2d8a62482 100644 --- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp +++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp @@ -90,7 +90,8 @@ PyObject *lldb_private::python::SWIGBridge::LLDBSwigPython_GetChildAtIndex( return nullptr; } -int lldb_private::python::SWIGBridge::LLDBSwigPython_GetIndexOfChildWithName( +uint32_t +lldb_private::python::SWIGBridge::LLDBSwigPython_GetIndexOfChildWithName( PyObject *implementor, const char *child_name) { return 0; } From da72c0f6a4b06dd7f9fc05887ddd2ff253e7d075 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 29 Oct 2025 11:25:46 -0700 Subject: [PATCH 128/539] [VPlan] Don't preserve LCSSA in expandSCEVs. (#165505) This follows similar reasoning as 45ce88758d24 (https://github.com/llvm/llvm-project/pull/159556): LV does not preserve LCSSA, it constructs it just before processing a loop to vectorize. Runtime check expressions are invariant to that loop, so expanding them should not break LCSSA form for the loop we are about to vectorize. LV creates SCEV and memory runtime checks early on and then disconnects the blocks temporarily. The patch fixes a mis-compile, where previously LCSSA construction during SCEV expand may replace uses in currently unreachable SCEV/memory check blocks. 
Fixes https://github.com/llvm/llvm-project/issues/162512 PR: https://github.com/llvm/llvm-project/pull/165505 --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +- .../LoopVectorize/X86/scev-checks-unprofitable.ll | 3 +-- .../Transforms/LoopVectorize/create-induction-resume.ll | 5 ++--- .../invalidate-scev-at-scope-after-vectorization.ll | 5 ++--- .../LoopVectorize/nested-loops-scev-expansion.ll | 3 +-- llvm/test/Transforms/LoopVectorize/pr45259.ll | 4 +--- .../Transforms/LoopVectorize/pr58811-scev-expansion.ll | 8 +++----- llvm/test/Transforms/LoopVectorize/pr66616.ll | 3 +-- .../LoopVectorize/reuse-lcssa-phi-scev-expansion.ll | 6 +++--- .../LoopVectorize/scev-exit-phi-invalidation.ll | 7 +++---- 10 files changed, 18 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index d9ac26bba7507..4d98014622224 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -4062,7 +4062,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, DenseMap VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) { const DataLayout &DL = SE.getDataLayout(); - SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/true); + SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false); auto *Entry = cast(Plan.getEntry()); BasicBlock *EntryBB = Entry->getIRBasicBlock(); diff --git a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll index 905e67b8723f9..7e6b5e932b6c6 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll @@ -17,13 +17,12 @@ define void @value_defined_in_loop1_used_for_trip_counts(i32 %start, i1 %c, ptr ; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[ZEXT]], %[[LOOP_1]] ] 
; CHECK-NEXT: br i1 false, label %[[LOOP_1_EXIT:.*]], label %[[LOOP_1]] ; CHECK: [[LOOP_1_EXIT]]: -; CHECK-NEXT: [[IV_1_LCSSA2:%.*]] = phi i64 [ [[IV_1]], %[[LOOP_1]] ] ; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi i64 [ [[IV_1]], %[[LOOP_1]] ] ; CHECK-NEXT: br i1 [[C]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_3_PREHEADER:.*]] ; CHECK: [[LOOP_3_PREHEADER]]: ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[IV_1_LCSSA2]], 1 +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[IV_1]], 1 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll b/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll index 62399c5d4b4ee..f9b512700f608 100644 --- a/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll +++ b/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll @@ -29,7 +29,6 @@ define void @test(i32 %arg, i32 %L1.limit, i32 %L2.switch, i1 %c, ptr %dst) { ; CHECK: L1.early.exit: ; CHECK-NEXT: ret void ; CHECK: L1.exit: -; CHECK-NEXT: [[INDUCTION_IV_LCSSA1:%.*]] = phi i32 [ [[INDUCTION_IV]], [[L1_BACKEDGE]] ] ; CHECK-NEXT: [[L1_EXIT_VAL:%.*]] = phi i32 [ [[L1_SUM_NEXT]], [[L1_BACKEDGE]] ] ; CHECK-NEXT: br label [[L2_HEADER:%.*]] ; CHECK: L2.header.loopexit: @@ -46,11 +45,11 @@ define void @test(i32 %arg, i32 %L1.limit, i32 %L2.switch, i1 %c, ptr %dst) { ; CHECK: vector.ph: ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[L1_EXIT_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 
[[INDUCTION_IV_LCSSA1]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[INDUCTION_IV]], i64 0 ; CHECK-NEXT: [[DOTSPLAT1:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> , [[DOTSPLAT1]] ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> splat (i32 1), [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[INDUCTION_IV_LCSSA1]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[INDUCTION_IV]], 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll index 1f32f89001ee0..32de44ce8aac1 100644 --- a/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll @@ -31,12 +31,11 @@ define void @test_invalidate_scevs_at_scope(ptr %p) { ; CHECK-NEXT: [[C_1:%.*]] = icmp eq i32 [[IV_1]], 100 ; CHECK-NEXT: br i1 [[C_1]], label %[[EXIT_1:.*]], label %[[LOOP_1]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT_1]]: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP4]], %[[LOOP_1]] ] ; CHECK-NEXT: [[ADD_LCSSA1:%.*]] = phi i32 [ [[ADD_1]], %[[LOOP_1]] ] -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = add i32 [[DOTLCSSA]], 100 +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = add i32 [[TMP4]], 100 ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[ADD_LCSSA]], i32 100) ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SMAX]], -100 -; CHECK-NEXT: [[TMP5:%.*]] = sub i32 [[TMP3]], [[DOTLCSSA]] +; CHECK-NEXT: [[TMP5:%.*]] = sub i32 [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 ; 
CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP7]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/nested-loops-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/nested-loops-scev-expansion.ll index 8525b3aa5d349..3bf5c0d1d13a9 100644 --- a/llvm/test/Transforms/LoopVectorize/nested-loops-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/nested-loops-scev-expansion.ll @@ -222,10 +222,9 @@ define void @pr52024(ptr %dst, i16 %N) { ; CHECK-NEXT: [[EXITCOND_2:%.*]] = icmp eq i16 [[IV_1_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND_2]], label %[[LOOP_2_PH:.*]], label %[[LOOP_1]] ; CHECK: [[LOOP_2_PH]]: -; CHECK-NEXT: [[IV_1_LCSSA2:%.*]] = phi i16 [ [[IV_1]], %[[LOOP_1_LATCH]] ] ; CHECK-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i16 [ [[IV_1_NEXT]], %[[LOOP_1_LATCH]] ] ; CHECK-NEXT: [[IV_1_NEXT_EXT:%.*]] = sext i16 [[IV_1_NEXT_LCSSA]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = mul i16 [[IV_1_LCSSA2]], 3 +; CHECK-NEXT: [[TMP0:%.*]] = mul i16 [[IV_1]], 3 ; CHECK-NEXT: br label %[[LOOP_2_HEADER:.*]] ; CHECK: [[LOOP_2_HEADER]]: ; CHECK-NEXT: [[IV_1_REM:%.*]] = urem i64 100, [[IV_1_NEXT_EXT]] diff --git a/llvm/test/Transforms/LoopVectorize/pr45259.ll b/llvm/test/Transforms/LoopVectorize/pr45259.ll index f33437fd8ebde..7a048a9a607ba 100644 --- a/llvm/test/Transforms/LoopVectorize/pr45259.ll +++ b/llvm/test/Transforms/LoopVectorize/pr45259.ll @@ -14,12 +14,10 @@ define i8 @widget(ptr %arr, i8 %t9) { ; CHECK-NEXT: [[C:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[C]], label [[FOR_PREHEADER:%.*]], label [[BB6]] ; CHECK: for.preheader: -; CHECK-NEXT: [[T1_0_LCSSA4:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ] ; CHECK-NEXT: [[T1_0_LCSSA1:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ] ; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[ARR1]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[TMP0]] -; CHECK-NEXT: [[T1_0_LCSSA3:%.*]] = ptrtoint ptr [[T1_0_LCSSA4]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[T1_0_LCSSA3]] to i32 +; 
CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[T1_0_LCSSA2]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP3]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll index 269c3bf73c869..879c7ae5c3c43 100644 --- a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll @@ -19,11 +19,10 @@ define void @test1_pr58811() { ; CHECK-NEXT: [[INDUCTION_IV_NEXT]] = add i32 [[INDUCTION_IV]], [[TMP1]] ; CHECK-NEXT: br i1 false, label [[LOOP_1]], label [[LOOP_2_PREHEADER:%.*]] ; CHECK: loop.2.preheader: -; CHECK-NEXT: [[INDUCTION_IV_LCSSA:%.*]] = phi i32 [ [[INDUCTION_IV]], [[LOOP_1]] ] ; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi i32 [ [[IV_1]], [[LOOP_1]] ] ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[IND_END:%.*]] = mul i32 196, [[INDUCTION_IV_LCSSA]] +; CHECK-NEXT: [[IND_END:%.*]] = mul i32 196, [[INDUCTION_IV]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -111,8 +110,8 @@ define void @test2_pr58811() { ; CHECK-NEXT: [[INDUCTION_IV_NEXT]] = add i32 [[INDUCTION_IV]], [[TMP1]] ; CHECK-NEXT: br i1 false, label [[LOOP_2]], label [[LOOP_3_PREHEADER:%.*]] ; CHECK: loop.3.preheader: -; CHECK-NEXT: [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], [[LOOP_2]] ] ; CHECK-NEXT: [[INDUCTION_IV_LCSSA:%.*]] = phi i32 [ [[INDUCTION_IV]], [[LOOP_2]] ] +; CHECK-NEXT: [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], [[LOOP_2]] ] ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[IND_END:%.*]] = mul i32 196, [[INDUCTION_IV_LCSSA]] @@ -182,12 +181,11 @@ define void @test3_pr58811() { ; CHECK-NEXT: [[ADD101:%.*]] = add i32 
[[REM85]], [[P_2]] ; CHECK-NEXT: br i1 false, label [[LOOP_2]], label [[LOOP_3_PREHEADER:%.*]] ; CHECK: loop.3.preheader: -; CHECK-NEXT: [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_2]] ] ; CHECK-NEXT: [[ADD101_LCSSA:%.*]] = phi i32 [ [[ADD101]], [[LOOP_2]] ] ; CHECK-NEXT: [[TMP0:%.*]] = udiv i32 1, [[P_1]] ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i32 [[P_1]], [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[TMP2]], [[P_2_LCSSA]] +; CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[TMP2]], [[P_2]] ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[IND_END:%.*]] = mul i32 196, [[TMP3]] diff --git a/llvm/test/Transforms/LoopVectorize/pr66616.ll b/llvm/test/Transforms/LoopVectorize/pr66616.ll index 1ef614ab32472..1e093407620d5 100644 --- a/llvm/test/Transforms/LoopVectorize/pr66616.ll +++ b/llvm/test/Transforms/LoopVectorize/pr66616.ll @@ -18,10 +18,9 @@ define void @pr66616(ptr %ptr) { ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP0]], [[VECTOR_BODY]] ] ; CHECK-NEXT: br label [[LOOP_1:%.*]] ; CHECK: preheader: -; CHECK-NEXT: [[TMP4:%.*]] = sub i32 -1, [[DOTLCSSA]] +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 -1, [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP6]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll index c270a23344f54..faca86a41b023 100644 --- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll @@ -205,12 +205,11 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 
%step) mustprog ; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 ; CHECK-NEXT: br i1 [[INVAR_C]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_1]] ; CHECK: [[LOOP_2_PREHEADER]]: -; CHECK-NEXT: [[INDVAR_LCSSA1:%.*]] = phi i32 [ [[INDVAR]], %[[LOOP_1]] ] ; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi i32 [ [[IV_1]], %[[LOOP_1]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IV_1_LCSSA]], [[STEP]] ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0) ; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STEP]], -2 -; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[INDVAR_LCSSA1]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[INDVAR]], -1 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SMAX]], [[TMP4]] ; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 1) @@ -219,7 +218,8 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog ; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[STEP]], i32 1) ; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[UMAX]] ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDVAR_LCSSA1]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = sub i32 2, [[STEP]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[IV_1_LCSSA]], [[TMP16]] ; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0) ; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP3]], -1 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]] diff --git a/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll b/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll index c7b27040d6484..479d859a9287c 100644 --- a/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll +++ b/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll @@ -19,15 +19,14 @@ define void @test_pr63368(i1 %c, ptr %A) { ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP0]], [[VECTOR_BODY]] ] ; CHECK-NEXT: br label [[EXIT_1:%.*]] ; CHECK: exit.1: -; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[DOTLCSSA]], i32 -1) +; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 -1) ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SMAX1]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: -; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 poison, i32 -1) +; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 -1) ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SMAX]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 ; CHECK-NEXT: [[TMP5:%.*]] = add i8 1, [[TMP4]] @@ -61,7 +60,7 @@ define void @test_pr63368(i1 %c, ptr %A) { ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i8 [[IV_2_NEXT]] ; CHECK-NEXT: store i8 0, ptr [[GEP_A]], align 1 ; CHECK-NEXT: [[IV_2_SEXT:%.*]] = sext i8 [[IV_2]] to i32 -; CHECK-NEXT: [[EC_2:%.*]] = icmp sge i32 [[DOTLCSSA]], [[IV_2_SEXT]] +; CHECK-NEXT: [[EC_2:%.*]] = icmp sge i32 [[TMP0]], [[IV_2_SEXT]] ; CHECK-NEXT: br i1 [[EC_2]], label [[LOOP_2]], label [[EXIT_2]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: exit.2: ; CHECK-NEXT: ret void From b39fc454489b4119e2e3c3630588d5d8bb92a6d7 Mon Sep 17 00:00:00 2001 From: lonely eagle <2020382038@qq.com> Date: Thu, 30 Oct 2025 02:28:24 +0800 Subject: [PATCH 129/539] Revert "[mlir][affine] Add fold logic when the affine.yield has IV as operand in the AffineForEmptyLoopFolder" (#165607) Reverts llvm/llvm-project#164064 Broke Windows on mlir-s390x-linux buildbot build, needs investigations. 
--- mlir/lib/Dialect/Affine/IR/AffineOps.cpp | 29 ++++------------------ mlir/test/Dialect/Affine/canonicalize.mlir | 13 ---------- 2 files changed, 5 insertions(+), 37 deletions(-) diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index 002f1f60bb58e..0c3592124cdec 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -2610,21 +2610,6 @@ static std::optional getTrivialConstantTripCount(AffineForOp forOp) { return ub - lb <= 0 ? 0 : (ub - lb + step - 1) / step; } -/// Calculate the constant value of the loop's induction variable for its last -/// trip. -static std::optional -getConstantInductionVarForLastTrip(AffineForOp forOp) { - std::optional tripCount = getTrivialConstantTripCount(forOp); - if (!tripCount.has_value()) - return std::nullopt; - if (tripCount.value() == 0) - return std::nullopt; - int64_t lb = forOp.getConstantLowerBound(); - int64_t step = forOp.getStepAsInt(); - int64_t lastTripIv = lb + (tripCount.value() - 1) * step; - return lastTripIv; -} - /// Fold the empty loop. static SmallVector AffineForEmptyLoopFolder(AffineForOp forOp) { if (!llvm::hasSingleElement(*forOp.getBody())) @@ -2637,7 +2622,7 @@ static SmallVector AffineForEmptyLoopFolder(AffineForOp forOp) { // results. 
return forOp.getInits(); } - SmallVector replacements; + SmallVector replacements; auto yieldOp = cast(forOp.getBody()->getTerminator()); auto iterArgs = forOp.getRegionIterArgs(); bool hasValDefinedOutsideLoop = false; @@ -2645,14 +2630,10 @@ static SmallVector AffineForEmptyLoopFolder(AffineForOp forOp) { for (unsigned i = 0, e = yieldOp->getNumOperands(); i < e; ++i) { Value val = yieldOp.getOperand(i); BlockArgument *iterArgIt = llvm::find(iterArgs, val); - if (val == forOp.getInductionVar()) { - if (auto lastTripIv = getConstantInductionVarForLastTrip(forOp)) { - replacements.push_back(IntegerAttr::get( - IndexType::get(forOp.getContext()), lastTripIv.value())); - continue; - } + // TODO: It should be possible to perform a replacement by computing the + // last value of the IV based on the bounds and the step. + if (val == forOp.getInductionVar()) return {}; - } if (iterArgIt == iterArgs.end()) { // `val` is defined outside of the loop. assert(forOp.isDefinedOutsideOfLoop(val) && @@ -2675,7 +2656,7 @@ static SmallVector AffineForEmptyLoopFolder(AffineForOp forOp) { // out of order. if (tripCount.has_value() && tripCount.value() >= 2 && iterArgsNotInOrder) return {}; - return replacements; + return llvm::to_vector_of(replacements); } /// Canonicalize the bounds of the given loop. 
diff --git a/mlir/test/Dialect/Affine/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir index 997f23b4bd669..1169cd1c29d74 100644 --- a/mlir/test/Dialect/Affine/canonicalize.mlir +++ b/mlir/test/Dialect/Affine/canonicalize.mlir @@ -609,19 +609,6 @@ func.func @fold_zero_iter_loops(%in : index) -> index { // ----- -// CHECK-LABEL: func @fold_empty_loop_iv -// CHECK-SAME: %[[INIT:.*]]: index -func.func @fold_empty_loop_iv(%init: index) -> (index, index) { - %res:2 = affine.for %i = 0 to 10 step 1 iter_args(%arg0 = %init, %arg1 = %init) -> (index, index) { - affine.yield %i, %arg1 : index, index - } - // CHECK: %[[C9:.*]] = arith.constant 9 : index - // CHECK: return %[[C9]], %[[INIT]] : index, index - return %res#0, %res#1 : index, index -} - -// ----- - // CHECK-DAG: #[[$SET:.*]] = affine_set<(d0, d1)[s0] : (d0 >= 0, -d0 + 1022 >= 0, d1 >= 0, -d1 + s0 - 2 >= 0)> // CHECK-LABEL: func @canonicalize_affine_if From 1bd8729a76d05508cb0e05cf64c208cd036f7641 Mon Sep 17 00:00:00 2001 From: Kunqiu Chen Date: Thu, 30 Oct 2025 02:45:27 +0800 Subject: [PATCH 130/539] [SimplifyCFG] Hoist common code when succ is unreachable block (#165570) Previously, `hoistCommonCodeFromSuccessors` returned early if one of the succ of BB has >1 predecessors. However, if the succ is an unreachable BB, we can relax the condition to perform `hoistCommonCodeFromSuccessors` based on the assumption of not reaching UB. See discussion https://github.com/dtcxzyw/llvm-opt-benchmark/pull/2989 for details. 
Alive2 proof: https://alive2.llvm.org/ce/z/OJOw0s Promising optimization impact: https://github.com/dtcxzyw/llvm-opt-benchmark/pull/2995 --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 15 ++- .../SimplifyCFG/hoist-common-code.ll | 116 ++++++++++++++++++ 2 files changed, 128 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 90423d30aadb2..b03fb6213d61c 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1866,10 +1866,19 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI, // If either of the blocks has it's address taken, then we can't do this fold, // because the code we'd hoist would no longer run when we jump into the block // by it's address. - for (auto *Succ : successors(BB)) - if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor()) + for (auto *Succ : successors(BB)) { + if (Succ->hasAddressTaken()) return false; - + if (Succ->getSinglePredecessor()) + continue; + // If Succ has >1 predecessors, continue to check if the Succ contains only + // one `unreachable` inst. Since executing `unreachable` inst is an UB, we + // can relax the condition based on the assumptiom that the program would + // never enter Succ and trigger such an UB. + if (isa(*Succ->begin())) + continue; + return false; + } // The second of pair is a SkipFlags bitmask. 
using SuccIterPair = std::pair; SmallVector SuccIterPairs; diff --git a/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll b/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll index 8ce94d1cf5b4e..98c0599ab209c 100644 --- a/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll +++ b/llvm/test/Transforms/SimplifyCFG/hoist-common-code.ll @@ -486,3 +486,119 @@ else: call void @bar() ret float %op2 } + +define void @test_switch_with_unreachable_block_as_default(i1 %c, i32 %x, ptr %ptr) { +; CHECK-LABEL: @test_switch_with_unreachable_block_as_default( +; CHECK-NEXT: br i1 [[C:%.*]], label [[SW1:%.*]], label [[SW2:%.*]] +; CHECK: sw1: +; CHECK-NEXT: switch i32 [[X:%.*]], label [[UNREACHABLE:%.*]] [ +; CHECK-NEXT: i32 1, label [[COMMON_RET:%.*]] +; CHECK-NEXT: i32 2, label [[BAR:%.*]] +; CHECK-NEXT: ] +; CHECK: sw2: +; CHECK-NEXT: store i64 42, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: br label [[COMMON_RET]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: unreachable: +; CHECK-NEXT: unreachable +; CHECK: bar: +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: br label [[COMMON_RET]] +; + br i1 %c, label %sw1, label %sw2 + +sw1: + ; This switch only exists to have an %unreachable block with multiple predecessors. 
+ switch i32 %x, label %unreachable [ + i32 1, label %foo + i32 2, label %bar + ] + +sw2: + switch i32 %x, label %unreachable [ + i32 1, label %bb1 + i32 2, label %bb2 + i32 3, label %bb3 + ] + +bb1: + store i64 42, ptr %ptr + ret void + +bb2: + store i64 42, ptr %ptr + ret void + +bb3: + store i64 42, ptr %ptr + ret void + +unreachable: + unreachable + +foo: + ret void + +bar: + call void @bar() + ret void +} + +define void @test_switch_with_unreachable_block_as_case(i1 %c, i32 %x, ptr %ptr) { +; CHECK-LABEL: @test_switch_with_unreachable_block_as_case( +; CHECK-NEXT: br i1 [[C:%.*]], label [[SW1:%.*]], label [[SW2:%.*]] +; CHECK: sw1: +; CHECK-NEXT: switch i32 [[X:%.*]], label [[UNREACHABLE:%.*]] [ +; CHECK-NEXT: i32 1, label [[COMMON_RET:%.*]] +; CHECK-NEXT: i32 2, label [[BAR:%.*]] +; CHECK-NEXT: ] +; CHECK: sw2: +; CHECK-NEXT: store i64 42, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: br label [[COMMON_RET]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: unreachable: +; CHECK-NEXT: unreachable +; CHECK: bar: +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: br label [[COMMON_RET]] +; + br i1 %c, label %sw1, label %sw2 + +sw1: + ; This switch only exists to have an %unreachable block with multiple predecessors. 
+ switch i32 %x, label %unreachable [ + i32 1, label %foo + i32 2, label %bar + ] + +sw2: + switch i32 %x, label %bb3 [ + i32 1, label %bb1 + i32 2, label %bb2 + i32 3, label %unreachable + ] + +bb1: + store i64 42, ptr %ptr + ret void + +bb2: + store i64 42, ptr %ptr + ret void + +bb3: + store i64 42, ptr %ptr + ret void + +unreachable: + unreachable + +foo: + ret void + +bar: + call void @bar() + ret void +} From f86d95031eef3660a0bad6d663613e6815862193 Mon Sep 17 00:00:00 2001 From: nerix Date: Wed, 29 Oct 2025 19:48:33 +0100 Subject: [PATCH 131/539] [LLDB] Skip TestMultipleSlides.py on Windows (#165604) After the default PDB plugin changed to the native one (#165363), this test failed, because it uses the size of public symbols and the native plugin sets the size to 0 (as PDB doesn't include this information explicitly). A PDB was built because the final executable in that test was linked with `-gdwarf`. --- .../API/functionalities/multiple-slides/TestMultipleSlides.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py b/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py index 3d6b27fe68a1b..7fd2ff4229004 100644 --- a/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py +++ b/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py @@ -12,6 +12,10 @@ class MultipleSlidesTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True + # The intermediate object main.o is compiled without debug info, but + # a.out is linked with `-gdwarf` on Windows. This creates a PDB. + # However, in the native PDB plugin, the symbols don't have a size. 
+ @expectedFailureWindows def test_mulitple_slides(self): """Test that a binary can be slid multiple times correctly.""" self.build() From d29d819993618e2de4de7618635c34712ad96d45 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 29 Oct 2025 09:02:24 -0700 Subject: [PATCH 132/539] [SLP] Do not match the gather node with copyable parent, containing insert instruction If the gather/buildvector node has the match and this matching node has a scheduled copyable parent, and the parent node of the original node has a last instruction, which is non-schedulable and is part of the schedule copyable parent, such matching node should be excluded as non-matching, since it produces wrong def-use chain. Fixes #165435 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 5 +- .../X86/gathered-node-with-in-order-parent.ll | 49 +++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/gathered-node-with-in-order-parent.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 43166c035fe7a..1b55a3b235228 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -16920,7 +16920,10 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // otherwise TEPtr depends on TE. if ((TEInsertBlock != InsertPt->getParent() || TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) && - !CheckOrdering(InsertPt)) + (!CheckOrdering(InsertPt) || + (UseEI.UserTE->hasCopyableElements() && + isUsedOutsideBlock(const_cast(TEInsertPt)) && + is_contained(UseEI.UserTE->Scalars, TEInsertPt)))) continue; // The node is reused - exit. 
if (CheckAndUseSameNode(TEPtr)) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-node-with-in-order-parent.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-node-with-in-order-parent.ll new file mode 100644 index 0000000000000..260de1cc2b76a --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-node-with-in-order-parent.ll @@ -0,0 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define double @test() { +; CHECK-LABEL: define double @test() { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP3:%.*]], %[[BB4:.*]] ] +; CHECK-NEXT: br label %[[BB4]] +; CHECK: [[BB4]]: +; CHECK-NEXT: [[MUL:%.*]] = mul i32 0, 1 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[MUL]], i32 0 +; CHECK-NEXT: [[TMP3]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[MUL]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP0]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[TMP6]], 0 +; CHECK-NEXT: br i1 false, label %[[BB7:.*]], label %[[BB1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i32> [ [[TMP5]], %[[BB4]] ] +; CHECK-NEXT: ret double 0.000000e+00 +; +bb: + br label %bb1 + +bb1: + %phi = phi i32 [ 0, %bb ], [ 0, %bb4 ] + %phi2 = phi i32 [ 0, %bb ], [ 0, %bb4 ] + %phi3 = phi i32 [ 0, %bb ], [ %or5, %bb4 ] + br label %bb4 + +bb4: + %or = or i32 %phi2, 0 + %mul = mul i32 0, 1 + %or5 = or i32 %phi3, %mul + %and = and i32 %or, 0 + %or6 = or i32 %phi2, 1 + br i1 false, label %bb7, label %bb1 + +bb7: + %phi8 = phi i32 [ %phi, %bb4 ] + %phi9 = phi i32 [ 
%or, %bb4 ]
+  %phi10 = phi i32 [ %or5, %bb4 ]
+  %phi11 = phi i32 [ %or6, %bb4 ]
+  ret double 0.000000e+00
+}
+

From 50b7f5984c3d7fff66cb7eebf937acbd5bb76697 Mon Sep 17 00:00:00 2001
From: Ebuka Ezike
Date: Wed, 29 Oct 2025 18:53:00 +0000
Subject: [PATCH 133/539] [lldb-dap] Report any errors during attach request
 (#165270)

Attaching using `core`, `gdbremote` or `attachInfo` may have an error.
Fail early if it does.
---
 .../tools/lldb-dap/coreFile/TestDAP_coreFile.py | 15 +++++++++++++++
 .../lldb-dap/Handler/AttachRequestHandler.cpp   |  2 ++
 2 files changed, 17 insertions(+)

diff --git a/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py b/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py
index 1143cd93a70b3..d56a8a45ebf1e 100644
--- a/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py
+++ b/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py
@@ -61,6 +61,21 @@ def test_core_file(self):
         self.dap_server.request_next(threadId=32259)
         self.assertEqual(self.get_stackFrames(), expected_frames)
 
+    def test_wrong_core_file(self):
+        exe_file = self.getSourcePath("linux-x86_64.out")
+        wrong_core_file = self.getSourcePath("main.c")
+
+        self.create_debug_adapter()
+        resp = self.attach(
+            program=exe_file, coreFile=wrong_core_file, expectFailure=True
+        )
+        self.assertIsNotNone(resp)
+        self.assertFalse(resp["success"], "Expected failure in response {resp!r}")
+        error_msg = resp["body"]["error"]["format"]
+
+        # attach may fail for multiple reasons. 
+ self.assertEqual(error_msg, "Failed to create the process") + @skipIfLLVMTargetMissing("X86") def test_core_file_source_mapping_array(self): """Test that sourceMap property is correctly applied when loading a core""" diff --git a/lldb/tools/lldb-dap/Handler/AttachRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/AttachRequestHandler.cpp index 371349a26866e..490513fe8a0b8 100644 --- a/lldb/tools/lldb-dap/Handler/AttachRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/AttachRequestHandler.cpp @@ -124,6 +124,8 @@ Error AttachRequestHandler::Run(const AttachRequestArguments &args) const { attach_info.SetWaitForLaunch(args.waitFor, /*async=*/false); dap.target.Attach(attach_info, error); } + if (error.Fail()) + return ToError(error); } // Make sure the process is attached and stopped. From d741899af1de35df262ad8f5466b9f2bb6015e7e Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Wed, 29 Oct 2025 11:54:18 -0700 Subject: [PATCH 134/539] [DirectX] Add DXIL validation of `llvm.loop` metadata (#164292) This pr adds the equivalent validation of `llvm.loop` metadata that is [done in DXC](https://github.com/microsoft/DirectXShaderCompiler/blob/8f21027f2ad5dcfa63a275cbd278691f2c8fad33/lib/DxilValidation/DxilValidation.cpp#L3010). 
This is done as follows: - Add `llvm.loop` to the metadata allow-list in `DXILTranslateMetadata` - Iterate through all `llvm.loop` metadata nodes and strip all incompatible ones - Raise an error for ill-formed nodes that are compatible with DXIL Resolves: https://github.com/llvm/llvm-project/issues/137387 --- .../Target/DirectX/DXILTranslateMetadata.cpp | 223 ++++++++++++------ .../Target/DirectX/DXILTranslateMetadata.h | 17 ++ .../CodeGen/DirectX/Metadata/loop-md-errs.ll | 113 +++++++++ .../DirectX/Metadata/loop-md-stripped.ll | 58 +++++ .../CodeGen/DirectX/Metadata/loop-md-valid.ll | 95 ++++++++ .../Metadata/multiple-entries-cs-error.ll | 2 +- .../CodeGen/DirectX/metadata-stripping.ll | 5 +- 7 files changed, 443 insertions(+), 70 deletions(-) create mode 100644 llvm/test/CodeGen/DirectX/Metadata/loop-md-errs.ll create mode 100644 llvm/test/CodeGen/DirectX/Metadata/loop-md-stripped.ll create mode 100644 llvm/test/CodeGen/DirectX/Metadata/loop-md-valid.ll diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index 1e4797bbd05aa..e345bda23b133 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -36,9 +36,10 @@ using namespace llvm; using namespace llvm::dxil; namespace { -/// A simple Wrapper DiagnosticInfo that generates Module-level diagnostic -/// for TranslateMetadata pass -class DiagnosticInfoTranslateMD : public DiagnosticInfo { + +/// A simple wrapper of DiagnosticInfo that generates module-level diagnostic +/// for the DXILValidateMetadata pass +class DiagnosticInfoValidateMD : public DiagnosticInfo { private: const Twine &Msg; const Module &Mod; @@ -47,9 +48,9 @@ class DiagnosticInfoTranslateMD : public DiagnosticInfo { /// \p M is the module for which the diagnostic is being emitted. \p Msg is /// the message to show. 
Note that this class does not copy this message, so /// this reference must be valid for the whole life time of the diagnostic. - DiagnosticInfoTranslateMD(const Module &M, - const Twine &Msg LLVM_LIFETIME_BOUND, - DiagnosticSeverity Severity = DS_Error) + DiagnosticInfoValidateMD(const Module &M, + const Twine &Msg LLVM_LIFETIME_BOUND, + DiagnosticSeverity Severity = DS_Error) : DiagnosticInfo(DK_Unsupported, Severity), Msg(Msg), Mod(M) {} void print(DiagnosticPrinter &DP) const override { @@ -57,6 +58,16 @@ class DiagnosticInfoTranslateMD : public DiagnosticInfo { } }; +static void reportError(Module &M, Twine Message, + DiagnosticSeverity Severity = DS_Error) { + M.getContext().diagnose(DiagnosticInfoValidateMD(M, Message, Severity)); +} + +static void reportLoopError(Module &M, Twine Message, + DiagnosticSeverity Severity = DS_Error) { + reportError(M, Twine("Invalid \"llvm.loop\" metadata: ") + Message, Severity); +} + enum class EntryPropsTag { ShaderFlags = 0, GSState, @@ -314,25 +325,122 @@ static void translateBranchMetadata(Module &M, Instruction *BBTerminatorInst) { BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr); } -static std::array getCompatibleInstructionMDs(llvm::Module &M) { +// Determines if the metadata node will be compatible with DXIL's loop metadata +// representation. +// +// Reports an error for compatible metadata that is ill-formed. 
+static bool isLoopMDCompatible(Module &M, Metadata *MD) { + // DXIL only accepts the following loop hints: + std::array ValidHintNames = {"llvm.loop.unroll.count", + "llvm.loop.unroll.disable", + "llvm.loop.unroll.full"}; + + MDNode *HintMD = dyn_cast(MD); + if (!HintMD || HintMD->getNumOperands() == 0) + return false; + + auto *HintStr = dyn_cast(HintMD->getOperand(0)); + if (!HintStr) + return false; + + if (!llvm::is_contained(ValidHintNames, HintStr->getString())) + return false; + + auto ValidCountNode = [](MDNode *CountMD) -> bool { + if (CountMD->getNumOperands() == 2) + if (auto *Count = dyn_cast(CountMD->getOperand(1))) + if (isa(Count->getValue())) + return true; + return false; + }; + + if (HintStr->getString() == "llvm.loop.unroll.count") { + if (!ValidCountNode(HintMD)) { + reportLoopError(M, "\"llvm.loop.unroll.count\" must have 2 operands and " + "the second must be a constant integer"); + return false; + } + } else if (HintMD->getNumOperands() != 1) { + reportLoopError( + M, "\"llvm.loop.unroll.disable\" and \"llvm.loop.unroll.full\" " + "must be provided as a single operand"); + return false; + } + + return true; +} + +static void translateLoopMetadata(Module &M, Instruction *I, MDNode *BaseMD) { + // A distinct node has the self-referential form: !0 = !{ !0, ... } + auto IsDistinctNode = [](MDNode *Node) -> bool { + return Node && Node->getNumOperands() != 0 && Node == Node->getOperand(0); + }; + + // Set metadata to null to remove empty/ill-formed metadata from instruction + if (BaseMD->getNumOperands() == 0 || !IsDistinctNode(BaseMD)) + return I->setMetadata("llvm.loop", nullptr); + + // It is valid to have a chain of self-refential loop metadata nodes, as + // below. We will collapse these into just one when we reconstruct the + // metadata. 
+ // + // Eg: + // !0 = !{!0, !1} + // !1 = !{!1, !2} + // !2 = !{!"llvm.loop.unroll.disable"} + // + // So, traverse down a potential self-referential chain + while (1 < BaseMD->getNumOperands() && + IsDistinctNode(dyn_cast(BaseMD->getOperand(1)))) + BaseMD = dyn_cast(BaseMD->getOperand(1)); + + // To reconstruct a distinct node we create a temporary node that we will + // then update to create a self-reference. + llvm::TempMDTuple TempNode = llvm::MDNode::getTemporary(M.getContext(), {}); + SmallVector CompatibleOperands = {TempNode.get()}; + + // Iterate and reconstruct the metadata nodes that contains any hints, + // stripping any unrecognized metadata. + ArrayRef Operands = BaseMD->operands(); + for (auto &Op : Operands.drop_front()) + if (isLoopMDCompatible(M, Op.get())) + CompatibleOperands.push_back(Op.get()); + + if (2 < CompatibleOperands.size()) + reportLoopError(M, "Provided conflicting hints"); + + MDNode *CompatibleLoopMD = MDNode::get(M.getContext(), CompatibleOperands); + TempNode->replaceAllUsesWith(CompatibleLoopMD); + + I->setMetadata("llvm.loop", CompatibleLoopMD); +} + +using InstructionMDList = std::array; + +static InstructionMDList getCompatibleInstructionMDs(llvm::Module &M) { return { M.getMDKindID("dx.nonuniform"), M.getMDKindID("dx.controlflow.hints"), M.getMDKindID("dx.precise"), llvm::LLVMContext::MD_range, - llvm::LLVMContext::MD_alias_scope, llvm::LLVMContext::MD_noalias}; + llvm::LLVMContext::MD_alias_scope, llvm::LLVMContext::MD_noalias, + M.getMDKindID("llvm.loop")}; } static void translateInstructionMetadata(Module &M) { // construct allowlist of valid metadata node kinds - std::array DXILCompatibleMDs = getCompatibleInstructionMDs(M); + InstructionMDList DXILCompatibleMDs = getCompatibleInstructionMDs(M); + unsigned char MDLoopKind = M.getContext().getMDKindID("llvm.loop"); for (Function &F : M) { for (BasicBlock &BB : F) { // This needs to be done first so that "hlsl.controlflow.hints" isn't - // removed in the whitelist below 
+ // removed in the allow-list below if (auto *I = BB.getTerminator()) translateBranchMetadata(M, I); for (auto &I : make_early_inc_range(BB)) { + if (isa(I)) + if (MDNode *LoopMD = I.getMetadata(MDLoopKind)) + translateLoopMetadata(M, &I, LoopMD); I.dropUnknownNonDebugMetadata(DXILCompatibleMDs); } } @@ -389,31 +497,23 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM, uint64_t CombinedMask = ShaderFlags.getCombinedFlags(); EntryFnMDNodes.emplace_back( emitTopLevelLibraryNode(M, ResourceMD, CombinedMask)); - } else if (MMDI.EntryPropertyVec.size() > 1) { - M.getContext().diagnose(DiagnosticInfoTranslateMD( - M, "Non-library shader: One and only one entry expected")); - } + } else if (1 < MMDI.EntryPropertyVec.size()) + reportError(M, "Non-library shader: One and only one entry expected"); for (const EntryProperties &EntryProp : MMDI.EntryPropertyVec) { - const ComputedShaderFlags &EntrySFMask = - ShaderFlags.getFunctionFlags(EntryProp.Entry); - - // If ShaderProfile is Library, mask is already consolidated in the - // top-level library node. Hence it is not emitted. 
uint64_t EntryShaderFlags = 0; if (MMDI.ShaderProfile != Triple::EnvironmentType::Library) { - EntryShaderFlags = EntrySFMask; - if (EntryProp.ShaderStage != MMDI.ShaderProfile) { - M.getContext().diagnose(DiagnosticInfoTranslateMD( - M, - "Shader stage '" + - Twine(getShortShaderStage(EntryProp.ShaderStage) + - "' for entry '" + Twine(EntryProp.Entry->getName()) + - "' different from specified target profile '" + - Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) + - "'")))); - } + EntryShaderFlags = ShaderFlags.getFunctionFlags(EntryProp.Entry); + if (EntryProp.ShaderStage != MMDI.ShaderProfile) + reportError( + M, "Shader stage '" + + Twine(getShortShaderStage(EntryProp.ShaderStage)) + + "' for entry '" + Twine(EntryProp.Entry->getName()) + + "' different from specified target profile '" + + Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) + + "'")); } + EntryFnMDNodes.emplace_back(emitEntryMD(EntryProp, Signatures, ResourceMD, EntryShaderFlags, MMDI.ShaderProfile)); @@ -454,45 +554,34 @@ PreservedAnalyses DXILTranslateMetadata::run(Module &M, return PreservedAnalyses::all(); } -namespace { -class DXILTranslateMetadataLegacy : public ModulePass { -public: - static char ID; // Pass identification, replacement for typeid - explicit DXILTranslateMetadataLegacy() : ModulePass(ID) {} - - StringRef getPassName() const override { return "DXIL Translate Metadata"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - - AU.addPreserved(); - AU.addPreserved(); - AU.addPreserved(); - AU.addPreserved(); - AU.addPreserved(); - } +void DXILTranslateMetadataLegacy::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); +} - bool runOnModule(Module &M) 
override { - DXILResourceMap &DRM = - getAnalysis().getResourceMap(); - DXILResourceTypeMap &DRTM = - getAnalysis().getResourceTypeMap(); - const ModuleShaderFlags &ShaderFlags = - getAnalysis().getShaderFlags(); - dxil::ModuleMetadataInfo MMDI = - getAnalysis().getModuleMetadata(); - - translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI); - translateInstructionMetadata(M); - return true; - } -}; +bool DXILTranslateMetadataLegacy::runOnModule(Module &M) { + DXILResourceMap &DRM = + getAnalysis().getResourceMap(); + DXILResourceTypeMap &DRTM = + getAnalysis().getResourceTypeMap(); + const ModuleShaderFlags &ShaderFlags = + getAnalysis().getShaderFlags(); + dxil::ModuleMetadataInfo MMDI = + getAnalysis().getModuleMetadata(); -} // namespace + translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI); + translateInstructionMetadata(M); + return true; +} char DXILTranslateMetadataLegacy::ID = 0; diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h index 4c1ffac1781e6..cfb8aaa8f98b5 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h @@ -10,6 +10,7 @@ #define LLVM_TARGET_DIRECTX_DXILTRANSLATEMETADATA_H #include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" namespace llvm { @@ -20,6 +21,22 @@ class DXILTranslateMetadata : public PassInfoMixin { PreservedAnalyses run(Module &M, ModuleAnalysisManager &); }; +/// Wrapper pass for the legacy pass manager. +/// +/// This is required because the passes that will depend on this are codegen +/// passes which run through the legacy pass manager. 
+class DXILTranslateMetadataLegacy : public ModulePass { +public: + static char ID; // Pass identification, replacement for typeid + explicit DXILTranslateMetadataLegacy() : ModulePass(ID) {} + + StringRef getPassName() const override { return "DXIL Translate Metadata"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + bool runOnModule(Module &M) override; +}; + } // namespace llvm #endif // LLVM_TARGET_DIRECTX_DXILTRANSLATEMETADATA_H diff --git a/llvm/test/CodeGen/DirectX/Metadata/loop-md-errs.ll b/llvm/test/CodeGen/DirectX/Metadata/loop-md-errs.ll new file mode 100644 index 0000000000000..fbe4653b45dea --- /dev/null +++ b/llvm/test/CodeGen/DirectX/Metadata/loop-md-errs.ll @@ -0,0 +1,113 @@ +; RUN: split-file %s %t +; RUN: not opt -S --dxil-translate-metadata %t/args.ll 2>&1 | FileCheck %t/args.ll +; RUN: not opt -S --dxil-translate-metadata %t/bad-count.ll 2>&1 | FileCheck %t/bad-count.ll +; RUN: not opt -S --dxil-translate-metadata %t/invalid-disable.ll 2>&1 | FileCheck %t/invalid-disable.ll +; RUN: not opt -S --dxil-translate-metadata %t/invalid-full.ll 2>&1 | FileCheck %t/invalid-full.ll + +; Test that loop metadata is validated as with the DXIL validator + +;--- args.ll + +; CHECK: Invalid "llvm.loop" metadata: Provided conflicting hints + +target triple = "dxilv1.0-unknown-shadermodel6.0-library" + +define void @example_loop(i32 %n) { +entry: + br label %loop.header + +loop.header: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %loop.body, label %exit + +loop.body: + %i.next = add nsw i32 %i, 1 + br label %loop.header, !llvm.loop !1 + +exit: + ret void +} + +!1 = !{!1, !2, !3} ; conflicting args +!2 = !{!"llvm.loop.unroll.full"} +!3 = !{!"llvm.loop.unroll.disable"} + +;--- bad-count.ll + +; CHECK: "llvm.loop.unroll.count" must have 2 operands and the second must be a constant integer + +target triple = "dxilv1.0-unknown-shadermodel6.0-library" + +define void @example_loop(i32 %n) { 
+entry: + br label %loop.header + +loop.header: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %loop.body, label %exit + +loop.body: + %i.next = add nsw i32 %i, 1 + br label %loop.header, !llvm.loop !1 + +exit: + ret void +} + +!1 = !{!1, !2} +!2 = !{!"llvm.loop.unroll.count", !"not an int"} ; invalid count parameters + +;--- invalid-disable.ll + +; CHECK: Invalid "llvm.loop" metadata: "llvm.loop.unroll.disable" and "llvm.loop.unroll.full" must be provided as a single operand + +target triple = "dxilv1.0-unknown-shadermodel6.0-library" + +define void @example_loop(i32 %n) { +entry: + br label %loop.header + +loop.header: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %loop.body, label %exit + +loop.body: + %i.next = add nsw i32 %i, 1 + br label %loop.header, !llvm.loop !1 + +exit: + ret void +} + +!1 = !{!1, !2} +!2 = !{!"llvm.loop.unroll.disable", i32 0} ; invalid second operand + + +;--- invalid-full.ll + +; CHECK: Invalid "llvm.loop" metadata: "llvm.loop.unroll.disable" and "llvm.loop.unroll.full" must be provided as a single operand + +target triple = "dxilv1.0-unknown-shadermodel6.0-library" + +define void @example_loop(i32 %n) { +entry: + br label %loop.header + +loop.header: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %loop.body, label %exit + +loop.body: + %i.next = add nsw i32 %i, 1 + br label %loop.header, !llvm.loop !1 + +exit: + ret void +} + +!1 = !{!1, !2} +!2 = !{!"llvm.loop.unroll.full", i32 0} ; invalid second operand diff --git a/llvm/test/CodeGen/DirectX/Metadata/loop-md-stripped.ll b/llvm/test/CodeGen/DirectX/Metadata/loop-md-stripped.ll new file mode 100644 index 0000000000000..09d8aec2ff0e5 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/Metadata/loop-md-stripped.ll @@ -0,0 +1,58 @@ +; RUN: split-file %s %t +; RUN: opt -S --dxil-translate-metadata %t/not-distinct.ll 2>&1 | 
FileCheck %t/not-distinct.ll +; RUN: opt -S --dxil-translate-metadata %t/not-md.ll 2>&1 | FileCheck %t/not-md.ll + +; Test that DXIL incompatible loop metadata is stripped + +;--- not-distinct.ll + +; Ensure it is stripped because it is not provided a distinct loop parent +; CHECK-NOT: {!"llvm.loop.unroll.disable"} + +target triple = "dxilv1.0-unknown-shadermodel6.0-library" + +define void @example_loop(i32 %n) { +entry: + br label %loop.header + +loop.header: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %loop.body, label %exit + +loop.body: + %i.next = add nsw i32 %i, 1 + br label %loop.header, !llvm.loop !1 + +exit: + ret void +} + +!1 = !{!"llvm.loop.unroll.disable"} ; first node must be a distinct self-reference + + +;--- not-md.ll + +target triple = "dxilv1.0-unknown-shadermodel6.0-library" + +define void @example_loop(i32 %n) { +entry: + br label %loop.header + +loop.header: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %loop.body, label %exit + +loop.body: + %i.next = add nsw i32 %i, 1 + ; CHECK: br label %loop.header, !llvm.loop ![[#LOOP_MD:]] + br label %loop.header, !llvm.loop !1 + +exit: + ret void +} + +; CHECK: ![[#LOOP_MD:]] = distinct !{![[#LOOP_MD]]} + +!1 = !{!1, i32 0} ; second operand is not a metadata node diff --git a/llvm/test/CodeGen/DirectX/Metadata/loop-md-valid.ll b/llvm/test/CodeGen/DirectX/Metadata/loop-md-valid.ll new file mode 100644 index 0000000000000..a189c0e3f8aaa --- /dev/null +++ b/llvm/test/CodeGen/DirectX/Metadata/loop-md-valid.ll @@ -0,0 +1,95 @@ +; RUN: split-file %s %t +; RUN: opt -S --dxil-translate-metadata %t/count.ll | FileCheck %t/count.ll +; RUN: opt -S --dxil-translate-metadata %t/disable.ll | FileCheck %t/disable.ll +; RUN: opt -S --dxil-translate-metadata %t/full.ll | FileCheck %t/full.ll + +;--- count.ll + +; Test that we collapse a self-referential chain and allow a unroll.count hint + +target triple 
= "dxilv1.0-unknown-shadermodel6.0-library" + +define void @example_loop(i32 %n) { +entry: + br label %loop.header + +loop.header: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %loop.body, label %exit + +loop.body: + %i.next = add nsw i32 %i, 1 + ; CHECK: br label %loop.header, !llvm.loop ![[#LOOP_MD:]] + br label %loop.header, !llvm.loop !0 + +exit: + ret void +} + +; CHECK: ![[#LOOP_MD]] = distinct !{![[#LOOP_MD]], ![[#COUNT:]]} +; CHECK: ![[#COUNT]] = !{!"llvm.loop.unroll.count", i6 4} + +!0 = !{!0, !1} +!1 = !{!1, !2} +!2 = !{!"llvm.loop.unroll.count", i6 4} + +;--- disable.ll + +; Test that we allow a disable hint + +target triple = "dxilv1.0-unknown-shadermodel6.0-library" + +define void @example_loop(i32 %n) { +entry: + br label %loop.header + +loop.header: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %loop.body, label %exit + +loop.body: + %i.next = add nsw i32 %i, 1 + ; CHECK: br label %loop.header, !llvm.loop ![[#LOOP_MD:]] + br label %loop.header, !llvm.loop !0 + +exit: + ret void +} + +; CHECK: ![[#LOOP_MD]] = distinct !{![[#LOOP_MD]], ![[#DISABLE:]]} +; CHECK: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"} + +!0 = !{!0, !1} +!1 = !{!"llvm.loop.unroll.disable"} + +;--- full.ll + +; Test that we allow a full hint + +target triple = "dxilv1.0-unknown-shadermodel6.0-library" + +define void @example_loop(i32 %n) { +entry: + br label %loop.header + +loop.header: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop.body ] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %loop.body, label %exit + +loop.body: + %i.next = add nsw i32 %i, 1 + ; CHECK: br label %loop.header, !llvm.loop ![[#LOOP_MD:]] + br label %loop.header, !llvm.loop !0 + +exit: + ret void +} + +; CHECK: ![[#LOOP_MD]] = distinct !{![[#LOOP_MD]], ![[#FULL:]]} +; CHECK: ![[#FULL]] = !{!"llvm.loop.unroll.full"} + +!0 = !{!0, !1} +!1 = !{!"llvm.loop.unroll.full"} diff --git 
a/llvm/test/CodeGen/DirectX/Metadata/multiple-entries-cs-error.ll b/llvm/test/CodeGen/DirectX/Metadata/multiple-entries-cs-error.ll index 9697d4389a888..5740ee11401f2 100644 --- a/llvm/test/CodeGen/DirectX/Metadata/multiple-entries-cs-error.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/multiple-entries-cs-error.ll @@ -1,4 +1,4 @@ -; RUN: not opt -S -S -dxil-translate-metadata %s 2>&1 | FileCheck %s +; RUN: not opt -S -dxil-translate-metadata %s 2>&1 | FileCheck %s target triple = "dxil-pc-shadermodel6.8-compute" ; CHECK: Non-library shader: One and only one entry expected diff --git a/llvm/test/CodeGen/DirectX/metadata-stripping.ll b/llvm/test/CodeGen/DirectX/metadata-stripping.ll index 531ab6c334d24..53716ff29f292 100644 --- a/llvm/test/CodeGen/DirectX/metadata-stripping.ll +++ b/llvm/test/CodeGen/DirectX/metadata-stripping.ll @@ -14,7 +14,7 @@ entry: %cmp.i = icmp ult i32 1, 2 ; Ensure that the !llvm.loop metadata node gets dropped. - ; CHECK: br i1 %cmp.i, label %_Z4mainDv3_j.exit, label %_Z4mainDv3_j.exit{{$}} + ; CHECK: br i1 %cmp.i, label %_Z4mainDv3_j.exit, label %_Z4mainDv3_j.exit, !llvm.loop [[LOOPMD:![0-9]+]] br i1 %cmp.i, label %_Z4mainDv3_j.exit, label %_Z4mainDv3_j.exit, !llvm.loop !0 _Z4mainDv3_j.exit: ; preds = %for.body.i, %entry @@ -25,7 +25,8 @@ _Z4mainDv3_j.exit: ; preds = %for.body.i, %entry ; No more metadata should be necessary, the rest (the current 0 and 1) ; should be removed. 
; CHECK-NOT: !{!"llvm.loop.mustprogress"}
-; CHECK: [[RANGEMD]] = !{i32 1, i32 5}
+; CHECK-DAG: [[RANGEMD]] = !{i32 1, i32 5}
+; CHECK-DAG: [[LOOPMD]] = distinct !{[[LOOPMD]]}
 ; CHECK-NOT: !{!"llvm.loop.mustprogress"}
 !0 = distinct !{!0, !1}
 !1 = !{!"llvm.loop.mustprogress"}

From 83eda715810be48cf6e096457ee353b1c134cbc0 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Wed, 29 Oct 2025 12:35:01 -0700
Subject: [PATCH 135/539] [AMDGPU] Support true16 spill restore with sram-ecc
 (#165320)

---
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp   |  25 ++-
 llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir |  67 ++++++
 llvm/test/CodeGen/AMDGPU/spillv16.ll        | 235 ++++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/spillv16.mir       |  22 ++
 4 files changed, 348 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index d80a6f339c8f6..a6c1af24e13e9 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1823,6 +1823,16 @@ void SIRegisterInfo::buildSpillLoadStore(
     }
   }
 
+  Register FinalValueReg = ValueReg;
+  if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) {
+    // If we are loading 16-bit value with SRAMECC enabled we need a temp
+    // 32-bit VGPR to load and extract 16-bits into the final register.
+    ValueReg =
+        RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
+    SubReg = ValueReg;
+    IsKill = false;
+  }
+
   MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
   MachineMemOperand *NewMMO =
       MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
@@ -1863,6 +1873,17 @@
     MIB.addImm(0); // swz
     MIB.addMemOperand(NewMMO);
 
+    if (FinalValueReg != ValueReg) {
+      // Extract 16-bit from the loaded 32-bit value. 
+ ValueReg = getSubReg(ValueReg, AMDGPU::lo16); + MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64)) + .addReg(FinalValueReg, getDefRegState(true)) + .addImm(0) + .addReg(ValueReg, getKillRegState(true)) + .addImm(0); + ValueReg = FinalValueReg; + } + if (!IsAGPR && NeedSuperRegDef) MIB.addReg(ValueReg, RegState::ImplicitDefine); @@ -2505,7 +2526,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned Opc; if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) { assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!"); - Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16; + Opc = ST.d16PreservesUnusedBits() + ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16 + : AMDGPU::SCRATCH_LOAD_USHORT_SADDR; } else { Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR diff --git a/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir b/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir index 0c694d9f49e18..69895833efccb 100644 --- a/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir +++ b/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 -mattr=+real-true16 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=EXPANDED %s +# RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1250 -mattr=+real-true16 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=SRAMECC-EXPANDED %s --- name: spill_restore_vgpr16 @@ -31,6 +32,28 @@ body: | ; EXPANDED-NEXT: $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5) ; EXPANDED-NEXT: $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5) ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16 + ; + ; 
SRAMECC-EXPANDED-LABEL: name: spill_restore_vgpr16 + ; SRAMECC-EXPANDED: bb.0: + ; SRAMECC-EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16 + ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit renamable $vgpr0_lo16 + ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: bb.1: + ; SRAMECC-EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: S_NOP 1 + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: bb.2: + ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec + ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16 bb.0: successors: %bb.1(0x80000000) S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16 @@ -78,6 +101,29 @@ body: | ; EXPANDED-NEXT: $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5) ; 
EXPANDED-NEXT: $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5) ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16 + ; + ; SRAMECC-EXPANDED-LABEL: name: spill_restore_vgpr16_middle_of_block + ; SRAMECC-EXPANDED: bb.0: + ; SRAMECC-EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16 + ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit renamable $vgpr0_lo16 + ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: bb.1: + ; SRAMECC-EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: S_NOP 1 + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: bb.2: + ; SRAMECC-EXPANDED-NEXT: S_NOP 1 + ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec + ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16 
bb.0: successors: %bb.1(0x80000000) S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16 @@ -124,6 +170,27 @@ body: | ; EXPANDED-NEXT: bb.2: ; EXPANDED-NEXT: $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5) ; EXPANDED-NEXT: $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5) + ; + ; SRAMECC-EXPANDED-LABEL: name: spill_restore_vgpr16_end_of_block + ; SRAMECC-EXPANDED: bb.0: + ; SRAMECC-EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16 + ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit renamable $vgpr0_lo16 + ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: bb.1: + ; SRAMECC-EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: S_NOP 1 + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: bb.2: + ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec + ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5) + ; 
SRAMECC-EXPANDED-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec bb.0: successors: %bb.1(0x80000000) S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16 diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll index 0e45df223465d..2d54ac8283a3a 100644 --- a/llvm/test/CodeGen/AMDGPU/spillv16.ll +++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-TRUE16 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-FAKE16 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-TRUE16 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-FAKE16 define void @spill_i16_alu() { ; GCN-TRUE16-LABEL: spill_i16_alu: @@ -32,6 +34,41 @@ define void @spill_i16_alu() { ; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: spill_i16_alu: +; GFX1250-TRUE16: ; %bb.0: ; %entry +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX1250-TRUE16-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-NEXT: scratch_load_u16 v1, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: spill_i16_alu: +; GFX1250-FAKE16: ; %bb.0: ; %entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0 +; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: ;;#ASMSTART +; GFX1250-FAKE16-NEXT: ;;#ASMEND +; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca i16, i32 1, align 4, addrspace(5) @@ -88,6 +125,51 @@ define void @spill_i16_alu_two_vals() { ; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: spill_i16_alu_two_vals: +; GFX1250-TRUE16: ; %bb.0: ; %entry +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill +; GFX1250-TRUE16-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v1, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX1250-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: spill_i16_alu_two_vals: +; GFX1250-FAKE16: ; %bb.0: ; %entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0 +; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: ;;#ASMSTART +; GFX1250-FAKE16-NEXT: ;;#ASMEND +; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_b32 v1, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v1, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca i16, i32 1, align 4, addrspace(5) %alloca2 = alloca i16, i32 1, align 4, addrspace(5) @@ 
-140,6 +222,22 @@ define void @spill_i16() { ; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: spill_i16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: ;;#ASMSTART +; GFX1250-NEXT: ;;#ASMEND +; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca i16, i32 1, align 4, addrspace(5) @@ -183,6 +281,22 @@ define void @spill_half() { ; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: spill_half: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: ;;#ASMSTART +; GFX1250-NEXT: ;;#ASMEND +; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca half, i32 1, align 4, addrspace(5) @@ -226,6 +340,22 @@ define void @spill_i16_from_v2i16() { ; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc ; GCN-FAKE16-NEXT: s_waitcnt_vscnt 
null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: spill_i16_from_v2i16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: ;;#ASMSTART +; GFX1250-NEXT: ;;#ASMEND +; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5) @@ -283,6 +413,54 @@ define void @spill_2xi16_from_v2i16() { ; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16: +; GFX1250-TRUE16: ; %bb.0: ; %entry +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_clause 0x1 +; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:12 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: spill_2xi16_from_v2i16: +; GFX1250-FAKE16: ; %bb.0: ; %entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_clause 0x1 +; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 +; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:12 ; 4-byte Folded Spill +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: ;;#ASMSTART +; GFX1250-FAKE16-NEXT: ;;#ASMEND +; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5) @@ -341,6 +519,47 @@ define void @spill_2xi16_from_v2i16_one_free_reg() { ; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: +; GFX1250-TRUE16: ; %bb.0: ; %entry +; GFX1250-TRUE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v7, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.l +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: +; GFX1250-FAKE16: ; %bb.0: ; %entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_u16 v7, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: ;;#ASMSTART +; GFX1250-FAKE16-NEXT: ;;#ASMEND +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v7, s32 offset:2 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5) @@ -375,6 +594,22 @@ define void @spill_v2i16() { ; GCN-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: spill_v2i16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: ;;#ASMSTART +; GFX1250-NEXT: ;;#ASMEND +; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.mir b/llvm/test/CodeGen/AMDGPU/spillv16.mir index 05569bf394c43..ba2d926eb8883 100644 --- a/llvm/test/CodeGen/AMDGPU/spillv16.mir +++ b/llvm/test/CodeGen/AMDGPU/spillv16.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 -mattr=+real-true16 -run-pass=regallocfast -o - %s | FileCheck -check-prefix=SPILLED %s # RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 -mattr=+real-true16 -run-pass=regallocfast,prologepilog -o - %s | FileCheck -check-prefix=EXPANDED %s +# RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1250 -mattr=+real-true16 -run-pass=regallocfast,prologepilog -o - %s | FileCheck -check-prefix=SRAMECC-EXPANDED %s --- name: spill_restore_vgpr16 @@ -46,6 +47,27 @@ body: | ; EXPANDED-NEXT: $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, 
implicit $flat_scr :: (load (s16) from %stack.0, addrspace 5) ; EXPANDED-NEXT: $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 2, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, addrspace 5) ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16 + ; + ; SRAMECC-EXPANDED-LABEL: name: spill_restore_vgpr16 + ; SRAMECC-EXPANDED: bb.0: + ; SRAMECC-EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16 + ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 2, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: bb.1: + ; SRAMECC-EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: S_NOP 1 + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: bb.2: + ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec + ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 2, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16 bb.0: S_NOP 0, implicit-def %0:vgpr_16, implicit-def %1:vgpr_16 S_CBRANCH_SCC1 implicit undef 
$scc, %bb.1 From 85b2d606cc35e573714b7a43e20c45e76776abcb Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Wed, 29 Oct 2025 12:48:03 -0700 Subject: [PATCH 136/539] [AArch64][PAC] Fix an implicit pointer-to-bool conversion (#165056) ... which silently caused the wrong overload to be selected. --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 457e5402e0f46..ccc8eb8a9706d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -122,7 +122,7 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { NumBytes = Desc.getSize() ? Desc.getSize() : 4; const auto *MFI = MF->getInfo(); - if (!MFI->shouldSignReturnAddress(MF)) + if (!MFI->shouldSignReturnAddress(*MF)) return NumBytes; const auto &STI = MF->getSubtarget(); From 688af7bd3684ddff754bd946fbd93277a5dda4a9 Mon Sep 17 00:00:00 2001 From: Dan Blackwell Date: Wed, 29 Oct 2025 19:56:08 +0000 Subject: [PATCH 137/539] [TSan][Test-Only][Darwin] Fix typo in external.cpp again (#165612) --- compiler-rt/test/tsan/Darwin/external.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/tsan/Darwin/external.cpp b/compiler-rt/test/tsan/Darwin/external.cpp index 8372a1eb125f3..bf189eb1d6b5b 100644 --- a/compiler-rt/test/tsan/Darwin/external.cpp +++ b/compiler-rt/test/tsan/Darwin/external.cpp @@ -70,7 +70,7 @@ int main(int argc, char *argv[]) { // TEST3: WARNING: ThreadSanitizer: race on MyLibrary::MyObject // TEST3: {{Modifying|Read-only}} access of MyLibrary::MyObject at // TEST3: {{ObjectWrite|ObjectRead}} - // TEST3: Previous {{modifying|Read-only}} access of MyLibrary::MyObject at + // TEST3: Previous {{modifying|read-only}} access of MyLibrary::MyObject at // TEST3: {{ObjectWrite|ObjectRead}} // TEST3: Location is MyLibrary::MyObject of size 16 at // TEST3: 
{{ObjectCreate}} From 26b253df0b0e2309b6a722362dcbb55ee15fdd8c Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Wed, 29 Oct 2025 16:09:59 -0400 Subject: [PATCH 138/539] [mlir][amdgpu][rocdl] Allow for graceful wmma conversion failures (#165616) --- .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 6 +++--- .../Conversion/AMDGPUToROCDL/wmma-gfx11.mlir | 4 ++-- .../Conversion/AMDGPUToROCDL/wmma-gfx12.mlir | 13 +++++++++++- .../AMDGPUToROCDL/wmma-gfx1250.mlir | 20 ++++++++++++++----- 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 1eca43d96fe85..41e333c621eda 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -1043,7 +1043,7 @@ wmmaOpToIntrinsicRDNA(Type elemSourceType, Type elemBSourceType, return ROCDL::wmma_i32_16x16x32_iu4::getOperationName(); } - llvm_unreachable("Unsupported k value"); + return std::nullopt; } /// Return the `rocdl` intrinsic corresponding to a WMMA operation `wmma` @@ -1135,7 +1135,7 @@ static std::optional wmmaOpToIntrinsicGfx1250(Type elemSourceType, return std::nullopt; } - llvm_unreachable("Unsupported k value"); + return std::nullopt; } /// Returns the `rocdl` intrinsic corresponding to a WMMA operation `wmma` @@ -1164,7 +1164,7 @@ static std::optional wmmaOpToIntrinsic(WMMAOp wmma, return wmmaOpToIntrinsicGfx1250(elemSourceType, elemBSourceType, elemDestType, k); - llvm_unreachable("unhandled WMMA case"); + return std::nullopt; } namespace { diff --git a/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx11.mlir b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx11.mlir index d1301d0089220..9fcc1473d4a18 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx11.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx11.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1100 --allow-unregistered-dialect | FileCheck %s +// RUN: mlir-opt 
%s --convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s // CHECK-LABEL: @wmma_to_rocdl func.func @wmma_to_rocdl(%arg0 : vector<16xf16>, %arg1 : vector<8xf32>, %arg2 : vector<4xf32>, @@ -32,5 +32,5 @@ func.func @wmma_to_rocdl(%arg0 : vector<16xf16>, %arg1 : vector<8xf32>, %arg2 : // CHECK: rocdl.wmma.i32.16x16x16.iu4{{.*}}: (i1, i32, i1, i32, vector<4xi32>, i1) -> vector<4xi32> amdgpu.wmma 16x16x16 %arg11 * %arg11 + %arg8 {clamp}: vector<8xi4>, vector<8xi4>, vector<4xi32> - func.return + return } diff --git a/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx12.mlir b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx12.mlir index b897323340402..57883473bbf06 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx12.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx12.mlir @@ -1,4 +1,6 @@ -// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1200 --allow-unregistered-dialect | FileCheck %s +// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1200 \ +// RUN: --split-input-file --verify-diagnostics | FileCheck %s + // CHECK-LABEL: @wmma_to_rocdl func.func @wmma_to_rocdl(%arg0 : vector<8xf16>, %arg1 : vector<4xf16>, %arg2 : vector<8xf32>, %arg3 : vector<4xf32>, @@ -66,3 +68,12 @@ func.func @wmma_to_rocdl(%arg0 : vector<8xf16>, %arg1 : vector<4xf16>, func.return } + +// ----- + +func.func @wmma_unsupported_k(%arg0 : vector<64xf8E4M3FN>, %arg1 : vector<8xf16>) { + // expected-error@below {{'amdgpu.wmma' op no intrinsic matching WMMA on the given chipset}} + // expected-error@below {{failed to legalize operation 'amdgpu.wmma'}} + amdgpu.wmma 16x16x128 %arg0 * %arg0 + %arg1 : vector<64xf8E4M3FN>, vector<64xf8E4M3FN>, vector<8xf16> + func.return +} diff --git a/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx1250.mlir index bcbdef040ebe3..5e77a3add3184 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx1250.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx1250.mlir @@ -1,10 +1,11 @@ -// RUN: mlir-opt %s 
--convert-amdgpu-to-rocdl=chipset=gfx1250 --allow-unregistered-dialect | FileCheck %s +// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1250 \ +// RUN: --split-input-file --verify-diagnostics | FileCheck %s // CHECK-LABEL: @wmma_k4 func.func @wmma_k4(%arg0 : vector<2xf32>, %arg1 : vector<8xf32>) { // CHECK: rocdl.wmma.f32.16x16x4.f32 %arg0, %arg0, %arg1 amdgpu.wmma 16x16x4 %arg0 * %arg0 + %arg1 : vector<2xf32>, vector<2xf32>, vector<8xf32> - func.return + return } // CHECK-LABEL: @wmma_k32 @@ -22,7 +23,7 @@ func.func @wmma_k32(%arg0 : vector<16xf16>, %arg1 : vector<16xbf16>, %arg2 : vec // CHECK: rocdl.wmma.bf16.16x16x32.bf16 {{.*}}, {{.*}}, {{.*}}, {{.*}} : (vector<16xi16>, vector<16xi16>, vector<8xi16>, i1) amdgpu.wmma 16x16x32 %arg1 * %arg1 + %arg4 : vector<16xbf16>, vector<16xbf16>, vector<8xbf16> - func.return + return } // CHECK-LABEL: @wmma_k64 @@ -55,7 +56,7 @@ func.func @wmma_k64(%arg0 : vector<32xi8>, %arg1 : vector<32xf8E4M3FN>, %arg2 : // CHECK: rocdl.wmma.f16.16x16x64.bf8_fp8 {{.*}}, {{.*}}, %arg5, {{.*}} : (vector<8xi32>, vector<8xi32>, vector<8xf16>, i1) amdgpu.wmma 16x16x64 %arg2 * %arg1 + %arg5 : vector<32xf8E5M2>, vector<32xf8E4M3FN>, vector<8xf16> - func.return + return } // CHECK-LABEL: @wmma_k128 @@ -85,5 +86,14 @@ func.func @wmma_k128(%arg0 : vector<64xf8E4M3FN>, %arg1 : vector<64xf8E5M2>, // CHECK: rocdl.wmma.f16.16x16x128.bf8_fp8 {{.*}}, {{.*}}, %arg3, {{.*}} : (vector<16xi32>, vector<16xi32>, vector<8xf16>, i1) amdgpu.wmma 16x16x128 %arg1 * %arg0 + %arg3 : vector<64xf8E5M2>, vector<64xf8E4M3FN>, vector<8xf16> - func.return + return +} + +// ----- + +func.func @wmma_unsupported_k(%arg0 : vector<8xf16>, %arg1 : vector<8xf32>) { + // expected-error@below {{'amdgpu.wmma' op no intrinsic matching WMMA on the given chipset}} + // expected-error@below {{failed to legalize operation 'amdgpu.wmma'}} + amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 : vector<8xf16>, vector<8xf16>, vector<8xf32> + return } From 
c59db618e15e839fd93af32ca3b7dd09886c14ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 29 Oct 2025 10:19:18 -1000 Subject: [PATCH 139/539] [flang][cuda] Convert src and dst to llvm.ptr in tma_bulk_load (#165618) --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 2 ++ flang/test/Lower/CUDA/cuda-device-proc.cuf | 14 +++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 53fe9c0d2f6f0..ca3e1cd46db7d 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -9362,6 +9362,8 @@ static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); barrier = builder.createConvert(loc, llvmPtrTy, barrier); + dst = builder.createConvert(loc, llvmPtrTy, dst); + src = builder.createConvert(loc, llvmPtrTy, src); mlir::NVVM::InlinePtxOp::create( builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {}, "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index d8c78887ff924..e5d3c437d7152 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -544,7 +544,7 @@ end subroutine ! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32 ! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 ! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr -! 
CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref>>, !fir.ref>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) ! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) attributes(global) subroutine test_tma_bulk_load_c8(a, n) @@ -563,7 +563,7 @@ end subroutine ! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 16 : i32 ! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 ! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr -! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref>>, !fir.ref>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) ! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) attributes(global) subroutine test_tma_bulk_load_i4(a, n) @@ -582,7 +582,7 @@ end subroutine ! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32 ! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 ! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr -! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref>, !fir.ref, i32, !llvm.ptr) +! 
CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) ! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) attributes(global) subroutine test_tma_bulk_load_i8(a, n) @@ -601,7 +601,7 @@ end subroutine ! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32 ! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 ! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr -! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref>, !fir.ref, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) ! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) attributes(global) subroutine test_tma_bulk_load_r2(a, n) @@ -620,7 +620,7 @@ end subroutine ! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 2 : i32 ! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 ! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr -! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref>, !fir.ref, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) ! 
CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) attributes(global) subroutine test_tma_bulk_load_r4(a, n) @@ -639,7 +639,7 @@ end subroutine ! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32 ! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 ! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr -! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref>, !fir.ref, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) ! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) attributes(global) subroutine test_tma_bulk_load_r8(a, n) @@ -658,7 +658,7 @@ end subroutine ! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32 ! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 ! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref) -> !llvm.ptr -! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref>, !fir.ref, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !llvm.ptr, !llvm.ptr, i32, !llvm.ptr) ! 
CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) attributes(global) subroutine test_tma_bulk_store_c4(c, n) From 3e1f686a2a97fb1eb487a6269fa6235474a291ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20R=C3=B8nne=20Petersen?= Date: Wed, 29 Oct 2025 21:30:54 +0100 Subject: [PATCH 140/539] [libc++] Fix locale-related compilation errors on NetBSD (#143055) To my knowledge, NetBSD is mostly like other BSDs, but doesn't have `xlocale.h`. I think c664a7f may have inadvertently broken this. With this change, I was able to run [zig-bootstrap](https://github.com/ziglang/zig-bootstrap) to completion for `x86_64-netbsd10.1-none`. --- libcxx/include/CMakeLists.txt | 1 + libcxx/include/__locale_dir/locale_base_api.h | 2 ++ .../include/__locale_dir/support/bsd_like.h | 4 +++- libcxx/include/__locale_dir/support/netbsd.h | 20 +++++++++++++++++++ libcxx/include/module.modulemap.in | 1 + 5 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 libcxx/include/__locale_dir/support/netbsd.h diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 37259a7e6e7dd..de9819cf5346a 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -529,6 +529,7 @@ set(files __locale_dir/support/freebsd.h __locale_dir/support/fuchsia.h __locale_dir/support/linux.h + __locale_dir/support/netbsd.h __locale_dir/support/no_locale/characters.h __locale_dir/support/no_locale/strtonum.h __locale_dir/support/windows.h diff --git a/libcxx/include/__locale_dir/locale_base_api.h b/libcxx/include/__locale_dir/locale_base_api.h index 9f3ce02a3af20..8c8f00061d1ed 100644 --- a/libcxx/include/__locale_dir/locale_base_api.h +++ b/libcxx/include/__locale_dir/locale_base_api.h @@ -115,6 +115,8 @@ # include <__locale_dir/support/apple.h> # elif defined(__FreeBSD__) # include <__locale_dir/support/freebsd.h> +# elif defined(__NetBSD__) +# include <__locale_dir/support/netbsd.h> # 
elif defined(_LIBCPP_MSVCRT_LIKE) # include <__locale_dir/support/windows.h> # elif defined(__Fuchsia__) diff --git a/libcxx/include/__locale_dir/support/bsd_like.h b/libcxx/include/__locale_dir/support/bsd_like.h index ac402924709e5..9d4bdd1d5775f 100644 --- a/libcxx/include/__locale_dir/support/bsd_like.h +++ b/libcxx/include/__locale_dir/support/bsd_like.h @@ -24,7 +24,9 @@ # include #endif -#include +#if __has_include() +# include +#endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__locale_dir/support/netbsd.h b/libcxx/include/__locale_dir/support/netbsd.h new file mode 100644 index 0000000000000..190857f6f84fe --- /dev/null +++ b/libcxx/include/__locale_dir/support/netbsd.h @@ -0,0 +1,20 @@ +//===-----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___LOCALE_DIR_SUPPORT_NETBSD_H +#define _LIBCPP___LOCALE_DIR_SUPPORT_NETBSD_H + +#include <__config> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#include <__locale_dir/support/bsd_like.h> + +#endif // _LIBCPP___LOCALE_DIR_SUPPORT_NETBSD_H diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index a86d6c6a43d0e..11ab61d959e22 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -1587,6 +1587,7 @@ module std [system] { textual header "__locale_dir/support/freebsd.h" textual header "__locale_dir/support/fuchsia.h" textual header "__locale_dir/support/linux.h" + textual header "__locale_dir/support/netbsd.h" textual header "__locale_dir/support/no_locale/characters.h" textual header 
"__locale_dir/support/no_locale/strtonum.h" textual header "__locale_dir/support/windows.h" From 1344a25dedd3c127a29958e5ecc8e8c57fb8d55f Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 29 Oct 2025 20:31:09 +0000 Subject: [PATCH 141/539] [gn build] Port e9389436e5ea --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 278c29c766ddb..c20d2aa033690 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -1175,6 +1175,7 @@ if (current_toolchain == default_toolchain) { "__locale_dir/support/freebsd.h", "__locale_dir/support/fuchsia.h", "__locale_dir/support/linux.h", + "__locale_dir/support/netbsd.h", "__locale_dir/support/no_locale/characters.h", "__locale_dir/support/no_locale/strtonum.h", "__locale_dir/support/windows.h", From 7591fb3fe294a37ba2a08a4309ef8a15227cb4f4 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Wed, 29 Oct 2025 15:40:05 -0500 Subject: [PATCH 142/539] [mlir][sparse] Include sparse emit strategy in wrapping iterator (#165611) When we create a `SparseIterator`, we sometimes wrap it in a `FilterIterator`, which delegates _some_ calls to the underlying `SparseIterator`. After construction, e.g. in `makeNonEmptySubSectIterator()`, we call `setSparseEmitStrategy()`. This sets the strategy only in one of the filters -- if we call `setSparseEmitStrategy()` immediately after creating the `SparseIterator`, then the wrapped `SparseIterator` will have the right strategy, and the `FilterIterator` strategy will be unintialized; if we call `setSparseEmitStrategy()` after wrapping the iterator in `FilterIterator`, then the opposite happens. 
If we make `setSparseEmitStrategy()` a virtual method so that it's included in the `FilterIterator` pattern, and then do all reads of `emitStrategy` via a virtual method as well, it's pretty simple to ensure that the value of `strategy` is being set consistently and correctly. Without this, the UB of strategy being uninitialized manifests as a sporadic test failure in mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_strided_conv_2d_nhwc_hwcf.mlir, when run downstream with the right flags (e.g. asan + assertions off). The test sometimes fails with `ne_sub>.begin' op created with unregistered dialect`. It can also be directly observed w/ msan that this uninitialized read is the cause of that issue, but msan causes other problems w/ this test. --- .../Transforms/Utils/SparseTensorIterator.cpp | 18 +++++++++++++----- .../Transforms/Utils/SparseTensorIterator.h | 6 +++++- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp index 46d0baac58f06..61b5ad600a16e 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp @@ -504,6 +504,14 @@ class SimpleWrapIterator : public SparseIterator { unsigned extraCursorVal = 0) : SparseIterator(kind, *wrap, extraCursorVal), wrap(std::move(wrap)) {} + void setSparseEmitStrategy(SparseEmitStrategy strategy) override { + wrap->setSparseEmitStrategy(strategy); + } + + SparseEmitStrategy getSparseEmitStrategy() const override { + return wrap->getSparseEmitStrategy(); + } + SmallVector getCursorValTypes(OpBuilder &b) const override { return wrap->getCursorValTypes(b); } @@ -979,7 +987,7 @@ class SubSectIterator : public SparseIterator { void SparseIterator::genInit(OpBuilder &b, Location l, const SparseIterator *p) { - if (emitStrategy == SparseEmitStrategy::kDebugInterface) { + 
if (getSparseEmitStrategy() == SparseEmitStrategy::kDebugInterface) { std::string prefix = getDebugInterfacePrefix(); Operation *begin = b.create(l, b.getStringAttr(prefix + ".begin"), {}, getCursorValTypes(b)); @@ -994,7 +1002,7 @@ void SparseIterator::genInit(OpBuilder &b, Location l, } Value SparseIterator::genNotEnd(OpBuilder &b, Location l) { - if (emitStrategy == SparseEmitStrategy::kDebugInterface) { + if (getSparseEmitStrategy() == SparseEmitStrategy::kDebugInterface) { std::string prefix = getDebugInterfacePrefix(); Operation *notEnd = b.create(l, b.getStringAttr(prefix + ".not_end"), getCursor(), b.getI1Type()); @@ -1005,7 +1013,7 @@ Value SparseIterator::genNotEnd(OpBuilder &b, Location l) { } void SparseIterator::locate(OpBuilder &b, Location l, Value crd) { - if (emitStrategy == SparseEmitStrategy::kDebugInterface) { + if (getSparseEmitStrategy() == SparseEmitStrategy::kDebugInterface) { std::string prefix = getDebugInterfacePrefix(); SmallVector args = getCursor(); args.push_back(crd); @@ -1019,7 +1027,7 @@ void SparseIterator::locate(OpBuilder &b, Location l, Value crd) { } Value SparseIterator::deref(OpBuilder &b, Location l) { - if (emitStrategy == SparseEmitStrategy::kDebugInterface) { + if (getSparseEmitStrategy() == SparseEmitStrategy::kDebugInterface) { std::string prefix = getDebugInterfacePrefix(); SmallVector args = getCursor(); Operation *deref = b.create(l, b.getStringAttr(prefix + ".deref"), @@ -1032,7 +1040,7 @@ Value SparseIterator::deref(OpBuilder &b, Location l) { ValueRange SparseIterator::forward(OpBuilder &b, Location l) { assert(!randomAccessible()); - if (emitStrategy == SparseEmitStrategy::kDebugInterface) { + if (getSparseEmitStrategy() == SparseEmitStrategy::kDebugInterface) { std::string prefix = getDebugInterfacePrefix(); Operation *next = b.create(l, b.getStringAttr(prefix + ".next"), getCursor(), getCursorValTypes(b)); diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h 
b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h index 642cb1afa156b..3636f3f01adb5 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h @@ -177,10 +177,14 @@ class SparseIterator { public: virtual ~SparseIterator() = default; - void setSparseEmitStrategy(SparseEmitStrategy strategy) { + virtual void setSparseEmitStrategy(SparseEmitStrategy strategy) { emitStrategy = strategy; } + virtual SparseEmitStrategy getSparseEmitStrategy() const { + return emitStrategy; + } + virtual std::string getDebugInterfacePrefix() const = 0; virtual SmallVector getCursorValTypes(OpBuilder &b) const = 0; From 2313c6fab514da42c43b0a278d4f1136643b9826 Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Wed, 29 Oct 2025 13:42:08 -0700 Subject: [PATCH 143/539] [DirectX] Use an allow-list of DXIL compatible module metadata (#165290) This pr introduces an allow-list for module metadata, this encompasses the llvm metadata nodes: `llvm.ident` and `llvm.module.flags`, as well as, the generated `dx.` options. Resolves: #164473. 
--- .../Target/DirectX/DXILTranslateMetadata.cpp | 32 +++++--- llvm/test/CodeGen/DirectX/strip-module-md.ll | 75 +++++++++++++++++++ llvm/test/tools/dxil-dis/di-subprogram.ll | 37 --------- llvm/test/tools/dxil-dis/di-subrotine.ll | 12 --- llvm/test/tools/dxil-dis/md-manystrings.ll | 4 +- 5 files changed, 97 insertions(+), 63 deletions(-) create mode 100644 llvm/test/CodeGen/DirectX/strip-module-md.ll delete mode 100644 llvm/test/tools/dxil-dis/di-subrotine.ll diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index e345bda23b133..cf8b833b3e42e 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -472,6 +472,16 @@ static void cleanModuleFlags(Module &M) { M.addModuleFlag(Flag.Behavior, Flag.Key->getString(), Flag.Val); } +using GlobalMDList = std::array; + +// The following are compatible with DXIL but not emit with clang, they can +// be added when applicable: +// dx.typeAnnotations, dx.viewIDState, dx.dxrPayloadAnnotations +static GlobalMDList CompatibleNamedModuleMDs = { + "llvm.ident", "llvm.module.flags", "dx.resources", "dx.valver", + "dx.shaderModel", "dx.version", "dx.entryPoints", +}; + static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM, DXILResourceTypeMap &DRTM, const ModuleShaderFlags &ShaderFlags, @@ -526,19 +536,17 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM, cleanModuleFlags(M); - // dx.rootsignatures will have been parsed from its metadata form as its - // binary form as part of the RootSignatureAnalysisWrapper, so safely - // remove it as it is not recognized in DXIL - if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures")) - RootSignature->eraseFromParent(); + // Finally, strip all module metadata that is not explicitly specified in the + // allow-list + SmallVector ToStrip; - // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and - // 
causes all tests using the DXIL Validator to fail. - // - // This is a temporary fix and should be replaced with a allowlist once - // we have determined all metadata that the DXIL Validator allows - if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa")) - ErrNo->eraseFromParent(); + for (NamedMDNode &NamedMD : M.named_metadata()) + if (!NamedMD.getName().starts_with("llvm.dbg.") && + !llvm::is_contained(CompatibleNamedModuleMDs, NamedMD.getName())) + ToStrip.push_back(&NamedMD); + + for (NamedMDNode *NamedMD : ToStrip) + NamedMD->eraseFromParent(); } PreservedAnalyses DXILTranslateMetadata::run(Module &M, diff --git a/llvm/test/CodeGen/DirectX/strip-module-md.ll b/llvm/test/CodeGen/DirectX/strip-module-md.ll new file mode 100644 index 0000000000000..4d8b9ec935f6b --- /dev/null +++ b/llvm/test/CodeGen/DirectX/strip-module-md.ll @@ -0,0 +1,75 @@ +; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s + +; Ensures that only metadata explictly specified on the allow list, or debug +; related, metadata is emitted + +target triple = "dxil-unknown-shadermodel6.0-compute" + +; CHECK-NOT: !dx.rootsignatures +; CHECK-NOT: !llvm.errno.tbaa + +; CHECK-DAG: !llvm.dbg.cu + +; CHECK-DAG: !llvm.module.flags = !{![[#DWARF_VER:]], ![[#DEBUG_VER:]]} +; CHECK-DAG: !llvm.ident = !{![[#IDENT:]]} + +; CHECK-DAG: !dx.shaderModel +; CHECK-DAG: !dx.version +; CHECK-DAG: !dx.entryPoints +; CHECK-DAG: !dx.valver +; CHECK-DAG: !dx.resources + +; CHECK-NOT: !dx.rootsignatures +; CHECK-NOT: !llvm.errno.tbaa + +; Check allowed llvm metadata structure to ensure it is still DXIL compatible +; If this fails, please ensure that the updated form is DXIL compatible before +; updating the test. 
+ +; CHECK-DAG: ![[#IDENT]] = !{!"clang 22.0.0"} +; CHECK-DAG: ![[#DWARF_VER]] = !{i32 2, !"Dwarf Version", i32 2} +; CHECK-DAG: ![[#DEBUG_VER]] = !{i32 2, !"Debug Info Version", i32 3} + +; CHECK-NOT: !dx.rootsignatures +; CHECK-NOT: !llvm.errno.tbaa + +@BufA.str = private unnamed_addr constant [5 x i8] c"BufA\00", align 1 + +define void @main () #0 { +entry: + %typed0 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v4f32_1_0_0( + i32 3, i32 5, i32 1, i32 0, ptr @BufA.str) + ret void +} + +attributes #0 = { noinline nounwind "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + +; Incompatible +!dx.rootsignatures = !{!2} +!llvm.errno.tbaa = !{!5} + +; Compatible +!llvm.dbg.cu = !{!8} +!llvm.module.flags = !{!11, !12} +!llvm.ident = !{!13} +!dx.valver = !{!14} + +!2 = !{ ptr @main, !3, i32 2 } +!3 = !{ !4 } +!4 = !{ !"RootFlags", i32 1 } + +!5 = !{!6, !6, i64 0} +!6 = !{!"omnipotent char", !7} +!7 = !{!"Simple C/C++ TBAA"} + +!8 = distinct !DICompileUnit(language: DW_LANG_C99, file: !9, producer: "Some Compiler", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !10, splitDebugInlining: false, nameTableKind: None) +!9 = !DIFile(filename: "hlsl.hlsl", directory: "/some-path") +!10 = !{} + +!11 = !{i32 2, !"Dwarf Version", i32 2} +!12 = !{i32 2, !"Debug Info Version", i32 3} + +!13 = !{!"clang 22.0.0"} + +!14 = !{i32 1, i32 1} diff --git a/llvm/test/tools/dxil-dis/di-subprogram.ll b/llvm/test/tools/dxil-dis/di-subprogram.ll index 8255d396dd55d..912421fb28ae5 100644 --- a/llvm/test/tools/dxil-dis/di-subprogram.ll +++ b/llvm/test/tools/dxil-dis/di-subprogram.ll @@ -3,8 +3,6 @@ target triple = "dxil-unknown-shadermodel6.7-library" !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4} -!llvm.used = !{!5} -!llvm.lines = !{!13, !14, !15, !16} ; CHECK: !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Some Compiler", isOptimized: true, runtimeVersion: 0, emissionKind: 1, 
enums: !2) !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Some Compiler", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) @@ -16,38 +14,3 @@ target triple = "dxil-unknown-shadermodel6.7-library" !3 = !{i32 2, !"Dwarf Version", i32 4} ; CHECK: !4 = !{i32 2, !"Debug Info Version", i32 3} !4 = !{i32 2, !"Debug Info Version", i32 3} - -; CHECK: !5 = distinct !DISubprogram(name: "fma", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, function: !0, variables: !9) -!5 = distinct !DISubprogram(name: "fma", scope: !1, file: !1, line: 1, type: !6, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !9) - -; CHECK: !6 = !DISubroutineType(types: !7) -!6 = !DISubroutineType(types: !7) - -; CHECK: !7 = !{!8, !8, !8, !8} -!7 = !{!8, !8, !8, !8} - -; CHECK: !8 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) -!8 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) - -; CHECK: !9 = !{!10, !11, !12} -!9 = !{!10, !11, !12} - -; CHECK: !10 = !DILocalVariable(tag: DW_TAG_variable, name: "x", arg: 1, scope: !5, file: !1, line: 1, type: !8) -!10 = !DILocalVariable(name: "x", arg: 1, scope: !5, file: !1, line: 1, type: !8) - -; CHECK: !11 = !DILocalVariable(tag: DW_TAG_variable, name: "y", arg: 2, scope: !5, file: !1, line: 1, type: !8) -!11 = !DILocalVariable(name: "y", arg: 2, scope: !5, file: !1, line: 1, type: !8) - -; CHECK: !12 = !DILocalVariable(tag: DW_TAG_variable, name: "z", arg: 3, scope: !5, file: !1, line: 1, type: !8) -!12 = !DILocalVariable(name: "z", arg: 3, scope: !5, file: !1, line: 1, type: !8) - - -; CHECK: !13 = !DILocation(line: 0, scope: !5) -; CHECK: !14 = !DILocation(line: 2, column: 12, scope: !5) -; CHECK: !15 = !DILocation(line: 2, column: 16, scope: !5) -; CHECK: !16 = !DILocation(line: 2, 
column: 3, scope: !5) - -!13 = !DILocation(line: 0, scope: !5) -!14 = !DILocation(line: 2, column: 12, scope: !5) -!15 = !DILocation(line: 2, column: 16, scope: !5) -!16 = !DILocation(line: 2, column: 3, scope: !5) diff --git a/llvm/test/tools/dxil-dis/di-subrotine.ll b/llvm/test/tools/dxil-dis/di-subrotine.ll deleted file mode 100644 index 285e319b74056..0000000000000 --- a/llvm/test/tools/dxil-dis/di-subrotine.ll +++ /dev/null @@ -1,12 +0,0 @@ -; RUN: llc --filetype=obj %s -o - | dxil-dis -o - | FileCheck %s -target triple = "dxil-unknown-shadermodel6.7-library" - -!llvm.used = !{!0} - -!0 = !DISubroutineType(types: !1) -!1 = !{!2, !2, !2, !2} -!2 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) - -; CHECK: !0 = !DISubroutineType(types: !1) -; CHECK: !1 = !{!2, !2, !2, !2} -; CHECK: !2 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) diff --git a/llvm/test/tools/dxil-dis/md-manystrings.ll b/llvm/test/tools/dxil-dis/md-manystrings.ll index 938e2dd5114da..a7dd595f09d94 100644 --- a/llvm/test/tools/dxil-dis/md-manystrings.ll +++ b/llvm/test/tools/dxil-dis/md-manystrings.ll @@ -4,7 +4,7 @@ target triple = "dxil-unknown-shadermodel6.7-library" -!llvm.too_many_strings = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31} +!llvm.ident = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31} !0 = !{!"String 0"} !1 = !{!"String 1"} @@ -39,7 +39,7 @@ target triple = "dxil-unknown-shadermodel6.7-library" !30 = !{!"String 30"} !31 = !{!"String 31"} -; CHECK: !llvm.too_many_strings = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31} +; CHECK: !llvm.ident = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, 
!21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31} ; CHECK: !0 = !{!"String 0"} ; CHECK: !1 = !{!"String 1"} ; CHECK: !2 = !{!"String 2"} From d88dc61c46aa31fd049ef709649fc805434ca888 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 29 Oct 2025 13:52:42 -0700 Subject: [PATCH 144/539] [lldb-dap] Improving consistency of tests by removing concurrency. (#165496) We currently use a background thread to read the DAP output. This means the test thread and the background thread can race at times and we may have inconsistent timing due to these races. To improve the consistency I've removed the reader thread and instead switched to using the `selectors` module that wraps `select` in a platform independent way. --- .../test/tools/lldb-dap/dap_server.py | 206 +++++++----------- .../test/tools/lldb-dap/lldbdap_testcase.py | 2 +- .../TestDAP_breakpointEvents.py | 30 ++- .../tools/lldb-dap/launch/TestDAP_launch.py | 2 +- .../module-event/TestDAP_module_event.py | 88 ++++---- .../tools/lldb-dap/module/TestDAP_module.py | 8 +- .../restart/TestDAP_restart_console.py | 24 +- .../lldb-dap/send-event/TestDAP_sendEvent.py | 2 +- 8 files changed, 159 insertions(+), 203 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index d892c01f0bc71..8f3652172dfdf 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -10,8 +10,8 @@ import subprocess import signal import sys -import threading import warnings +import selectors import time from typing import ( Any, @@ -139,35 +139,6 @@ def dump_memory(base_addr, data, num_per_line, outfile): outfile.write("\n") -def read_packet( - f: IO[bytes], trace_file: Optional[IO[str]] = None -) -> Optional[ProtocolMessage]: - """Decode a JSON packet that starts with the content length and is - followed by the JSON bytes from a file 'f'. 
Returns None on EOF. - """ - line = f.readline().decode("utf-8") - if len(line) == 0: - return None # EOF. - - # Watch for line that starts with the prefix - prefix = "Content-Length: " - if line.startswith(prefix): - # Decode length of JSON bytes - length = int(line[len(prefix) :]) - # Skip empty line - separator = f.readline().decode() - if separator != "": - Exception("malformed DAP content header, unexpected line: " + separator) - # Read JSON bytes - json_str = f.read(length).decode() - if trace_file: - trace_file.write("from adapter:\n%s\n" % (json_str)) - # Decode the JSON bytes into a python dictionary - return json.loads(json_str) - - raise Exception("unexpected malformed message from lldb-dap: " + line) - - def packet_type_is(packet, packet_type): return "type" in packet and packet["type"] == packet_type @@ -199,16 +170,8 @@ def __init__( self.log_file = log_file self.send = send self.recv = recv - - # Packets that have been received and processed but have not yet been - # requested by a test case. - self._pending_packets: List[Optional[ProtocolMessage]] = [] - # Received packets that have not yet been processed. - self._recv_packets: List[Optional[ProtocolMessage]] = [] - # Used as a mutex for _recv_packets and for notify when _recv_packets - # changes. 
- self._recv_condition = threading.Condition() - self._recv_thread = threading.Thread(target=self._read_packet_thread) + self.selector = selectors.DefaultSelector() + self.selector.register(recv, selectors.EVENT_READ) # session state self.init_commands = init_commands @@ -234,9 +197,6 @@ def __init__( # keyed by breakpoint id self.resolved_breakpoints: dict[str, Breakpoint] = {} - # trigger enqueue thread - self._recv_thread.start() - @classmethod def encode_content(cls, s: str) -> bytes: return ("Content-Length: %u\r\n\r\n%s" % (len(s), s)).encode("utf-8") @@ -252,17 +212,46 @@ def validate_response(cls, command, response): f"seq mismatch in response {command['seq']} != {response['request_seq']}" ) - def _read_packet_thread(self): - try: - while True: - packet = read_packet(self.recv, trace_file=self.trace_file) - # `packet` will be `None` on EOF. We want to pass it down to - # handle_recv_packet anyway so the main thread can handle unexpected - # termination of lldb-dap and stop waiting for new packets. - if not self._handle_recv_packet(packet): - break - finally: - dump_dap_log(self.log_file) + def _read_packet( + self, + timeout: float = DEFAULT_TIMEOUT, + ) -> Optional[ProtocolMessage]: + """Decode a JSON packet that starts with the content length and is + followed by the JSON bytes from self.recv. Returns None on EOF. + """ + + ready = self.selector.select(timeout) + if not ready: + warnings.warn( + "timeout occurred waiting for a packet, check if the test has a" + " negative assertion and see if it can be inverted.", + stacklevel=4, + ) + return None # timeout + + line = self.recv.readline().decode("utf-8") + if len(line) == 0: + return None # EOF. 
+
+
+        # Watch for line that starts with the prefix
+        prefix = "Content-Length: "
+        if line.startswith(prefix):
+            # Decode length of JSON bytes
+            length = int(line[len(prefix) :])
+            # Skip empty line
+            separator = self.recv.readline().decode()
+            if separator.strip() != "":
+                raise Exception("malformed DAP content header, unexpected line: " + separator)
+            # Read JSON bytes
+            json_str = self.recv.read(length).decode()
+            if self.trace_file:
+                self.trace_file.write(
+                    "%s from adapter:\n%s\n" % (time.time(), json_str)
+                )
+            # Decode the JSON bytes into a python dictionary
+            return json.loads(json_str)
+
+        raise Exception("unexpected malformed message from lldb-dap: " + line)
 
     def get_modules(
         self, start_module: Optional[int] = None, module_count: Optional[int] = None
@@ -310,34 +299,6 @@ def collect_output(
             output += self.get_output(category, clear=clear)
         return output
 
-    def _enqueue_recv_packet(self, packet: Optional[ProtocolMessage]):
-        with self.recv_condition:
-            self.recv_packets.append(packet)
-            self.recv_condition.notify()
-
-    def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool:
-        """Handles an incoming packet.
-
-        Called by the read thread that is waiting for all incoming packets
-        to store the incoming packet in "self._recv_packets" in a thread safe
-        way. This function will then signal the "self._recv_condition" to
-        indicate a new packet is available.
-
-        Args:
-            packet: A new packet to store.
-
-        Returns:
-            True if the caller should keep calling this function for more
-            packets.
-        """
-        with self._recv_condition:
-            self._recv_packets.append(packet)
-            self._recv_condition.notify()
-            # packet is None on EOF
-            return packet is not None and not (
-                packet["type"] == "response" and packet["command"] == "disconnect"
-            )
-
     def _recv_packet(
         self,
         *,
@@ -361,46 +322,34 @@ def _recv_packet(
             The first matching packet for the given predicate, if specified,
             otherwise None.
""" - assert ( - threading.current_thread != self._recv_thread - ), "Must not be called from the _recv_thread" - - def process_until_match(): - self._process_recv_packets() - for i, packet in enumerate(self._pending_packets): - if packet is None: - # We need to return a truthy value to break out of the - # wait_for, use `EOFError` as an indicator of EOF. - return EOFError() - if predicate and predicate(packet): - self._pending_packets.pop(i) - return packet - - with self._recv_condition: - packet = self._recv_condition.wait_for(process_until_match, timeout) - return None if isinstance(packet, EOFError) else packet - - def _process_recv_packets(self) -> None: + deadline = time.time() + timeout + + while time.time() < deadline: + packet = self._read_packet(timeout=deadline - time.time()) + if packet is None: + return None + self._process_recv_packet(packet) + if not predicate or predicate(packet): + return packet + + def _process_recv_packet(self, packet) -> None: """Process received packets, updating the session state.""" - with self._recv_condition: - for packet in self._recv_packets: - if packet and ("seq" not in packet or packet["seq"] == 0): - warnings.warn( - f"received a malformed packet, expected 'seq != 0' for {packet!r}" - ) - # Handle events that may modify any stateful properties of - # the DAP session. - if packet and packet["type"] == "event": - self._handle_event(packet) - elif packet and packet["type"] == "request": - # Handle reverse requests and keep processing. - self._handle_reverse_request(packet) - # Move the packet to the pending queue. - self._pending_packets.append(packet) - self._recv_packets.clear() + if packet and ("seq" not in packet or packet["seq"] == 0): + warnings.warn( + f"received a malformed packet, expected 'seq != 0' for {packet!r}" + ) + # Handle events that may modify any stateful properties of + # the DAP session. 
+ if packet and packet["type"] == "event": + self._handle_event(packet) + elif packet and packet["type"] == "request": + # Handle reverse requests and keep processing. + self._handle_reverse_request(packet) def _handle_event(self, packet: Event) -> None: """Handle any events that modify debug session state we track.""" + self.events.append(packet) + event = packet["event"] body: Optional[Dict] = packet.get("body", None) @@ -453,6 +402,8 @@ def _handle_event(self, packet: Event) -> None: self.invalidated_event = packet elif event == "memory": self.memory_event = packet + elif event == "module": + self.module_events.append(packet) def _handle_reverse_request(self, request: Request) -> None: if request in self.reverse_requests: @@ -521,18 +472,14 @@ def send_packet(self, packet: ProtocolMessage) -> int: Returns the seq number of the request. """ - # Set the seq for requests. - if packet["type"] == "request": - packet["seq"] = self.sequence - self.sequence += 1 - else: - packet["seq"] = 0 + packet["seq"] = self.sequence + self.sequence += 1 # Encode our command dictionary as a JSON string json_str = json.dumps(packet, separators=(",", ":")) if self.trace_file: - self.trace_file.write("to adapter:\n%s\n" % (json_str)) + self.trace_file.write("%s to adapter:\n%s\n" % (time.time(), json_str)) length = len(json_str) if length > 0: @@ -913,6 +860,8 @@ def request_restart(self, restartArguments=None): if restartArguments: command_dict["arguments"] = restartArguments + # Clear state, the process is about to restart... + self._process_continued(True) response = self._send_recv(command_dict) # Caller must still call wait_for_stopped. 
return response @@ -1479,8 +1428,10 @@ def request_testGetTargetBreakpoints(self): def terminate(self): self.send.close() - if self._recv_thread.is_alive(): - self._recv_thread.join() + self.recv.close() + self.selector.close() + if self.log_file: + dump_dap_log(self.log_file) def request_setInstructionBreakpoints(self, memory_reference=[]): breakpoints = [] @@ -1577,6 +1528,7 @@ def launch( stdout=subprocess.PIPE, stderr=sys.stderr, env=adapter_env, + bufsize=0, ) if connection is None: diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index 29935bb8046ff..fd07324d2ddda 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -416,7 +416,7 @@ def continue_to_next_stop(self): return self.dap_server.wait_for_stopped() def continue_to_breakpoint(self, breakpoint_id: str): - self.continue_to_breakpoints((breakpoint_id)) + self.continue_to_breakpoints([breakpoint_id]) def continue_to_breakpoints(self, breakpoint_ids): self.do_continue() diff --git a/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py b/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py index beab4d6c1f5a6..7b78541fb4f8e 100644 --- a/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py +++ b/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py @@ -81,24 +81,20 @@ def test_breakpoint_events(self): breakpoint["verified"], "expect foo breakpoint to not be verified" ) - # Flush the breakpoint events. 
- self.dap_server.wait_for_breakpoint_events() - # Continue to the breakpoint - self.continue_to_breakpoints(dap_breakpoint_ids) + self.continue_to_breakpoint(foo_bp_id) + self.continue_to_next_stop() # foo_bp2 + self.continue_to_breakpoint(main_bp_id) + self.continue_to_exit() - verified_breakpoint_ids = [] - unverified_breakpoint_ids = [] - for breakpoint_event in self.dap_server.wait_for_breakpoint_events(): - breakpoint = breakpoint_event["body"]["breakpoint"] - id = breakpoint["id"] - if breakpoint["verified"]: - verified_breakpoint_ids.append(id) - else: - unverified_breakpoint_ids.append(id) + bp_events = [e for e in self.dap_server.events if e["event"] == "breakpoint"] - self.assertIn(main_bp_id, unverified_breakpoint_ids) - self.assertIn(foo_bp_id, unverified_breakpoint_ids) + main_bp_events = [ + e for e in bp_events if e["body"]["breakpoint"]["id"] == main_bp_id + ] + foo_bp_events = [ + e for e in bp_events if e["body"]["breakpoint"]["id"] == foo_bp_id + ] - self.assertIn(main_bp_id, verified_breakpoint_ids) - self.assertIn(foo_bp_id, verified_breakpoint_ids) + self.assertTrue(main_bp_events) + self.assertTrue(foo_bp_events) diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py index 8db2316e73fc8..dc6bf38303204 100644 --- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py +++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py @@ -156,6 +156,7 @@ def test_debuggerRoot(self): self.build_and_launch( program, debuggerRoot=program_parent_dir, initCommands=commands ) + self.continue_to_exit() output = self.get_console() self.assertTrue(output and len(output) > 0, "expect console output") lines = output.splitlines() @@ -171,7 +172,6 @@ def test_debuggerRoot(self): % (program_parent_dir, line[len(prefix) :]), ) self.assertTrue(found, "verified lldb-dap working directory") - self.continue_to_exit() def test_sourcePath(self): """ diff --git 
a/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py b/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py index 1f4afabbd161e..9d1d17b704f76 100644 --- a/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py +++ b/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py @@ -1,58 +1,58 @@ -import dap_server +""" +Test 'module' events for dynamically loaded libraries. +""" + from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil import lldbdap_testcase -import re class TestDAP_module_event(lldbdap_testcase.DAPTestCaseBase): + def lookup_module_id(self, name): + """Returns the identifier for the first module event starting with the given name.""" + for event in self.dap_server.module_events: + if self.get_dict_value(event, ["body", "module", "name"]).startswith(name): + return self.get_dict_value(event, ["body", "module", "id"]) + self.fail(f"No module events matching name={name}") + + def module_events(self, id): + """Finds all module events by identifier.""" + return [ + event + for event in self.dap_server.module_events + if self.get_dict_value(event, ["body", "module", "id"]) == id + ] + + def module_reasons(self, events): + """Returns the list of 'reason' values from the given events.""" + return [event["body"]["reason"] for event in events] + @skipIfWindows def test_module_event(self): + """ + Test that module events are fired on target load and when the list of + dynamic libraries updates while running. + """ program = self.getBuildArtifact("a.out") self.build_and_launch(program) + # We can analyze the order of events after the process exits. 
+ self.continue_to_exit() - source = "main.cpp" - breakpoint1_line = line_number(source, "// breakpoint 1") - breakpoint2_line = line_number(source, "// breakpoint 2") - breakpoint3_line = line_number(source, "// breakpoint 3") + a_out_id = self.lookup_module_id("a.out") + a_out_events = self.module_events(id=a_out_id) - breakpoint_ids = self.set_source_breakpoints( - source, [breakpoint1_line, breakpoint2_line, breakpoint3_line] + self.assertIn( + "new", + self.module_reasons(a_out_events), + "Expected a.out to load during the debug session.", ) - self.continue_to_breakpoints(breakpoint_ids) - - # We're now stopped at breakpoint 1 before the dlopen. Flush all the module events. - event = self.dap_server.wait_for_event(["module"]) - while event is not None: - event = self.dap_server.wait_for_event(["module"]) - - # Continue to the second breakpoint, before the dlclose. - self.continue_to_breakpoints(breakpoint_ids) - - # Make sure we got a module event for libother. - event = self.dap_server.wait_for_event(["module"]) - self.assertIsNotNone(event, "didn't get a module event") - module_name = event["body"]["module"]["name"] - module_id = event["body"]["module"]["id"] - self.assertEqual(event["body"]["reason"], "new") - self.assertIn("libother", module_name) - - # Continue to the third breakpoint, after the dlclose. - self.continue_to_breakpoints(breakpoint_ids) - - # Make sure we got a module event for libother. - event = self.dap_server.wait_for_event(["module"]) - self.assertIsNotNone(event, "didn't get a module event") - reason = event["body"]["reason"] - self.assertEqual(reason, "removed") - self.assertEqual(event["body"]["module"]["id"], module_id) - - # The removed module event should omit everything but the module id and name - # as they are required fields. 
- module_data = event["body"]["module"] - required_keys = ["id", "name"] - self.assertListEqual(list(module_data.keys()), required_keys) - self.assertEqual(module_data["name"], "", "expects empty name.") - self.continue_to_exit() + libother_id = self.lookup_module_id( + "libother." # libother.so or libother.dylib based on OS. + ) + libother_events = self.module_events(id=libother_id) + self.assertEqual( + self.module_reasons(libother_events), + ["new", "removed"], + "Expected libother to be loaded then unloaded during the debug session.", + ) diff --git a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py index 0ed53dac5d869..2d00c512721c6 100644 --- a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py +++ b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py @@ -64,19 +64,18 @@ def check_symbols_loaded_with_size(): self.assertEqual(program, program_module["path"]) self.assertIn("addressRange", program_module) + self.continue_to_exit() + # Collect all the module names we saw as events. module_new_names = [] module_changed_names = [] - module_event = self.dap_server.wait_for_event(["module"]) - while module_event is not None: + for module_event in self.dap_server.module_events: reason = module_event["body"]["reason"] if reason == "new": module_new_names.append(module_event["body"]["module"]["name"]) elif reason == "changed": module_changed_names.append(module_event["body"]["module"]["name"]) - module_event = self.dap_server.wait_for_event(["module"]) - # Make sure we got an event for every active module. self.assertNotEqual(len(module_new_names), 0) for module in active_modules: @@ -86,7 +85,6 @@ def check_symbols_loaded_with_size(): # symbols got added. 
self.assertNotEqual(len(module_changed_names), 0) self.assertIn(program_module["name"], module_changed_names) - self.continue_to_exit() @skipIfWindows def test_modules(self): diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py index e1ad1425a993d..fa62ec243f5c5 100644 --- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py +++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py @@ -30,7 +30,11 @@ def verify_stopped_on_entry(self, stopped_events: List[Dict[str, Any]]): if reason == "entry": seen_stopped_event += 1 - self.assertEqual(seen_stopped_event, 1, "expect only one stopped entry event.") + self.assertEqual( + seen_stopped_event, + 1, + f"expect only one stopped entry event in {stopped_events}", + ) @skipIfAsan @skipIfWindows @@ -92,11 +96,13 @@ def test_stopOnEntry(self): self.build_and_launch(program, console="integratedTerminal", stopOnEntry=True) [bp_main] = self.set_function_breakpoints(["main"]) - self.dap_server.request_continue() # sends configuration done - stopped_events = self.dap_server.wait_for_stopped() + self.dap_server.request_configurationDone() + stopped_threads = list(self.dap_server.thread_stop_reasons.values()) # We should be stopped at the entry point. - self.assertGreaterEqual(len(stopped_events), 0, "expect stopped events") - self.verify_stopped_on_entry(stopped_events) + self.assertEqual( + len(stopped_threads), 1, "Expected the main thread to be stopped on entry." + ) + self.assertEqual(stopped_threads[0]["reason"], "entry") # Then, if we continue, we should hit the breakpoint at main. self.dap_server.request_continue() @@ -105,8 +111,12 @@ def test_stopOnEntry(self): # Restart and check that we still get a stopped event before reaching # main. 
self.dap_server.request_restart() - stopped_events = self.dap_server.wait_for_stopped() - self.verify_stopped_on_entry(stopped_events) + stopped_threads = list(self.dap_server.thread_stop_reasons.values()) + # We should be stopped at the entry point. + self.assertEqual( + len(stopped_threads), 1, "Expected the main thread to be stopped on entry." + ) + self.assertEqual(stopped_threads[0]["reason"], "entry") # continue to main self.dap_server.request_continue() diff --git a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py index a01845669666f..0184020589176 100644 --- a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py +++ b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py @@ -32,7 +32,7 @@ def test_send_event(self): ], ) self.set_source_breakpoints(source, [breakpoint_line]) - self.continue_to_next_stop() + self.do_continue() custom_event = self.dap_server.wait_for_event( filter=["my-custom-event-no-body"] From 399303b77a5ce4830aba7917072fc8fd815d5a3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Sch=C3=A4pers?= Date: Wed, 29 Oct 2025 22:04:26 +0100 Subject: [PATCH 145/539] [clang-format][NFC] Port FormatTestComments to verifyFormat (#164310) And reduce the number of getLLVMStyleWithColumnLimit calls. 
--- clang/unittests/Format/FormatTestComments.cpp | 6633 ++++++++--------- 1 file changed, 3210 insertions(+), 3423 deletions(-) diff --git a/clang/unittests/Format/FormatTestComments.cpp b/clang/unittests/Format/FormatTestComments.cpp index 6b433bb384864..399f8357692ba 100644 --- a/clang/unittests/Format/FormatTestComments.cpp +++ b/clang/unittests/Format/FormatTestComments.cpp @@ -29,13 +29,13 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) { "// line 2\n" "void f() {}"); - EXPECT_EQ("// comment", format("//comment")); - EXPECT_EQ("// #comment", format("//#comment")); + verifyFormat("// comment", "//comment"); + verifyFormat("// #comment", "//#comment"); - EXPECT_EQ("// comment\n" - "// clang-format on", - format("//comment\n" - "// clang-format on")); + verifyFormat("// comment\n" + "// clang-format on", + "//comment\n" + "// clang-format on"); verifyFormat("void f() {\n" " // Doesn't do anything\n" @@ -84,11 +84,11 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) { "#include \"a/b/c\" // comment"); verifyFormat("#include // comment\n" "#include // comment"); - EXPECT_EQ("#include \"a\" // comment\n" - "#include \"a/b/c\" // comment", - format("#include \\\n" - " \"a\" // comment\n" - "#include \"a/b/c\" // comment")); + verifyFormat("#include \"a\" // comment\n" + "#include \"a/b/c\" // comment", + "#include \\\n" + " \"a\" // comment\n" + "#include \"a/b/c\" // comment"); verifyFormat("enum E {\n" " // comment\n" @@ -96,63 +96,65 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) { " VAL_B\n" "};"); - EXPECT_EQ("enum A {\n" - " // line a\n" - " a,\n" - " b, // line b\n" - "\n" - " // line c\n" - " c\n" - "};", - format("enum A {\n" - " // line a\n" - " a,\n" - " b, // line b\n" - "\n" - " // line c\n" - " c\n" - "};", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("enum A {\n" - " a, // line 1\n" - " // line 2\n" - "};", - format("enum A {\n" - " a, // line 1\n" - " // line 2\n" - "};", - getLLVMStyleWithColumns(20))); - 
EXPECT_EQ("enum A {\n" - " a, // line 1\n" - " // line 2\n" - "};", - format("enum A {\n" - " a, // line 1\n" - " // line 2\n" - "};", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("enum A {\n" - " a, // line 1\n" - " // line 2\n" - " b\n" - "};", - format("enum A {\n" - " a, // line 1\n" - " // line 2\n" - " b\n" - "};", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("enum A {\n" - " a, // line 1\n" - " // line 2\n" - " b\n" - "};", - format("enum A {\n" - " a, // line 1\n" - " // line 2\n" - " b\n" - "};", - getLLVMStyleWithColumns(20))); + const auto Style20 = getLLVMStyleWithColumns(20); + + verifyFormat("enum A {\n" + " // line a\n" + " a,\n" + " b, // line b\n" + "\n" + " // line c\n" + " c\n" + "};", + "enum A {\n" + " // line a\n" + " a,\n" + " b, // line b\n" + "\n" + " // line c\n" + " c\n" + "};", + Style20); + verifyFormat("enum A {\n" + " a, // line 1\n" + " // line 2\n" + "};", + "enum A {\n" + " a, // line 1\n" + " // line 2\n" + "};", + Style20); + verifyFormat("enum A {\n" + " a, // line 1\n" + " // line 2\n" + "};", + "enum A {\n" + " a, // line 1\n" + " // line 2\n" + "};", + Style20); + verifyFormat("enum A {\n" + " a, // line 1\n" + " // line 2\n" + " b\n" + "};", + "enum A {\n" + " a, // line 1\n" + " // line 2\n" + " b\n" + "};", + Style20); + verifyFormat("enum A {\n" + " a, // line 1\n" + " // line 2\n" + " b\n" + "};", + "enum A {\n" + " a, // line 1\n" + " // line 2\n" + " b\n" + "};", + Style20); verifyFormat( "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa =\n" " bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb; // Trailing comment"); @@ -172,28 +174,28 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) { verifyFormat("int aaaa; // aaaaa\n" "int aa; // aaaaaaa", - getLLVMStyleWithColumns(20)); + Style20); + + verifyFormat("void f() { // This does something ..\n" + "}\n" + "int a; // This is unrelated", + "void f() { // This does something ..\n" + " }\n" + "int a; // This is unrelated"); + verifyFormat("class C {\n" + " void f() { // This does 
something ..\n" + " } // awesome..\n" + "\n" + " int a; // This is unrelated\n" + "};", + "class C{void f() { // This does something ..\n" + " } // awesome..\n" + " \n" + "int a; // This is unrelated\n" + "};"); - EXPECT_EQ("void f() { // This does something ..\n" - "}\n" - "int a; // This is unrelated", - format("void f() { // This does something ..\n" - " }\n" - "int a; // This is unrelated")); - EXPECT_EQ("class C {\n" - " void f() { // This does something ..\n" - " } // awesome..\n" - "\n" - " int a; // This is unrelated\n" - "};", - format("class C{void f() { // This does something ..\n" - " } // awesome..\n" - " \n" - "int a; // This is unrelated\n" - "};")); - - EXPECT_EQ("int i; // single line trailing comment", - format("int i;\\\n// single line trailing comment")); + verifyFormat("int i; // single line trailing comment", + "int i;\\\n// single line trailing comment"); verifyGoogleFormat("int a; // Trailing comment."); @@ -210,99 +212,99 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) { verifyGoogleFormat( "aaaaaaaaaaaaaaaaaaaaaaaaaa(\n" " aaaaaaaaaaaaaaaaaaaaaa); // 81_cols_with_this_comment"); - EXPECT_EQ("D(a, {\n" - " // test\n" - " int a;\n" - "});", - format("D(a, {\n" - "// test\n" - "int a;\n" - "});")); - - EXPECT_EQ("lineWith(); // comment\n" - "// at start\n" - "otherLine();", - format("lineWith(); // comment\n" - "// at start\n" - "otherLine();")); - EXPECT_EQ("lineWith(); // comment\n" - "/*\n" - " * at start */\n" - "otherLine();", - format("lineWith(); // comment\n" - "/*\n" - " * at start */\n" - "otherLine();")); - EXPECT_EQ("lineWith(); // comment\n" - " // at start\n" - "otherLine();", - format("lineWith(); // comment\n" - " // at start\n" - "otherLine();")); - - EXPECT_EQ("lineWith(); // comment\n" - "// at start\n" - "otherLine(); // comment", - format("lineWith(); // comment\n" - "// at start\n" - "otherLine(); // comment")); - EXPECT_EQ("lineWith();\n" - "// at start\n" - "otherLine(); // comment", - 
format("lineWith();\n" - " // at start\n" - "otherLine(); // comment")); - EXPECT_EQ("// first\n" - "// at start\n" - "otherLine(); // comment", - format("// first\n" - " // at start\n" - "otherLine(); // comment")); - EXPECT_EQ("f();\n" - "// first\n" - "// at start\n" - "otherLine(); // comment", - format("f();\n" - "// first\n" - " // at start\n" - "otherLine(); // comment")); + verifyFormat("D(a, {\n" + " // test\n" + " int a;\n" + "});", + "D(a, {\n" + "// test\n" + "int a;\n" + "});"); + + verifyFormat("lineWith(); // comment\n" + "// at start\n" + "otherLine();", + "lineWith(); // comment\n" + "// at start\n" + "otherLine();"); + verifyFormat("lineWith(); // comment\n" + "/*\n" + " * at start */\n" + "otherLine();", + "lineWith(); // comment\n" + "/*\n" + " * at start */\n" + "otherLine();"); + verifyFormat("lineWith(); // comment\n" + " // at start\n" + "otherLine();", + "lineWith(); // comment\n" + " // at start\n" + "otherLine();"); + + verifyFormat("lineWith(); // comment\n" + "// at start\n" + "otherLine(); // comment", + "lineWith(); // comment\n" + "// at start\n" + "otherLine(); // comment"); + verifyFormat("lineWith();\n" + "// at start\n" + "otherLine(); // comment", + "lineWith();\n" + " // at start\n" + "otherLine(); // comment"); + verifyFormat("// first\n" + "// at start\n" + "otherLine(); // comment", + "// first\n" + " // at start\n" + "otherLine(); // comment"); + verifyFormat("f();\n" + "// first\n" + "// at start\n" + "otherLine(); // comment", + "f();\n" + "// first\n" + " // at start\n" + "otherLine(); // comment"); verifyFormat("f(); // comment\n" "// first\n" "// at start\n" "otherLine();"); - EXPECT_EQ("f(); // comment\n" - "// first\n" - "// at start\n" - "otherLine();", - format("f(); // comment\n" - "// first\n" - " // at start\n" - "otherLine();")); - EXPECT_EQ("f(); // comment\n" - " // first\n" - "// at start\n" - "otherLine();", - format("f(); // comment\n" - " // first\n" - "// at start\n" - "otherLine();")); - EXPECT_EQ("void 
f() {\n" - " lineWith(); // comment\n" - " // at start\n" - "}", - format("void f() {\n" - " lineWith(); // comment\n" - " // at start\n" - "}")); - EXPECT_EQ("int xy; // a\n" - "int z; // b", - format("int xy; // a\n" - "int z; //b")); - EXPECT_EQ("int xy; // a\n" - "int z; // bb", - format("int xy; // a\n" - "int z; //bb", - getLLVMStyleWithColumns(12))); + verifyFormat("f(); // comment\n" + "// first\n" + "// at start\n" + "otherLine();", + "f(); // comment\n" + "// first\n" + " // at start\n" + "otherLine();"); + verifyFormat("f(); // comment\n" + " // first\n" + "// at start\n" + "otherLine();", + "f(); // comment\n" + " // first\n" + "// at start\n" + "otherLine();"); + verifyFormat("void f() {\n" + " lineWith(); // comment\n" + " // at start\n" + "}", + "void f() {\n" + " lineWith(); // comment\n" + " // at start\n" + "}"); + verifyFormat("int xy; // a\n" + "int z; // b", + "int xy; // a\n" + "int z; //b"); + verifyFormat("int xy; // a\n" + "int z; // bb", + "int xy; // a\n" + "int z; //bb", + getLLVMStyleWithColumns(12)); verifyFormat("#define A \\\n" " int i; /* iiiiiiiiiiiiiiiiiiiii */ \\\n" @@ -317,14 +319,14 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) { verifyFormat("if ( // This is some comment\n" " x + 3) {\n" "}"); - EXPECT_EQ("if ( // This is some comment\n" - " // spanning two lines\n" - " x + 3) {\n" - "}", - format("if( // This is some comment\n" - " // spanning two lines\n" - " x + 3) {\n" - "}")); + verifyFormat("if ( // This is some comment\n" + " // spanning two lines\n" + " x + 3) {\n" + "}", + "if( // This is some comment\n" + " // spanning two lines\n" + " x + 3) {\n" + "}"); verifyNoCrash("/\\\n/"); verifyNoCrash("/\\\n* */"); @@ -333,35 +335,35 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) { } TEST_F(FormatTestComments, KeepsParameterWithTrailingCommentsOnTheirOwnLine) { - EXPECT_EQ("SomeFunction(a,\n" - " b, // comment\n" - " c);", - format("SomeFunction(a,\n" - " b, // comment\n" - " c);")); - 
EXPECT_EQ("SomeFunction(a, b,\n" - " // comment\n" - " c);", - format("SomeFunction(a,\n" - " b,\n" - " // comment\n" - " c);")); - EXPECT_EQ("SomeFunction(a, b, // comment (unclear relation)\n" - " c);", - format("SomeFunction(a, b, // comment (unclear relation)\n" - " c);")); - EXPECT_EQ("SomeFunction(a, // comment\n" - " b,\n" - " c); // comment", - format("SomeFunction(a, // comment\n" - " b,\n" - " c); // comment")); - EXPECT_EQ("aaaaaaaaaa(aaaa(aaaa,\n" - " aaaa), //\n" - " aaaa, bbbbb);", - format("aaaaaaaaaa(aaaa(aaaa,\n" - "aaaa), //\n" - "aaaa, bbbbb);")); + verifyFormat("SomeFunction(a,\n" + " b, // comment\n" + " c);", + "SomeFunction(a,\n" + " b, // comment\n" + " c);"); + verifyFormat("SomeFunction(a, b,\n" + " // comment\n" + " c);", + "SomeFunction(a,\n" + " b,\n" + " // comment\n" + " c);"); + verifyFormat("SomeFunction(a, b, // comment (unclear relation)\n" + " c);", + "SomeFunction(a, b, // comment (unclear relation)\n" + " c);"); + verifyFormat("SomeFunction(a, // comment\n" + " b,\n" + " c); // comment", + "SomeFunction(a, // comment\n" + " b,\n" + " c); // comment"); + verifyFormat("aaaaaaaaaa(aaaa(aaaa,\n" + " aaaa), //\n" + " aaaa, bbbbb);", + "aaaaaaaaaa(aaaa(aaaa,\n" + "aaaa), //\n" + "aaaa, bbbbb);"); FormatStyle BreakAlways = getLLVMStyle(); BreakAlways.BinPackParameters = FormatStyle::BPPS_AlwaysOnePerLine; @@ -378,12 +380,12 @@ TEST_F(FormatTestComments, KeepsParameterWithTrailingCommentsOnTheirOwnLine) { } TEST_F(FormatTestComments, RemovesTrailingWhitespaceOfComments) { - EXPECT_EQ("// comment", format("// comment ")); - EXPECT_EQ("int aaaaaaa, bbbbbbb; // comment", - format("int aaaaaaa, bbbbbbb; // comment ", - getLLVMStyleWithColumns(33))); - EXPECT_EQ("// comment\\\n", format("// comment\\\n \t \v \f ")); - EXPECT_EQ("// comment \\\n", format("// comment \\\n \t \v \f ")); + verifyFormat("// comment", "// comment "); + verifyFormat("int aaaaaaa, bbbbbbb; // comment", + "int aaaaaaa, bbbbbbb; // comment ", + 
getLLVMStyleWithColumns(33)); + verifyFormat("// comment\\\n", "// comment\\\n \t \v \f "); + verifyFormat("// comment \\\n", "// comment \\\n \t \v \f "); } TEST_F(FormatTestComments, UnderstandsBlockComments) { @@ -393,16 +395,15 @@ TEST_F(FormatTestComments, UnderstandsBlockComments) { " /*qq_=*/move(q), [this, b](bar b) {},\n" " c);", getLLVMStyleWithColumns(60)); - EXPECT_EQ("f(aaaaaaaaaaaaaaaaaaaaaaaaa, /* Trailing comment for aa... */\n" - " bbbbbbbbbbbbbbbbbbbbbbbbb);", - format("f(aaaaaaaaaaaaaaaaaaaaaaaaa , \\\n" - "/* Trailing comment for aa... */\n" - " bbbbbbbbbbbbbbbbbbbbbbbbb);")); - EXPECT_EQ( - "f(aaaaaaaaaaaaaaaaaaaaaaaaa,\n" - " /* Leading comment for bb... */ bbbbbbbbbbbbbbbbbbbbbbbbb);", - format("f(aaaaaaaaaaaaaaaaaaaaaaaaa , \n" - "/* Leading comment for bb... */ bbbbbbbbbbbbbbbbbbbbbbbbb);")); + verifyFormat("f(aaaaaaaaaaaaaaaaaaaaaaaaa, /* Trailing comment for aa... */\n" + " bbbbbbbbbbbbbbbbbbbbbbbbb);", + "f(aaaaaaaaaaaaaaaaaaaaaaaaa , \\\n" + "/* Trailing comment for aa... */\n" + " bbbbbbbbbbbbbbbbbbbbbbbbb);"); + verifyFormat("f(aaaaaaaaaaaaaaaaaaaaaaaaa,\n" + " /* Leading comment for bb... */ bbbbbbbbbbbbbbbbbbbbbbbbb);", + "f(aaaaaaaaaaaaaaaaaaaaaaaaa , \n" + "/* Leading comment for bb... */ bbbbbbbbbbbbbbbbbbbbbbbbb);"); verifyFormat( "void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n" @@ -445,77 +446,77 @@ TEST_F(FormatTestComments, UnderstandsBlockComments) { } TEST_F(FormatTestComments, AlignsBlockComments) { - EXPECT_EQ("/*\n" - " * Really multi-line\n" - " * comment.\n" - " */\n" - "void f() {}", - format(" /*\n" - " * Really multi-line\n" - " * comment.\n" - " */\n" - " void f() {}")); - EXPECT_EQ("class C {\n" - " /*\n" - " * Another multi-line\n" - " * comment.\n" - " */\n" - " void f() {}\n" - "};", - format("class C {\n" - "/*\n" - " * Another multi-line\n" - " * comment.\n" - " */\n" - "void f() {}\n" - "};")); - EXPECT_EQ("/*\n" - " 1. This is a comment with non-trivial formatting.\n" - " 1.1. 
We have to indent/outdent all lines equally\n" - " 1.1.1. to keep the formatting.\n" - " */", - format(" /*\n" - " 1. This is a comment with non-trivial formatting.\n" - " 1.1. We have to indent/outdent all lines equally\n" - " 1.1.1. to keep the formatting.\n" - " */")); - EXPECT_EQ("/*\n" - "Don't try to outdent if there's not enough indentation.\n" - "*/", - format(" /*\n" - " Don't try to outdent if there's not enough indentation.\n" - " */")); - - EXPECT_EQ("int i; /* Comment with empty...\n" - " *\n" - " * line. */", - format("int i; /* Comment with empty...\n" - " *\n" - " * line. */")); - EXPECT_EQ("int foobar = 0; /* comment */\n" - "int bar = 0; /* multiline\n" - " comment 1 */\n" - "int baz = 0; /* multiline\n" - " comment 2 */\n" - "int bzz = 0; /* multiline\n" - " comment 3 */", - format("int foobar = 0; /* comment */\n" - "int bar = 0; /* multiline\n" - " comment 1 */\n" - "int baz = 0; /* multiline\n" - " comment 2 */\n" - "int bzz = 0; /* multiline\n" - " comment 3 */")); - EXPECT_EQ("int foobar = 0; /* comment */\n" - "int bar = 0; /* multiline\n" - " comment */\n" - "int baz = 0; /* multiline\n" - "comment */", - format("int foobar = 0; /* comment */\n" - "int bar = 0; /* multiline\n" - "comment */\n" - "int baz = 0; /* multiline\n" - "comment */")); + verifyFormat("/*\n" + " * Really multi-line\n" + " * comment.\n" + " */\n" + "void f() {}", + " /*\n" + " * Really multi-line\n" + " * comment.\n" + " */\n" + " void f() {}"); + verifyFormat("class C {\n" + " /*\n" + " * Another multi-line\n" + " * comment.\n" + " */\n" + " void f() {}\n" + "};", + "class C {\n" + "/*\n" + " * Another multi-line\n" + " * comment.\n" + " */\n" + "void f() {}\n" + "};"); + verifyFormat("/*\n" + " 1. This is a comment with non-trivial formatting.\n" + " 1.1. We have to indent/outdent all lines equally\n" + " 1.1.1. to keep the formatting.\n" + " */", + " /*\n" + " 1. This is a comment with non-trivial formatting.\n" + " 1.1. 
We have to indent/outdent all lines equally\n" + " 1.1.1. to keep the formatting.\n" + " */"); + verifyFormat("/*\n" + "Don't try to outdent if there's not enough indentation.\n" + "*/", + " /*\n" + " Don't try to outdent if there's not enough indentation.\n" + " */"); + + verifyFormat("int i; /* Comment with empty...\n" + " *\n" + " * line. */", + "int i; /* Comment with empty...\n" + " *\n" + " * line. */"); + verifyFormat("int foobar = 0; /* comment */\n" + "int bar = 0; /* multiline\n" + " comment 1 */\n" + "int baz = 0; /* multiline\n" + " comment 2 */\n" + "int bzz = 0; /* multiline\n" + " comment 3 */", + "int foobar = 0; /* comment */\n" + "int bar = 0; /* multiline\n" + " comment 1 */\n" + "int baz = 0; /* multiline\n" + " comment 2 */\n" + "int bzz = 0; /* multiline\n" + " comment 3 */"); + verifyFormat("int foobar = 0; /* comment */\n" + "int bar = 0; /* multiline\n" + " comment */\n" + "int baz = 0; /* multiline\n" + "comment */", + "int foobar = 0; /* comment */\n" + "int bar = 0; /* multiline\n" + "comment */\n" + "int baz = 0; /* multiline\n" + "comment */"); } TEST_F(FormatTestComments, CommentReflowingCanBeTurnedOff) { @@ -553,11 +554,11 @@ TEST_F(FormatTestComments, CommentReflowingCanApplyOnlyToIndents) { } TEST_F(FormatTestComments, CorrectlyHandlesLengthOfBlockComments) { - EXPECT_EQ("double *x; /* aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n" - " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa */", - format("double *x; /* aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n" - " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa */")); - EXPECT_EQ( + verifyFormat("double *x; /* aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n" + " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa */", + "double *x; /* aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n" + " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa */"); + verifyFormat( "void ffffffffffff(\n" " int aaaaaaaa, int bbbbbbbb,\n" " int cccccccccccc) { /*\n" @@ -567,150 +568,146 @@ TEST_F(FormatTestComments, CorrectlyHandlesLengthOfBlockComments) { 
" bbbbbbbbbb\n" " */\n" "}", - format("void ffffffffffff(int aaaaaaaa, int bbbbbbbb, int cccccccccccc)\n" - "{ /*\n" - " aaaaaaaaaa aaaaaaaaaaaaa\n" - " bbbbbbbbbbbbbb bbbbbbbbbb\n" - " */\n" - "}", - getLLVMStyleWithColumns(40))); + "void ffffffffffff(int aaaaaaaa, int bbbbbbbb, int cccccccccccc)\n" + "{ /*\n" + " aaaaaaaaaa aaaaaaaaaaaaa\n" + " bbbbbbbbbbbbbb bbbbbbbbbb\n" + " */\n" + "}", + getLLVMStyleWithColumns(40)); } TEST_F(FormatTestComments, DontBreakNonTrailingBlockComments) { - EXPECT_EQ("void ffffffffff(\n" - " int aaaaa /* test */);", - format("void ffffffffff(int aaaaa /* test */);", - getLLVMStyleWithColumns(35))); + verifyFormat("void ffffffffff(\n" + " int aaaaa /* test */);", + "void ffffffffff(int aaaaa /* test */);", + getLLVMStyleWithColumns(35)); } TEST_F(FormatTestComments, SplitsLongCxxComments) { - EXPECT_EQ("// A comment that\n" - "// doesn't fit on\n" - "// one line", - format("// A comment that doesn't fit on one line", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("/// A comment that\n" - "/// doesn't fit on\n" - "/// one line", - format("/// A comment that doesn't fit on one line", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("//! A comment that\n" - "//! doesn't fit on\n" - "//! one line", - format("//! 
A comment that doesn't fit on one line", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("// a b c d\n" - "// e f g\n" - "// h i j k", - format("// a b c d e f g h i j k", getLLVMStyleWithColumns(10))); - EXPECT_EQ( - "// a b c d\n" - "// e f g\n" - "// h i j k", - format("\\\n// a b c d e f g h i j k", getLLVMStyleWithColumns(10))); - EXPECT_EQ("if (true) // A comment that\n" - " // doesn't fit on\n" - " // one line", - format("if (true) // A comment that doesn't fit on one line ", - getLLVMStyleWithColumns(30))); - verifyNoChange("// Don't_touch_leading_whitespace", - getLLVMStyleWithColumns(20)); - EXPECT_EQ("// Add leading\n" - "// whitespace", - format("//Add leading whitespace", getLLVMStyleWithColumns(20))); - EXPECT_EQ("/// Add leading\n" - "/// whitespace", - format("///Add leading whitespace", getLLVMStyleWithColumns(20))); - EXPECT_EQ("//! Add leading\n" - "//! whitespace", - format("//!Add leading whitespace", getLLVMStyleWithColumns(20))); - EXPECT_EQ("// whitespace", format("//whitespace")); - EXPECT_EQ("// Even if it makes the line exceed the column\n" - "// limit", - format("//Even if it makes the line exceed the column limit", - getLLVMStyleWithColumns(51))); + const auto Style10 = getLLVMStyleWithColumns(10); + const auto Style20 = getLLVMStyleWithColumns(20); + const auto Style22 = getLLVMStyleWithColumns(22); + const auto Style30 = getLLVMStyleWithColumns(30); + + verifyFormat("// A comment that\n" + "// doesn't fit on\n" + "// one line", + "// A comment that doesn't fit on one line", Style20); + verifyFormat("/// A comment that\n" + "/// doesn't fit on\n" + "/// one line", + "/// A comment that doesn't fit on one line", Style20); + verifyFormat("//! A comment that\n" + "//! doesn't fit on\n" + "//! one line", + "//! 
A comment that doesn't fit on one line", Style20); + verifyFormat("// a b c d\n" + "// e f g\n" + "// h i j k", + "// a b c d e f g h i j k", Style10); + verifyFormat("// a b c d\n" + "// e f g\n" + "// h i j k", + "\\\n// a b c d e f g h i j k", Style10); + verifyFormat("if (true) // A comment that\n" + " // doesn't fit on\n" + " // one line", + "if (true) // A comment that doesn't fit on one line ", + Style30); + verifyNoChange("// Don't_touch_leading_whitespace", Style20); + verifyFormat("// Add leading\n" + "// whitespace", + "//Add leading whitespace", Style20); + verifyFormat("/// Add leading\n" + "/// whitespace", + "///Add leading whitespace", Style20); + verifyFormat("//! Add leading\n" + "//! whitespace", + "//!Add leading whitespace", Style20); + verifyFormat("// whitespace", "//whitespace"); + verifyFormat("// Even if it makes the line exceed the column\n" + "// limit", + "//Even if it makes the line exceed the column limit", + getLLVMStyleWithColumns(51)); verifyFormat("//--But not here"); - EXPECT_EQ("/// line 1\n" - "// add leading whitespace", - format("/// line 1\n" - "//add leading whitespace", - getLLVMStyleWithColumns(30))); - EXPECT_EQ("/// line 1\n" - "/// line 2\n" - "//! line 3\n" - "//! line 4\n" - "//! line 5\n" - "// line 6\n" - "// line 7", - format("///line 1\n" - "///line 2\n" - "//! 
line 3\n" - "//!line 4\n" - "//!line 5\n" - "// line 6\n" - "//line 7", - getLLVMStyleWithColumns(20))); - - EXPECT_EQ("// aa bb cc dd", - format("// aa bb cc dd ", - getLLVMStyleWithColumns(15))); - - EXPECT_EQ("// A comment before\n" - "// a macro\n" - "// definition\n" - "#define a b", - format("// A comment before a macro definition\n" - "#define a b", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("void ffffff(\n" - " int aaaaaaaaa, // wwww\n" - " int bbbbbbbbbb, // xxxxxxx\n" - " // yyyyyyyyyy\n" - " int c, int d, int e) {}", - format("void ffffff(\n" - " int aaaaaaaaa, // wwww\n" - " int bbbbbbbbbb, // xxxxxxx yyyyyyyyyy\n" - " int c, int d, int e) {}", - getLLVMStyleWithColumns(40))); - verifyFormat("//\t aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", - getLLVMStyleWithColumns(20)); - EXPECT_EQ( - "#define XXX // a b c d\n" - " // e f g h", - format("#define XXX // a b c d e f g h", getLLVMStyleWithColumns(22))); - EXPECT_EQ( - "#define XXX // q w e r\n" - " // t y u i", - format("#define XXX //q w e r t y u i", getLLVMStyleWithColumns(22))); - EXPECT_EQ("{\n" - " //\n" - " //\\\n" - " // long 1 2 3 4 5\n" - "}", - format("{\n" - " //\n" - " //\\\n" - " // long 1 2 3 4 5\n" - "}", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("{\n" - " //\n" - " //\\\n" - " // long 1 2 3 4 5\n" - " // 6\n" - "}", - format("{\n" - " //\n" - " //\\\n" - " // long 1 2 3 4 5 6\n" - "}", - getLLVMStyleWithColumns(20))); - - EXPECT_EQ("//: A comment that\n" - "//: doesn't fit on\n" - "//: one line", - format("//: A comment that doesn't fit on one line", - getLLVMStyleWithColumns(20))); + verifyFormat("/// line 1\n" + "// add leading whitespace", + "/// line 1\n" + "//add leading whitespace", + Style30); + verifyFormat("/// line 1\n" + "/// line 2\n" + "//! line 3\n" + "//! line 4\n" + "//! line 5\n" + "// line 6\n" + "// line 7", + "///line 1\n" + "///line 2\n" + "//! 
line 3\n" + "//!line 4\n" + "//!line 5\n" + "// line 6\n" + "//line 7", + Style20); + + verifyFormat("// aa bb cc dd", + "// aa bb cc dd ", + getLLVMStyleWithColumns(15)); + + verifyFormat("// A comment before\n" + "// a macro\n" + "// definition\n" + "#define a b", + "// A comment before a macro definition\n" + "#define a b", + Style20); + verifyFormat("void ffffff(\n" + " int aaaaaaaaa, // wwww\n" + " int bbbbbbbbbb, // xxxxxxx\n" + " // yyyyyyyyyy\n" + " int c, int d, int e) {}", + "void ffffff(\n" + " int aaaaaaaaa, // wwww\n" + " int bbbbbbbbbb, // xxxxxxx yyyyyyyyyy\n" + " int c, int d, int e) {}", + getLLVMStyleWithColumns(40)); + verifyFormat("//\t aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", Style20); + verifyFormat("#define XXX // a b c d\n" + " // e f g h", + "#define XXX // a b c d e f g h", Style22); + verifyFormat("#define XXX // q w e r\n" + " // t y u i", + "#define XXX //q w e r t y u i", Style22); + verifyFormat("{\n" + " //\n" + " //\\\n" + " // long 1 2 3 4 5\n" + "}", + "{\n" + " //\n" + " //\\\n" + " // long 1 2 3 4 5\n" + "}", + Style20); + verifyFormat("{\n" + " //\n" + " //\\\n" + " // long 1 2 3 4 5\n" + " // 6\n" + "}", + "{\n" + " //\n" + " //\\\n" + " // long 1 2 3 4 5 6\n" + "}", + Style20); + + verifyFormat("//: A comment that\n" + "//: doesn't fit on\n" + "//: one line", + "//: A comment that doesn't fit on one line", Style20); verifyFormat( "//\t\t\t\tofMap(message.velocity, 0, 127, 0, ofGetWidth()\n" @@ -719,34 +716,33 @@ TEST_F(FormatTestComments, SplitsLongCxxComments) { } TEST_F(FormatTestComments, PreservesHangingIndentInCxxComments) { - EXPECT_EQ("// A comment\n" - "// that doesn't\n" - "// fit on one\n" - "// line", - format("// A comment that doesn't fit on one line", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("/// A comment\n" - "/// that doesn't\n" - "/// fit on one\n" - "/// line", - format("/// A comment that doesn't fit on one line", - getLLVMStyleWithColumns(20))); + const auto Style20 = getLLVMStyleWithColumns(20); + 
verifyFormat("// A comment\n" + "// that doesn't\n" + "// fit on one\n" + "// line", + "// A comment that doesn't fit on one line", Style20); + verifyFormat("/// A comment\n" + "/// that doesn't\n" + "/// fit on one\n" + "/// line", + "/// A comment that doesn't fit on one line", Style20); } TEST_F(FormatTestComments, DontSplitLineCommentsWithEscapedNewlines) { - EXPECT_EQ("// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n" - "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n" - "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", - format("// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n" - "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n" - "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")); - EXPECT_EQ("int a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" - " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" - " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", - format("int a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" - " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" - " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", - getLLVMStyleWithColumns(50))); + verifyFormat("// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n" + "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n" + "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n" + "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n" + "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); + verifyFormat("int a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" + " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" + " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", + "int a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" + " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" + " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", + getLLVMStyleWithColumns(50)); verifyFormat("double\n" " a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" @@ -759,84 +755,83 @@ TEST_F(FormatTestComments, DontSplitLineCommentsWithEscapedNewlines) { TEST_F(FormatTestComments, 
DontIntroduceMultilineComments) { // Avoid introducing a multiline comment by breaking after `\`. + auto Style = getLLVMStyle(); for (int ColumnLimit = 15; ColumnLimit <= 17; ++ColumnLimit) { - EXPECT_EQ( - "// aaaaaaaaaa\n" - "// \\ bb", - format("// aaaaaaaaaa \\ bb", getLLVMStyleWithColumns(ColumnLimit))); - EXPECT_EQ( - "// aaaaaaaaa\n" - "// \\ bb", - format("// aaaaaaaaa \\ bb", getLLVMStyleWithColumns(ColumnLimit))); - EXPECT_EQ( - "// aaaaaaaaa\n" - "// \\ \\ bb", - format("// aaaaaaaaa \\ \\ bb", getLLVMStyleWithColumns(ColumnLimit))); + Style.ColumnLimit = ColumnLimit; + verifyFormat("// aaaaaaaaaa\n" + "// \\ bb", + "// aaaaaaaaaa \\ bb", Style); + verifyFormat("// aaaaaaaaa\n" + "// \\ bb", + "// aaaaaaaaa \\ bb", Style); + verifyFormat("// aaaaaaaaa\n" + "// \\ \\ bb", + "// aaaaaaaaa \\ \\ bb", Style); } } TEST_F(FormatTestComments, DontSplitLineCommentsWithPragmas) { FormatStyle Pragmas = getLLVMStyleWithColumns(30); Pragmas.CommentPragmas = "^ IWYU pragma:"; - EXPECT_EQ( - "// IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb", - format("// IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb", Pragmas)); - EXPECT_EQ( - "/* IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb */", - format("/* IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb */", Pragmas)); + verifyFormat("// IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb", + "// IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb", Pragmas); + verifyFormat("/* IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb */", + "/* IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb */", Pragmas); } TEST_F(FormatTestComments, PriorityOfCommentBreaking) { - EXPECT_EQ("if (xxx ==\n" - " yyy && // aaaaaaaaaaaa bbbbbbbbb\n" - " zzz)\n" - " q();", - format("if (xxx == yyy && // aaaaaaaaaaaa bbbbbbbbb\n" - " zzz) q();", - getLLVMStyleWithColumns(40))); - EXPECT_EQ("if (xxxxxxxxxx ==\n" - " yyy && // aaaaaa bbbbbbbb cccc\n" - " zzz)\n" - " q();", - format("if (xxxxxxxxxx == yyy && // aaaaaa bbbbbbbb cccc\n" - " zzz) q();", - 
getLLVMStyleWithColumns(40))); - EXPECT_EQ("if (xxxxxxxxxx &&\n" - " yyy || // aaaaaa bbbbbbbb cccc\n" - " zzz)\n" - " q();", - format("if (xxxxxxxxxx && yyy || // aaaaaa bbbbbbbb cccc\n" - " zzz) q();", - getLLVMStyleWithColumns(40))); - EXPECT_EQ("fffffffff(\n" - " &xxx, // aaaaaaaaaaaa bbbbbbbbbbb\n" - " zzz);", - format("fffffffff(&xxx, // aaaaaaaaaaaa bbbbbbbbbbb\n" - " zzz);", - getLLVMStyleWithColumns(40))); + const auto Style40 = getLLVMStyleWithColumns(40); + verifyFormat("if (xxx ==\n" + " yyy && // aaaaaaaaaaaa bbbbbbbbb\n" + " zzz)\n" + " q();", + "if (xxx == yyy && // aaaaaaaaaaaa bbbbbbbbb\n" + " zzz) q();", + Style40); + verifyFormat("if (xxxxxxxxxx ==\n" + " yyy && // aaaaaa bbbbbbbb cccc\n" + " zzz)\n" + " q();", + "if (xxxxxxxxxx == yyy && // aaaaaa bbbbbbbb cccc\n" + " zzz) q();", + Style40); + verifyFormat("if (xxxxxxxxxx &&\n" + " yyy || // aaaaaa bbbbbbbb cccc\n" + " zzz)\n" + " q();", + "if (xxxxxxxxxx && yyy || // aaaaaa bbbbbbbb cccc\n" + " zzz) q();", + Style40); + verifyFormat("fffffffff(\n" + " &xxx, // aaaaaaaaaaaa bbbbbbbbbbb\n" + " zzz);", + "fffffffff(&xxx, // aaaaaaaaaaaa bbbbbbbbbbb\n" + " zzz);", + Style40); } TEST_F(FormatTestComments, MultiLineCommentsInDefines) { - EXPECT_EQ("#define A(x) /* \\\n" - " a comment \\\n" - " inside */ \\\n" - " f();", - format("#define A(x) /* \\\n" - " a comment \\\n" - " inside */ \\\n" - " f();", - getLLVMStyleWithColumns(17))); - EXPECT_EQ("#define A( \\\n" - " x) /* \\\n" - " a comment \\\n" - " inside */ \\\n" - " f();", - format("#define A( \\\n" - " x) /* \\\n" - " a comment \\\n" - " inside */ \\\n" - " f();", - getLLVMStyleWithColumns(17))); + const auto Style17 = getLLVMStyleWithColumns(17); + verifyFormat("#define A(x) /* \\\n" + " a comment \\\n" + " inside */ \\\n" + " f();", + "#define A(x) /* \\\n" + " a comment \\\n" + " inside */ \\\n" + " f();", + Style17); + verifyFormat("#define A( \\\n" + " x) /* \\\n" + " a comment \\\n" + " inside */ \\\n" + " f();", + "#define A( \\\n" + " 
x) /* \\\n" + " a comment \\\n" + " inside */ \\\n" + " f();", + Style17); } TEST_F(FormatTestComments, LineCommentsInMacrosDoNotGetEscapedNewlines) { @@ -859,285 +854,285 @@ TEST_F(FormatTestComments, LineCommentsInMacrosDoNotGetEscapedNewlines) { } TEST_F(FormatTestComments, ParsesCommentsAdjacentToPPDirectives) { - EXPECT_EQ("namespace {}\n// Test\n#define A", - format("namespace {}\n // Test\n#define A")); - EXPECT_EQ("namespace {}\n/* Test */\n#define A", - format("namespace {}\n /* Test */\n#define A")); - EXPECT_EQ("namespace {}\n/* Test */ #define A", - format("namespace {}\n /* Test */ #define A")); + verifyFormat("namespace {}\n// Test\n#define A", + "namespace {}\n // Test\n#define A"); + verifyFormat("namespace {}\n/* Test */\n#define A", + "namespace {}\n /* Test */\n#define A"); + verifyFormat("namespace {}\n/* Test */ #define A", + "namespace {}\n /* Test */ #define A"); } TEST_F(FormatTestComments, KeepsLevelOfCommentBeforePPDirective) { // Keep the current level if the comment was originally not aligned with // the preprocessor directive. 
- EXPECT_EQ("void f() {\n" - " int i;\n" - " /* comment */\n" - "#ifdef A\n" - " int j;\n" - "}", - format("void f() {\n" - " int i;\n" - " /* comment */\n" - "#ifdef A\n" - " int j;\n" - "}")); - - EXPECT_EQ("void f() {\n" - " int i;\n" - " /* comment */\n" - "\n" - "#ifdef A\n" - " int j;\n" - "}", - format("void f() {\n" - " int i;\n" - " /* comment */\n" - "\n" - "#ifdef A\n" - " int j;\n" - "}")); - - EXPECT_EQ("int f(int i) {\n" - " if (true) {\n" - " ++i;\n" - " }\n" - " // comment\n" - "#ifdef A\n" - " int j;\n" - "#endif\n" - "}", - format("int f(int i) {\n" - " if (true) {\n" - " ++i;\n" - " }\n" - " // comment\n" - "#ifdef A\n" - "int j;\n" - "#endif\n" - "}")); - - EXPECT_EQ("int f(int i) {\n" - " if (true) {\n" - " i++;\n" - " } else {\n" - " // comment in else\n" - "#ifdef A\n" - " j++;\n" - "#endif\n" - " }\n" - "}", - format("int f(int i) {\n" - " if (true) {\n" - " i++;\n" - " } else {\n" - " // comment in else\n" - "#ifdef A\n" - " j++;\n" - "#endif\n" - " }\n" - "}")); - - EXPECT_EQ("int f(int i) {\n" - " if (true) {\n" - " i++;\n" - " } else {\n" - " /* comment in else */\n" - "#ifdef A\n" - " j++;\n" - "#endif\n" - " }\n" - "}", - format("int f(int i) {\n" - " if (true) {\n" - " i++;\n" - " } else {\n" - " /* comment in else */\n" - "#ifdef A\n" - " j++;\n" - "#endif\n" - " }\n" - "}")); + verifyFormat("void f() {\n" + " int i;\n" + " /* comment */\n" + "#ifdef A\n" + " int j;\n" + "}", + "void f() {\n" + " int i;\n" + " /* comment */\n" + "#ifdef A\n" + " int j;\n" + "}"); + + verifyFormat("void f() {\n" + " int i;\n" + " /* comment */\n" + "\n" + "#ifdef A\n" + " int j;\n" + "}", + "void f() {\n" + " int i;\n" + " /* comment */\n" + "\n" + "#ifdef A\n" + " int j;\n" + "}"); + + verifyFormat("int f(int i) {\n" + " if (true) {\n" + " ++i;\n" + " }\n" + " // comment\n" + "#ifdef A\n" + " int j;\n" + "#endif\n" + "}", + "int f(int i) {\n" + " if (true) {\n" + " ++i;\n" + " }\n" + " // comment\n" + "#ifdef A\n" + "int j;\n" + "#endif\n" + "}"); + 
+ verifyFormat("int f(int i) {\n" + " if (true) {\n" + " i++;\n" + " } else {\n" + " // comment in else\n" + "#ifdef A\n" + " j++;\n" + "#endif\n" + " }\n" + "}", + "int f(int i) {\n" + " if (true) {\n" + " i++;\n" + " } else {\n" + " // comment in else\n" + "#ifdef A\n" + " j++;\n" + "#endif\n" + " }\n" + "}"); + + verifyFormat("int f(int i) {\n" + " if (true) {\n" + " i++;\n" + " } else {\n" + " /* comment in else */\n" + "#ifdef A\n" + " j++;\n" + "#endif\n" + " }\n" + "}", + "int f(int i) {\n" + " if (true) {\n" + " i++;\n" + " } else {\n" + " /* comment in else */\n" + "#ifdef A\n" + " j++;\n" + "#endif\n" + " }\n" + "}"); // Keep the current level if there is an empty line between the comment and // the preprocessor directive. - EXPECT_EQ("void f() {\n" - " int i;\n" - " /* comment */\n" - "\n" - "#ifdef A\n" - " int j;\n" - "}", - format("void f() {\n" - " int i;\n" - "/* comment */\n" - "\n" - "#ifdef A\n" - " int j;\n" - "}")); - - EXPECT_EQ("void f() {\n" - " int i;\n" - " return i;\n" - "}\n" - "// comment\n" - "\n" - "#ifdef A\n" - "int i;\n" - "#endif // A", - format("void f() {\n" - " int i;\n" - " return i;\n" - "}\n" - "// comment\n" - "\n" - "#ifdef A\n" - "int i;\n" - "#endif // A")); - - EXPECT_EQ("int f(int i) {\n" - " if (true) {\n" - " ++i;\n" - " }\n" - " // comment\n" - "\n" - "#ifdef A\n" - " int j;\n" - "#endif\n" - "}", - format("int f(int i) {\n" - " if (true) {\n" - " ++i;\n" - " }\n" - " // comment\n" - "\n" - "#ifdef A\n" - " int j;\n" - "#endif\n" - "}")); - - EXPECT_EQ("int f(int i) {\n" - " if (true) {\n" - " i++;\n" - " } else {\n" - " // comment in else\n" - "\n" - "#ifdef A\n" - " j++;\n" - "#endif\n" - " }\n" - "}", - format("int f(int i) {\n" - " if (true) {\n" - " i++;\n" - " } else {\n" - "// comment in else\n" - "\n" - "#ifdef A\n" - " j++;\n" - "#endif\n" - " }\n" - "}")); - - EXPECT_EQ("int f(int i) {\n" - " if (true) {\n" - " i++;\n" - " } else {\n" - " /* comment in else */\n" - "\n" - "#ifdef A\n" - " j++;\n" - 
"#endif\n" - " }\n" - "}", - format("int f(int i) {\n" - " if (true) {\n" - " i++;\n" - " } else {\n" - "/* comment in else */\n" - "\n" - "#ifdef A\n" - " j++;\n" - "#endif\n" - " }\n" - "}")); + verifyFormat("void f() {\n" + " int i;\n" + " /* comment */\n" + "\n" + "#ifdef A\n" + " int j;\n" + "}", + "void f() {\n" + " int i;\n" + "/* comment */\n" + "\n" + "#ifdef A\n" + " int j;\n" + "}"); + + verifyFormat("void f() {\n" + " int i;\n" + " return i;\n" + "}\n" + "// comment\n" + "\n" + "#ifdef A\n" + "int i;\n" + "#endif // A", + "void f() {\n" + " int i;\n" + " return i;\n" + "}\n" + "// comment\n" + "\n" + "#ifdef A\n" + "int i;\n" + "#endif // A"); + + verifyFormat("int f(int i) {\n" + " if (true) {\n" + " ++i;\n" + " }\n" + " // comment\n" + "\n" + "#ifdef A\n" + " int j;\n" + "#endif\n" + "}", + "int f(int i) {\n" + " if (true) {\n" + " ++i;\n" + " }\n" + " // comment\n" + "\n" + "#ifdef A\n" + " int j;\n" + "#endif\n" + "}"); + + verifyFormat("int f(int i) {\n" + " if (true) {\n" + " i++;\n" + " } else {\n" + " // comment in else\n" + "\n" + "#ifdef A\n" + " j++;\n" + "#endif\n" + " }\n" + "}", + "int f(int i) {\n" + " if (true) {\n" + " i++;\n" + " } else {\n" + "// comment in else\n" + "\n" + "#ifdef A\n" + " j++;\n" + "#endif\n" + " }\n" + "}"); + + verifyFormat("int f(int i) {\n" + " if (true) {\n" + " i++;\n" + " } else {\n" + " /* comment in else */\n" + "\n" + "#ifdef A\n" + " j++;\n" + "#endif\n" + " }\n" + "}", + "int f(int i) {\n" + " if (true) {\n" + " i++;\n" + " } else {\n" + "/* comment in else */\n" + "\n" + "#ifdef A\n" + " j++;\n" + "#endif\n" + " }\n" + "}"); // Align with the preprocessor directive if the comment was originally aligned // with the preprocessor directive and there is no newline between the comment // and the preprocessor directive. 
- EXPECT_EQ("void f() {\n" - " int i;\n" - "/* comment */\n" - "#ifdef A\n" - " int j;\n" - "}", - format("void f() {\n" - " int i;\n" - "/* comment */\n" - "#ifdef A\n" - " int j;\n" - "}")); - - EXPECT_EQ("int f(int i) {\n" - " if (true) {\n" - " ++i;\n" - " }\n" - "// comment\n" - "#ifdef A\n" - " int j;\n" - "#endif\n" - "}", - format("int f(int i) {\n" - " if (true) {\n" - " ++i;\n" - " }\n" - "// comment\n" - "#ifdef A\n" - " int j;\n" - "#endif\n" - "}")); - - EXPECT_EQ("int f(int i) {\n" - " if (true) {\n" - " i++;\n" - " } else {\n" - "// comment in else\n" - "#ifdef A\n" - " j++;\n" - "#endif\n" - " }\n" - "}", - format("int f(int i) {\n" - " if (true) {\n" - " i++;\n" - " } else {\n" - " // comment in else\n" - " #ifdef A\n" - " j++;\n" - "#endif\n" - " }\n" - "}")); - - EXPECT_EQ("int f(int i) {\n" - " if (true) {\n" - " i++;\n" - " } else {\n" - "/* comment in else */\n" - "#ifdef A\n" - " j++;\n" - "#endif\n" - " }\n" - "}", - format("int f(int i) {\n" - " if (true) {\n" - " i++;\n" - " } else {\n" - " /* comment in else */\n" - " #ifdef A\n" - " j++;\n" - "#endif\n" - " }\n" - "}")); + verifyFormat("void f() {\n" + " int i;\n" + "/* comment */\n" + "#ifdef A\n" + " int j;\n" + "}", + "void f() {\n" + " int i;\n" + "/* comment */\n" + "#ifdef A\n" + " int j;\n" + "}"); + + verifyFormat("int f(int i) {\n" + " if (true) {\n" + " ++i;\n" + " }\n" + "// comment\n" + "#ifdef A\n" + " int j;\n" + "#endif\n" + "}", + "int f(int i) {\n" + " if (true) {\n" + " ++i;\n" + " }\n" + "// comment\n" + "#ifdef A\n" + " int j;\n" + "#endif\n" + "}"); + + verifyFormat("int f(int i) {\n" + " if (true) {\n" + " i++;\n" + " } else {\n" + "// comment in else\n" + "#ifdef A\n" + " j++;\n" + "#endif\n" + " }\n" + "}", + "int f(int i) {\n" + " if (true) {\n" + " i++;\n" + " } else {\n" + " // comment in else\n" + " #ifdef A\n" + " j++;\n" + "#endif\n" + " }\n" + "}"); + + verifyFormat("int f(int i) {\n" + " if (true) {\n" + " i++;\n" + " } else {\n" + "/* comment in else 
*/\n" + "#ifdef A\n" + " j++;\n" + "#endif\n" + " }\n" + "}", + "int f(int i) {\n" + " if (true) {\n" + " i++;\n" + " } else {\n" + " /* comment in else */\n" + " #ifdef A\n" + " j++;\n" + "#endif\n" + " }\n" + "}"); constexpr StringRef Code("void func() {\n" " // clang-format off\n" @@ -1189,245 +1184,242 @@ TEST_F(FormatTestComments, CommentsBetweenUnbracedBodyAndPPDirective) { } TEST_F(FormatTestComments, SplitsLongLinesInComments) { + const auto Style10 = getLLVMStyleWithColumns(10); + const auto Style15 = getLLVMStyleWithColumns(15); + const auto Style20 = getLLVMStyleWithColumns(20); + // FIXME: Do we need to fix up the " */" at the end? // It doesn't look like any of our current logic triggers this. - EXPECT_EQ("/* This is a long\n" - " * comment that\n" - " * doesn't fit on\n" - " * one line. */", - format("/* " - "This is a long " - "comment that " - "doesn't " - "fit on one line. */", - getLLVMStyleWithColumns(20))); - EXPECT_EQ( - "/* a b c d\n" - " * e f g\n" - " * h i j k\n" - " */", - format("/* a b c d e f g h i j k */", getLLVMStyleWithColumns(10))); - EXPECT_EQ( - "/* a b c d\n" - " * e f g\n" - " * h i j k\n" - " */", - format("\\\n/* a b c d e f g h i j k */", getLLVMStyleWithColumns(10))); - EXPECT_EQ("/*\n" - "This is a long\n" - "comment that doesn't\n" - "fit on one line.\n" - "*/", - format("/*\n" - "This is a long " - "comment that doesn't " - "fit on one line. \n" - "*/", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("/*\n" - " * This is a long\n" - " * comment that\n" - " * doesn't fit on\n" - " * one line.\n" - " */", - format("/* \n" - " * This is a long " - " comment that " - " doesn't fit on " - " one line. 
\n" - " */", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("/*\n" - " * This_is_a_comment_with_words_that_dont_fit_on_one_line\n" - " * so_it_should_be_broken\n" - " * wherever_a_space_occurs\n" - " */", - format("/*\n" - " * This_is_a_comment_with_words_that_dont_fit_on_one_line " - " so_it_should_be_broken " - " wherever_a_space_occurs \n" - " */", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("/*\n" - " * This_comment_can_not_be_broken_into_lines\n" - " */", - format("/*\n" - " * This_comment_can_not_be_broken_into_lines\n" - " */", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("{\n" - " /*\n" - " This is another\n" - " long comment that\n" - " doesn't fit on one\n" - " line 1234567890\n" - " */\n" - "}", - format("{\n" - "/*\n" - "This is another " - " long comment that " - " doesn't fit on one" - " line 1234567890\n" - "*/\n" - "}", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("{\n" - " /*\n" - " * This i s\n" - " * another comment\n" - " * t hat doesn' t\n" - " * fit on one l i\n" - " * n e\n" - " */\n" - "}", - format("{\n" - "/*\n" - " * This i s" - " another comment" - " t hat doesn' t" - " fit on one l i" - " n e\n" - " */\n" - "}", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("/*\n" - " * This is a long\n" - " * comment that\n" - " * doesn't fit on\n" - " * one line\n" - " */", - format(" /*\n" - " * This is a long comment that doesn't fit on one line\n" - " */", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("{\n" - " if (something) /* This is a\n" - " long\n" - " comment */\n" - " ;\n" - "}", - format("{\n" - " if (something) /* This is a long comment */\n" - " ;\n" - "}", - getLLVMStyleWithColumns(30))); - - EXPECT_EQ("/* A comment before\n" - " * a macro\n" - " * definition */\n" - "#define a b", - format("/* A comment before a macro definition */\n" - "#define a b", - getLLVMStyleWithColumns(20))); - - EXPECT_EQ("/* some comment\n" - " * a comment that\n" - " * we break another\n" - " * comment we have\n" - " * to break a left\n" - " * comment\n" - " 
*/", - format(" /* some comment\n" - " * a comment that we break\n" - " * another comment we have to break\n" - "* a left comment\n" - " */", - getLLVMStyleWithColumns(20))); - - EXPECT_EQ("/**\n" - " * multiline block\n" - " * comment\n" - " *\n" - " */", - format("/**\n" - " * multiline block comment\n" - " *\n" - " */", - getLLVMStyleWithColumns(20))); + verifyFormat("/* This is a long\n" + " * comment that\n" + " * doesn't fit on\n" + " * one line. */", + "/* " + "This is a long " + "comment that " + "doesn't " + "fit on one line. */", + Style20); + verifyFormat("/* a b c d\n" + " * e f g\n" + " * h i j k\n" + " */", + "/* a b c d e f g h i j k */", Style10); + verifyFormat("/* a b c d\n" + " * e f g\n" + " * h i j k\n" + " */", + "\\\n/* a b c d e f g h i j k */", Style10); + verifyFormat("/*\n" + "This is a long\n" + "comment that doesn't\n" + "fit on one line.\n" + "*/", + "/*\n" + "This is a long " + "comment that doesn't " + "fit on one line. \n" + "*/", + Style20); + verifyFormat("/*\n" + " * This is a long\n" + " * comment that\n" + " * doesn't fit on\n" + " * one line.\n" + " */", + "/* \n" + " * This is a long " + " comment that " + " doesn't fit on " + " one line. 
\n" + " */", + Style20); + verifyFormat("/*\n" + " * This_is_a_comment_with_words_that_dont_fit_on_one_line\n" + " * so_it_should_be_broken\n" + " * wherever_a_space_occurs\n" + " */", + "/*\n" + " * This_is_a_comment_with_words_that_dont_fit_on_one_line " + " so_it_should_be_broken " + " wherever_a_space_occurs \n" + " */", + Style20); + verifyFormat("/*\n" + " * This_comment_can_not_be_broken_into_lines\n" + " */", + "/*\n" + " * This_comment_can_not_be_broken_into_lines\n" + " */", + Style20); + verifyFormat("{\n" + " /*\n" + " This is another\n" + " long comment that\n" + " doesn't fit on one\n" + " line 1234567890\n" + " */\n" + "}", + "{\n" + "/*\n" + "This is another " + " long comment that " + " doesn't fit on one" + " line 1234567890\n" + "*/\n" + "}", + Style20); + verifyFormat("{\n" + " /*\n" + " * This i s\n" + " * another comment\n" + " * t hat doesn' t\n" + " * fit on one l i\n" + " * n e\n" + " */\n" + "}", + "{\n" + "/*\n" + " * This i s" + " another comment" + " t hat doesn' t" + " fit on one l i" + " n e\n" + " */\n" + "}", + Style20); + verifyFormat("/*\n" + " * This is a long\n" + " * comment that\n" + " * doesn't fit on\n" + " * one line\n" + " */", + " /*\n" + " * This is a long comment that doesn't fit on one line\n" + " */", + Style20); + verifyFormat("{\n" + " if (something) /* This is a\n" + " long\n" + " comment */\n" + " ;\n" + "}", + "{\n" + " if (something) /* This is a long comment */\n" + " ;\n" + "}", + getLLVMStyleWithColumns(30)); + + verifyFormat("/* A comment before\n" + " * a macro\n" + " * definition */\n" + "#define a b", + "/* A comment before a macro definition */\n" + "#define a b", + Style20); + + verifyFormat("/* some comment\n" + " * a comment that\n" + " * we break another\n" + " * comment we have\n" + " * to break a left\n" + " * comment\n" + " */", + " /* some comment\n" + " * a comment that we break\n" + " * another comment we have to break\n" + "* a left comment\n" + " */", + Style20); + + verifyFormat("/**\n" + " 
* multiline block\n" + " * comment\n" + " *\n" + " */", + "/**\n" + " * multiline block comment\n" + " *\n" + " */", + Style20); // This reproduces a crashing bug where both adaptStartOfLine and // getCommentSplit were trying to wrap after the "/**". - verifyFormat("/** multilineblockcommentwithnowrapopportunity */", - getLLVMStyleWithColumns(20)); + verifyFormat("/** multilineblockcommentwithnowrapopportunity */", Style20); - EXPECT_EQ("/*\n" - "\n" - "\n" - " */", - format(" /* \n" - " \n" - " \n" - " */")); - - EXPECT_EQ("/* a a */", - format("/* a a */", getLLVMStyleWithColumns(15))); - EXPECT_EQ("/* a a bc */", - format("/* a a bc */", getLLVMStyleWithColumns(15))); - EXPECT_EQ("/* aaa aaa\n" - " * aaaaa */", - format("/* aaa aaa aaaaa */", getLLVMStyleWithColumns(15))); - EXPECT_EQ("/* aaa aaa\n" - " * aaaaa */", - format("/* aaa aaa aaaaa */", getLLVMStyleWithColumns(15))); + verifyFormat("/*\n" + "\n" + "\n" + " */", + " /* \n" + " \n" + " \n" + " */"); + + verifyFormat("/* a a */", "/* a a */", Style15); + verifyFormat("/* a a bc */", "/* a a bc */", Style15); + verifyFormat("/* aaa aaa\n" + " * aaaaa */", + "/* aaa aaa aaaaa */", Style15); + verifyFormat("/* aaa aaa\n" + " * aaaaa */", + "/* aaa aaa aaaaa */", Style15); } TEST_F(FormatTestComments, SplitsLongLinesInCommentsInPreprocessor) { - EXPECT_EQ("#define X \\\n" - " /* \\\n" - " Test \\\n" - " Macro comment \\\n" - " with a long \\\n" - " line \\\n" - " */ \\\n" - " A + B", - format("#define X \\\n" - " /*\n" - " Test\n" - " Macro comment with a long line\n" - " */ \\\n" - " A + B", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("#define X \\\n" - " /* Macro comment \\\n" - " with a long \\\n" - " line */ \\\n" - " A + B", - format("#define X \\\n" - " /* Macro comment with a long\n" - " line */ \\\n" - " A + B", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("#define X \\\n" - " /* Macro comment \\\n" - " * with a long \\\n" - " * line */ \\\n" - " A + B", - format("#define X \\\n" - " /* Macro 
comment with a long line */ \\\n" - " A + B", - getLLVMStyleWithColumns(20))); + const auto Style20 = getLLVMStyleWithColumns(20); + verifyFormat("#define X \\\n" + " /* \\\n" + " Test \\\n" + " Macro comment \\\n" + " with a long \\\n" + " line \\\n" + " */ \\\n" + " A + B", + "#define X \\\n" + " /*\n" + " Test\n" + " Macro comment with a long line\n" + " */ \\\n" + " A + B", + Style20); + verifyFormat("#define X \\\n" + " /* Macro comment \\\n" + " with a long \\\n" + " line */ \\\n" + " A + B", + "#define X \\\n" + " /* Macro comment with a long\n" + " line */ \\\n" + " A + B", + Style20); + verifyFormat("#define X \\\n" + " /* Macro comment \\\n" + " * with a long \\\n" + " * line */ \\\n" + " A + B", + "#define X \\\n" + " /* Macro comment with a long line */ \\\n" + " A + B", + Style20); } TEST_F(FormatTestComments, KeepsTrailingPPCommentsAndSectionCommentsSeparate) { verifyFormat("#ifdef A // line about A\n" "// section comment\n" - "#endif", - getLLVMStyleWithColumns(80)); + "#endif"); + verifyFormat("#ifdef A // line 1 about A\n" + " // line 2 about A\n" + "// section comment\n" + "#endif"); verifyFormat("#ifdef A // line 1 about A\n" " // line 2 about A\n" "// section comment\n" "#endif", - getLLVMStyleWithColumns(80)); - EXPECT_EQ("#ifdef A // line 1 about A\n" - " // line 2 about A\n" - "// section comment\n" - "#endif", - format("#ifdef A // line 1 about A\n" - " // line 2 about A\n" - "// section comment\n" - "#endif", - getLLVMStyleWithColumns(80))); + "#ifdef A // line 1 about A\n" + " // line 2 about A\n" + "// section comment\n" + "#endif"); verifyFormat("int f() {\n" " int i;\n" "#ifdef A // comment about A\n" @@ -1438,46 +1430,46 @@ TEST_F(FormatTestComments, KeepsTrailingPPCommentsAndSectionCommentsSeparate) { " // section comment 3\n" " i = 4;\n" "#endif\n" - "}", - getLLVMStyleWithColumns(80)); + "}"); } TEST_F(FormatTestComments, AlignsPPElseEndifComments) { + const auto Style20 = getLLVMStyleWithColumns(20); verifyFormat("#if A\n" "#else 
// A\n" "int iiii;\n" "#endif // B", - getLLVMStyleWithColumns(20)); + Style20); verifyFormat("#if A\n" "#else // A\n" "int iiii; // CC\n" "#endif // B", - getLLVMStyleWithColumns(20)); - EXPECT_EQ("#if A\n" - "#else // A1\n" - " // A2\n" - "int ii;\n" - "#endif // B", - format("#if A\n" - "#else // A1\n" - " // A2\n" - "int ii;\n" - "#endif // B", - getLLVMStyleWithColumns(20))); + Style20); + verifyFormat("#if A\n" + "#else // A1\n" + " // A2\n" + "int ii;\n" + "#endif // B", + "#if A\n" + "#else // A1\n" + " // A2\n" + "int ii;\n" + "#endif // B", + Style20); } TEST_F(FormatTestComments, CommentsInStaticInitializers) { - EXPECT_EQ( + verifyFormat( "static SomeType type = {aaaaaaaaaaaaaaaaaaaa, /* comment */\n" " aaaaaaaaaaaaaaaaaaaa /* comment */,\n" " /* comment */ aaaaaaaaaaaaaaaaaaaa,\n" " aaaaaaaaaaaaaaaaaaaa, // comment\n" " aaaaaaaaaaaaaaaaaaaa};", - format("static SomeType type = { aaaaaaaaaaaaaaaaaaaa , /* comment */\n" - " aaaaaaaaaaaaaaaaaaaa /* comment */ ,\n" - " /* comment */ aaaaaaaaaaaaaaaaaaaa ,\n" - " aaaaaaaaaaaaaaaaaaaa , // comment\n" - " aaaaaaaaaaaaaaaaaaaa };")); + "static SomeType type = { aaaaaaaaaaaaaaaaaaaa , /* comment */\n" + " aaaaaaaaaaaaaaaaaaaa /* comment */ ,\n" + " /* comment */ aaaaaaaaaaaaaaaaaaaa ,\n" + " aaaaaaaaaaaaaaaaaaaa , // comment\n" + " aaaaaaaaaaaaaaaaaaaa };"); verifyFormat("static SomeType type = {aaaaaaaaaaa, // comment for aa...\n" " bbbbbbbbbbb, ccccccccccc};"); verifyFormat("static SomeType type = {aaaaaaaaaaa,\n" @@ -1500,32 +1492,32 @@ TEST_F(FormatTestComments, CommentsInStaticInitializers) { " {// Group #3\n" " g, h, i}};"); - EXPECT_EQ("S s = {\n" - " // Some comment\n" - " a,\n" - "\n" - " // Comment after empty line\n" - " b}", - format("S s = {\n" - " // Some comment\n" - " a,\n" - " \n" - " // Comment after empty line\n" - " b\n" - "}")); - EXPECT_EQ("S s = {\n" - " /* Some comment */\n" - " a,\n" - "\n" - " /* Comment after empty line */\n" - " b}", - format("S s = {\n" - " /* Some comment */\n" - " 
a,\n" - " \n" - " /* Comment after empty line */\n" - " b\n" - "}")); + verifyFormat("S s = {\n" + " // Some comment\n" + " a,\n" + "\n" + " // Comment after empty line\n" + " b}", + "S s = {\n" + " // Some comment\n" + " a,\n" + " \n" + " // Comment after empty line\n" + " b\n" + "}"); + verifyFormat("S s = {\n" + " /* Some comment */\n" + " a,\n" + "\n" + " /* Comment after empty line */\n" + " b}", + "S s = {\n" + " /* Some comment */\n" + " a,\n" + " \n" + " /* Comment after empty line */\n" + " b\n" + "}"); verifyFormat("const uint8_t aaaaaaaaaaaaaaaaaaaaaa[0] = {\n" " 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // comment\n" " 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // comment\n" @@ -1533,486 +1525,485 @@ TEST_F(FormatTestComments, CommentsInStaticInitializers) { } TEST_F(FormatTestComments, LineCommentsAfterRightBrace) { - EXPECT_EQ("if (true) { // comment about branch\n" - " // comment about f\n" - " f();\n" - "}", - format("if (true) { // comment about branch\n" - " // comment about f\n" - " f();\n" - "}", - getLLVMStyleWithColumns(80))); - EXPECT_EQ("if (1) { // if line 1\n" - " // if line 2\n" - " // if line 3\n" - " // f line 1\n" - " // f line 2\n" - " f();\n" - "} else { // else line 1\n" - " // else line 2\n" - " // else line 3\n" - " // g line 1\n" - " g();\n" - "}", - format("if (1) { // if line 1\n" - " // if line 2\n" - " // if line 3\n" - " // f line 1\n" - " // f line 2\n" - " f();\n" - "} else { // else line 1\n" - " // else line 2\n" - " // else line 3\n" - " // g line 1\n" - " g();\n" - "}")); - EXPECT_EQ("do { // line 1\n" - " // line 2\n" - " // line 3\n" - " f();\n" - "} while (true);", - format("do { // line 1\n" - " // line 2\n" - " // line 3\n" - " f();\n" - "} while (true);", - getLLVMStyleWithColumns(80))); - EXPECT_EQ("while (a < b) { // line 1\n" - " // line 2\n" - " // line 3\n" - " f();\n" - "}", - format("while (a < b) {// line 1\n" - " // line 2\n" - " // line 3\n" - " f();\n" - "}", - getLLVMStyleWithColumns(80))); + verifyFormat("if 
(true) { // comment about branch\n" + " // comment about f\n" + " f();\n" + "}", + "if (true) { // comment about branch\n" + " // comment about f\n" + " f();\n" + "}"); + verifyFormat("if (1) { // if line 1\n" + " // if line 2\n" + " // if line 3\n" + " // f line 1\n" + " // f line 2\n" + " f();\n" + "} else { // else line 1\n" + " // else line 2\n" + " // else line 3\n" + " // g line 1\n" + " g();\n" + "}", + "if (1) { // if line 1\n" + " // if line 2\n" + " // if line 3\n" + " // f line 1\n" + " // f line 2\n" + " f();\n" + "} else { // else line 1\n" + " // else line 2\n" + " // else line 3\n" + " // g line 1\n" + " g();\n" + "}"); + verifyFormat("do { // line 1\n" + " // line 2\n" + " // line 3\n" + " f();\n" + "} while (true);", + "do { // line 1\n" + " // line 2\n" + " // line 3\n" + " f();\n" + "} while (true);"); + verifyFormat("while (a < b) { // line 1\n" + " // line 2\n" + " // line 3\n" + " f();\n" + "}", + "while (a < b) {// line 1\n" + " // line 2\n" + " // line 3\n" + " f();\n" + "}"); } TEST_F(FormatTestComments, ReflowsComments) { + const auto Style20 = getLLVMStyleWithColumns(20); + const auto Style22 = getLLVMStyleWithColumns(22); // Break a long line and reflow with the full next line. - EXPECT_EQ("// long long long\n" - "// long long", - format("// long long long long\n" - "// long", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long long", + "// long long long long\n" + "// long", + Style20); // Keep the trailing newline while reflowing. - EXPECT_EQ("// long long long\n" - "// long long", - format("// long long long long\n" - "// long", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long long", + "// long long long long\n" + "// long", + Style20); // Break a long line and reflow with a part of the next line. 
- EXPECT_EQ("// long long long\n" - "// long long\n" - "// long_long", - format("// long long long long\n" - "// long long_long", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long long\n" + "// long_long", + "// long long long long\n" + "// long long_long", + Style20); // Break but do not reflow if the first word from the next line is too long. - EXPECT_EQ("// long long long\n" - "// long\n" - "// long_long_long", - format("// long long long long\n" - "// long_long_long", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long\n" + "// long_long_long", + "// long long long long\n" + "// long_long_long", + Style20); // Don't break or reflow short lines. verifyFormat("// long\n" "// long long long lo\n" "// long long long lo\n" "// long", - getLLVMStyleWithColumns(20)); + Style20); // Keep prefixes and decorations while reflowing. - EXPECT_EQ("/// long long long\n" - "/// long long", - format("/// long long long long\n" - "/// long", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("//! long long long\n" - "//! long long", - format("//! long long long long\n" - "//! long", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("/* long long long\n" - " * long long */", - format("/* long long long long\n" - " * long */", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("///< long long long\n" - "///< long long", - format("///< long long long long\n" - "///< long", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("//!< long long long\n" - "//!< long long", - format("//!< long long long long\n" - "//!< long", - getLLVMStyleWithColumns(20))); + verifyFormat("/// long long long\n" + "/// long long", + "/// long long long long\n" + "/// long", + Style20); + verifyFormat("//! long long long\n" + "//! long long", + "//! long long long long\n" + "//! 
long", + Style20); + verifyFormat("/* long long long\n" + " * long long */", + "/* long long long long\n" + " * long */", + Style20); + verifyFormat("///< long long long\n" + "///< long long", + "///< long long long long\n" + "///< long", + Style20); + verifyFormat("//!< long long long\n" + "//!< long long", + "//!< long long long long\n" + "//!< long", + Style20); // Don't bring leading whitespace up while reflowing. - EXPECT_EQ("/* long long long\n" - " * long long long\n" - " */", - format("/* long long long long\n" - " * long long\n" - " */", - getLLVMStyleWithColumns(20))); + verifyFormat("/* long long long\n" + " * long long long\n" + " */", + "/* long long long long\n" + " * long long\n" + " */", + Style20); // Reflow the last line of a block comment with its trailing '*/'. - EXPECT_EQ("/* long long long\n" - " long long */", - format("/* long long long long\n" - " long */", - getLLVMStyleWithColumns(20))); + verifyFormat("/* long long long\n" + " long long */", + "/* long long long long\n" + " long */", + Style20); // Reflow two short lines; keep the postfix of the last one. - EXPECT_EQ("/* long long long\n" - " * long long long */", - format("/* long long long long\n" - " * long\n" - " * long */", - getLLVMStyleWithColumns(20))); + verifyFormat("/* long long long\n" + " * long long long */", + "/* long long long long\n" + " * long\n" + " * long */", + Style20); // Put the postfix of the last short reflow line on a newline if it doesn't // fit. - EXPECT_EQ("/* long long long\n" - " * long long longg\n" - " */", - format("/* long long long long\n" - " * long\n" - " * longg */", - getLLVMStyleWithColumns(20))); + verifyFormat("/* long long long\n" + " * long long longg\n" + " */", + "/* long long long long\n" + " * long\n" + " * longg */", + Style20); // Reflow lines with leading whitespace. 
- EXPECT_EQ("{\n" - " /*\n" - " * long long long\n" - " * long long long\n" - " * long long long\n" - " */\n" - "}", - format("{\n" - "/*\n" - " * long long long long\n" - " * long\n" - " * long long long long\n" - " */\n" - "}", - getLLVMStyleWithColumns(20))); + verifyFormat("{\n" + " /*\n" + " * long long long\n" + " * long long long\n" + " * long long long\n" + " */\n" + "}", + "{\n" + "/*\n" + " * long long long long\n" + " * long\n" + " * long long long long\n" + " */\n" + "}", + Style20); // Break single line block comments that are first in the line with ' *' // decoration. - EXPECT_EQ("/* long long long\n" - " * long */", - format("/* long long long long */", getLLVMStyleWithColumns(20))); + verifyFormat("/* long long long\n" + " * long */", + "/* long long long long */", Style20); // Break single line block comment that are not first in the line with ' ' // decoration. - EXPECT_EQ("int i; /* long long\n" - " long */", - format("int i; /* long long long */", getLLVMStyleWithColumns(20))); + verifyFormat("int i; /* long long\n" + " long */", + "int i; /* long long long */", Style20); // Reflow a line that goes just over the column limit. - EXPECT_EQ("// long long long\n" - "// lon long", - format("// long long long lon\n" - "// long", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// lon long", + "// long long long lon\n" + "// long", + Style20); // Stop reflowing if the next line has a different indentation than the // previous line. - EXPECT_EQ("// long long long\n" - "// long\n" - "// long long\n" - "// long", - format("// long long long long\n" - "// long long\n" - "// long", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long\n" + "// long long\n" + "// long", + "// long long long long\n" + "// long long\n" + "// long", + Style20); // Reflow into the last part of a really long line that has been broken into // multiple lines. 
- EXPECT_EQ("// long long long\n" - "// long long long\n" - "// long long long", - format("// long long long long long long long long\n" - "// long", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long long long\n" + "// long long long", + "// long long long long long long long long\n" + "// long", + Style20); // Break the first line, then reflow the beginning of the second and third // line up. - EXPECT_EQ("// long long long\n" - "// lon1 lon2 lon2\n" - "// lon2 lon3 lon3", - format("// long long long lon1\n" - "// lon2 lon2 lon2\n" - "// lon3 lon3", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// lon1 lon2 lon2\n" + "// lon2 lon3 lon3", + "// long long long lon1\n" + "// lon2 lon2 lon2\n" + "// lon3 lon3", + Style20); // Reflow the beginning of the second line, then break the rest. - EXPECT_EQ("// long long long\n" - "// lon1 lon2 lon2\n" - "// lon2 lon2 lon2\n" - "// lon3", - format("// long long long lon1\n" - "// lon2 lon2 lon2 lon2 lon2 lon3", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// lon1 lon2 lon2\n" + "// lon2 lon2 lon2\n" + "// lon3", + "// long long long lon1\n" + "// lon2 lon2 lon2 lon2 lon2 lon3", + Style20); // Shrink the first line, then reflow the second line up. - EXPECT_EQ("// long long long", format("// long long\n" - "// long", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long", + "// long long\n" + "// long", + Style20); // Don't shrink leading whitespace. - verifyNoChange("int i; /// a", getLLVMStyleWithColumns(20)); + verifyNoChange("int i; /// a", Style20); // Shrink trailing whitespace if there is no postfix and reflow. - EXPECT_EQ("// long long long\n" - "// long long", - format("// long long long long \n" - "// long", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long long", + "// long long long long \n" + "// long", + Style20); // Shrink trailing whitespace to a single one if there is postfix. 
- EXPECT_EQ("/* long long long */", - format("/* long long long */", getLLVMStyleWithColumns(20))); + verifyFormat("/* long long long */", "/* long long long */", Style20); // Break a block comment postfix if exceeding the line limit. - EXPECT_EQ("/* long\n" - " */", - format("/* long */", getLLVMStyleWithColumns(20))); + verifyFormat("/* long\n" + " */", + "/* long */", Style20); // Reflow indented comments. - EXPECT_EQ("{\n" - " // long long long\n" - " // long long\n" - " int i; /* long lon\n" - " g long\n" - " */\n" - "}", - format("{\n" - " // long long long long\n" - " // long\n" - " int i; /* long lon g\n" - " long */\n" - "}", - getLLVMStyleWithColumns(20))); + verifyFormat("{\n" + " // long long long\n" + " // long long\n" + " int i; /* long lon\n" + " g long\n" + " */\n" + "}", + "{\n" + " // long long long long\n" + " // long\n" + " int i; /* long lon g\n" + " long */\n" + "}", + Style20); // Don't realign trailing comments after reflow has happened. - EXPECT_EQ("// long long long\n" - "// long long\n" - "long i; // long", - format("// long long long long\n" - "// long\n" - "long i; // long", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("// long long long\n" - "// longng long long\n" - "// long lo", - format("// long long long longng\n" - "// long long long\n" - "// lo", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long long\n" + "long i; // long", + "// long long long long\n" + "// long\n" + "long i; // long", + Style20); + verifyFormat("// long long long\n" + "// longng long long\n" + "// long lo", + "// long long long longng\n" + "// long long long\n" + "// lo", + Style20); // Reflow lines after a broken line. 
- EXPECT_EQ("int a; // Trailing\n" - " // comment on\n" - " // 2 or 3\n" - " // lines.", - format("int a; // Trailing comment\n" - " // on 2\n" - " // or 3\n" - " // lines.", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("/// This long line\n" - "/// gets reflown.", - format("/// This long line gets\n" - "/// reflown.", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("//! This long line\n" - "//! gets reflown.", - format(" //! This long line gets\n" - " //! reflown.", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("/* This long line\n" - " * gets reflown.\n" - " */", - format("/* This long line gets\n" - " * reflown.\n" - " */", - getLLVMStyleWithColumns(20))); + verifyFormat("int a; // Trailing\n" + " // comment on\n" + " // 2 or 3\n" + " // lines.", + "int a; // Trailing comment\n" + " // on 2\n" + " // or 3\n" + " // lines.", + Style20); + verifyFormat("/// This long line\n" + "/// gets reflown.", + "/// This long line gets\n" + "/// reflown.", + Style20); + verifyFormat("//! This long line\n" + "//! gets reflown.", + " //! This long line gets\n" + " //! reflown.", + Style20); + verifyFormat("/* This long line\n" + " * gets reflown.\n" + " */", + "/* This long line gets\n" + " * reflown.\n" + " */", + Style20); // Reflow after indentation makes a line too long. - EXPECT_EQ("{\n" - " // long long long\n" - " // lo long\n" - "}", - format("{\n" - "// long long long lo\n" - "// long\n" - "}", - getLLVMStyleWithColumns(20))); + verifyFormat("{\n" + " // long long long\n" + " // lo long\n" + "}", + "{\n" + "// long long long lo\n" + "// long\n" + "}", + Style20); // Break and reflow multiple lines. - EXPECT_EQ("/*\n" - " * Reflow the end of\n" - " * line by 11 22 33\n" - " * 4.\n" - " */", - format("/*\n" - " * Reflow the end of line\n" - " * by\n" - " * 11\n" - " * 22\n" - " * 33\n" - " * 4.\n" - " */", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("/// First line gets\n" - "/// broken. Second\n" - "/// line gets\n" - "/// reflown and\n" - "/// broken. 
Third\n" - "/// gets reflown.", - format("/// First line gets broken.\n" - "/// Second line gets reflown and broken.\n" - "/// Third gets reflown.", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("int i; // first long\n" - " // long snd\n" - " // long.", - format("int i; // first long long\n" - " // snd long.", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("{\n" - " // first long line\n" - " // line second\n" - " // long line line\n" - " // third long line\n" - " // line\n" - "}", - format("{\n" - " // first long line line\n" - " // second long line line\n" - " // third long line line\n" - "}", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("int i; /* first line\n" - " * second\n" - " * line third\n" - " * line\n" - " */", - format("int i; /* first line\n" - " * second line\n" - " * third line\n" - " */", - getLLVMStyleWithColumns(20))); + verifyFormat("/*\n" + " * Reflow the end of\n" + " * line by 11 22 33\n" + " * 4.\n" + " */", + "/*\n" + " * Reflow the end of line\n" + " * by\n" + " * 11\n" + " * 22\n" + " * 33\n" + " * 4.\n" + " */", + Style20); + verifyFormat("/// First line gets\n" + "/// broken. Second\n" + "/// line gets\n" + "/// reflown and\n" + "/// broken. 
Third\n" + "/// gets reflown.", + "/// First line gets broken.\n" + "/// Second line gets reflown and broken.\n" + "/// Third gets reflown.", + Style20); + verifyFormat("int i; // first long\n" + " // long snd\n" + " // long.", + "int i; // first long long\n" + " // snd long.", + Style20); + verifyFormat("{\n" + " // first long line\n" + " // line second\n" + " // long line line\n" + " // third long line\n" + " // line\n" + "}", + "{\n" + " // first long line line\n" + " // second long line line\n" + " // third long line line\n" + "}", + Style20); + verifyFormat("int i; /* first line\n" + " * second\n" + " * line third\n" + " * line\n" + " */", + "int i; /* first line\n" + " * second line\n" + " * third line\n" + " */", + Style20); // Reflow the last two lines of a section that starts with a line having // different indentation. - EXPECT_EQ("// long\n" - "// long long long\n" - "// long long", - format("// long\n" - "// long long long long\n" - "// long", - getLLVMStyleWithColumns(20))); + verifyFormat("// long\n" + "// long long long\n" + "// long long", + "// long\n" + "// long long long long\n" + "// long", + Style20); // Keep the block comment endling '*/' while reflowing. - EXPECT_EQ("/* Long long long\n" - " * line short */", - format("/* Long long long line\n" - " * short */", - getLLVMStyleWithColumns(20))); + verifyFormat("/* Long long long\n" + " * line short */", + "/* Long long long line\n" + " * short */", + Style20); // Don't reflow between separate blocks of comments. - EXPECT_EQ("/* First comment\n" - " * block will */\n" - "/* Snd\n" - " */", - format("/* First comment block\n" - " * will */\n" - "/* Snd\n" - " */", - getLLVMStyleWithColumns(20))); + verifyFormat("/* First comment\n" + " * block will */\n" + "/* Snd\n" + " */", + "/* First comment block\n" + " * will */\n" + "/* Snd\n" + " */", + Style20); // Don't reflow across blank comment lines. 
- EXPECT_EQ("int i; // This long\n" - " // line gets\n" - " // broken.\n" - " //\n" - " // keep.", - format("int i; // This long line gets broken.\n" - " // \n" - " // keep.", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("{\n" - " /// long long long\n" - " /// long long\n" - " ///\n" - " /// long\n" - "}", - format("{\n" - " /// long long long long\n" - " /// long\n" - " ///\n" - " /// long\n" - "}", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("//! long long long\n" - "//! long\n" - "\n" - "//! long", - format("//! long long long long\n" - "\n" - "//! long", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("/* long long long\n" - " long\n" - "\n" - " long */", - format("/* long long long long\n" - "\n" - " long */", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("/* long long long\n" - " * long\n" - " *\n" - " * long */", - format("/* long long long long\n" - " *\n" - " * long */", - getLLVMStyleWithColumns(20))); + verifyFormat("int i; // This long\n" + " // line gets\n" + " // broken.\n" + " //\n" + " // keep.", + "int i; // This long line gets broken.\n" + " // \n" + " // keep.", + Style20); + verifyFormat("{\n" + " /// long long long\n" + " /// long long\n" + " ///\n" + " /// long\n" + "}", + "{\n" + " /// long long long long\n" + " /// long\n" + " ///\n" + " /// long\n" + "}", + Style20); + verifyFormat("//! long long long\n" + "//! long\n" + "\n" + "//! long", + "//! long long long long\n" + "\n" + "//! long", + Style20); + verifyFormat("/* long long long\n" + " long\n" + "\n" + " long */", + "/* long long long long\n" + "\n" + " long */", + Style20); + verifyFormat("/* long long long\n" + " * long\n" + " *\n" + " * long */", + "/* long long long long\n" + " *\n" + " * long */", + Style20); // Don't reflow lines having content that is a single character. 
- EXPECT_EQ("// long long long\n" - "// long\n" - "// l", - format("// long long long long\n" - "// l", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long\n" + "// l", + "// long long long long\n" + "// l", + Style20); // Don't reflow lines starting with two punctuation characters. - EXPECT_EQ("// long long long\n" - "// long\n" - "// ... --- ...", - format("// long long long long\n" - "// ... --- ...", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long\n" + "// ... --- ...", + "// long long long long\n" + "// ... --- ...", + Style20); // Don't reflow lines starting with '@'. - EXPECT_EQ("// long long long\n" - "// long\n" - "// @param arg", - format("// long long long long\n" - "// @param arg", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long\n" + "// @param arg", + "// long long long long\n" + "// @param arg", + Style20); // Don't reflow lines starting with '\'. verifyFormat("// long long long\n" @@ -2020,433 +2011,435 @@ TEST_F(FormatTestComments, ReflowsComments) { "// \\param arg", "// long long long long\n" "// \\param arg", - getLLVMStyleWithColumns(20)); + Style20); // Don't reflow lines starting with 'TODO'. - EXPECT_EQ("// long long long\n" - "// long\n" - "// TODO: long", - format("// long long long long\n" - "// TODO: long", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long\n" + "// TODO: long", + "// long long long long\n" + "// TODO: long", + Style20); // Don't reflow lines starting with 'FIXME'. - EXPECT_EQ("// long long long\n" - "// long\n" - "// FIXME: long", - format("// long long long long\n" - "// FIXME: long", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long\n" + "// FIXME: long", + "// long long long long\n" + "// FIXME: long", + Style20); // Don't reflow lines starting with 'XXX'. 
- EXPECT_EQ("// long long long\n" - "// long\n" - "// XXX: long", - format("// long long long long\n" - "// XXX: long", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long\n" + "// XXX: long", + "// long long long long\n" + "// XXX: long", + Style20); // Don't reflow comment pragmas. - EXPECT_EQ("// long long long\n" - "// long\n" - "// IWYU pragma:", - format("// long long long long\n" - "// IWYU pragma:", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("/* long long long\n" - " * long\n" - " * IWYU pragma:\n" - " */", - format("/* long long long long\n" - " * IWYU pragma:\n" - " */", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long\n" + "// IWYU pragma:", + "// long long long long\n" + "// IWYU pragma:", + Style20); + verifyFormat("/* long long long\n" + " * long\n" + " * IWYU pragma:\n" + " */", + "/* long long long long\n" + " * IWYU pragma:\n" + " */", + Style20); // Reflow lines that have a non-punctuation character among their first 2 // characters. - EXPECT_EQ("// long long long\n" - "// long 'long'", - format("// long long long long\n" - "// 'long'", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long 'long'", + "// long long long long\n" + "// 'long'", + Style20); // Don't reflow between separate blocks of comments. - EXPECT_EQ("/* First comment\n" - " * block will */\n" - "/* Snd\n" - " */", - format("/* First comment block\n" - " * will */\n" - "/* Snd\n" - " */", - getLLVMStyleWithColumns(20))); + verifyFormat("/* First comment\n" + " * block will */\n" + "/* Snd\n" + " */", + "/* First comment block\n" + " * will */\n" + "/* Snd\n" + " */", + Style20); // Don't reflow lines having different indentation. 
- EXPECT_EQ("// long long long\n" - "// long\n" - "// long", - format("// long long long long\n" - "// long", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long\n" + "// long", + "// long long long long\n" + "// long", + Style20); // Don't reflow separate bullets in list - EXPECT_EQ("// - long long long\n" - "// long\n" - "// - long", - format("// - long long long long\n" - "// - long", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("// * long long long\n" - "// long\n" - "// * long", - format("// * long long long long\n" - "// * long", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("// + long long long\n" - "// long\n" - "// + long", - format("// + long long long long\n" - "// + long", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("// 1. long long long\n" - "// long\n" - "// 2. long", - format("// 1. long long long long\n" - "// 2. long", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("// -# long long long\n" - "// long\n" - "// -# long", - format("// -# long long long long\n" - "// -# long", - getLLVMStyleWithColumns(20))); - - EXPECT_EQ("// - long long long\n" - "// long long long\n" - "// - long", - format("// - long long long long\n" - "// long long\n" - "// - long", - getLLVMStyleWithColumns(20))); - EXPECT_EQ("// - long long long\n" - "// long long long\n" - "// long\n" - "// - long", - format("// - long long long long\n" - "// long long long\n" - "// - long", - getLLVMStyleWithColumns(20))); + verifyFormat("// - long long long\n" + "// long\n" + "// - long", + "// - long long long long\n" + "// - long", + Style20); + verifyFormat("// * long long long\n" + "// long\n" + "// * long", + "// * long long long long\n" + "// * long", + Style20); + verifyFormat("// + long long long\n" + "// long\n" + "// + long", + "// + long long long long\n" + "// + long", + Style20); + verifyFormat("// 1. long long long\n" + "// long\n" + "// 2. long", + "// 1. long long long long\n" + "// 2. 
long", + Style20); + verifyFormat("// -# long long long\n" + "// long\n" + "// -# long", + "// -# long long long long\n" + "// -# long", + Style20); + + verifyFormat("// - long long long\n" + "// long long long\n" + "// - long", + "// - long long long long\n" + "// long long\n" + "// - long", + Style20); + verifyFormat("// - long long long\n" + "// long long long\n" + "// long\n" + "// - long", + "// - long long long long\n" + "// long long long\n" + "// - long", + Style20); // Large number (>2 digits) are not list items - EXPECT_EQ("// long long long\n" - "// long 1024. long.", - format("// long long long long\n" - "// 1024. long.", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long long\n" + "// long 1024. long.", + "// long long long long\n" + "// 1024. long.", + Style20); // Do not break before number, to avoid introducing a non-reflowable doxygen // list item. - EXPECT_EQ("// long long\n" - "// long 10. long.", - format("// long long long 10.\n" - "// long.", - getLLVMStyleWithColumns(20))); + verifyFormat("// long long\n" + "// long 10. long.", + "// long long long 10.\n" + "// long.", + Style20); // Don't break or reflow after implicit string literals. verifyFormat("#include // l l l\n" " // l", - getLLVMStyleWithColumns(20)); + Style20); // Don't break or reflow comments on import lines. - EXPECT_EQ("#include \"t\" /* l l l\n" - " * l */", - format("#include \"t\" /* l l l\n" - " * l */", - getLLVMStyleWithColumns(20))); + verifyFormat("#include \"t\" /* l l l\n" + " * l */", + "#include \"t\" /* l l l\n" + " * l */", + Style20); // Don't reflow between different trailing comment sections. 
- EXPECT_EQ("int i; // long long\n" - " // long\n" - "int j; // long long\n" - " // long", - format("int i; // long long long\n" - "int j; // long long long", - getLLVMStyleWithColumns(20))); + verifyFormat("int i; // long long\n" + " // long\n" + "int j; // long long\n" + " // long", + "int i; // long long long\n" + "int j; // long long long", + Style20); // Don't reflow if the first word on the next line is longer than the // available space at current line. - EXPECT_EQ("int i; // trigger\n" - " // reflow\n" - " // longsec", - format("int i; // trigger reflow\n" - " // longsec", - getLLVMStyleWithColumns(20))); + verifyFormat("int i; // trigger\n" + " // reflow\n" + " // longsec", + "int i; // trigger reflow\n" + " // longsec", + Style20); // Simple case that correctly handles reflow in parameter lists. - EXPECT_EQ("a = f(/* looooooooong\n" - " * long long\n" - " */\n" - " a);", - format("a = f(/* looooooooong long\n* long\n*/ a);", - getLLVMStyleWithColumns(22))); + verifyFormat("a = f(/* looooooooong\n" + " * long long\n" + " */\n" + " a);", + "a = f(/* looooooooong long\n* long\n*/ a);", Style22); // Tricky case that has fewer lines if we reflow the comment, ending up with // fewer lines. - EXPECT_EQ("a = f(/* loooooong\n" - " * long long\n" - " */\n" - " a);", - format("a = f(/* loooooong long\n* long\n*/ a);", - getLLVMStyleWithColumns(22))); + verifyFormat("a = f(/* loooooong\n" + " * long long\n" + " */\n" + " a);", + "a = f(/* loooooong long\n* long\n*/ a);", Style22); // Keep empty comment lines. 
- EXPECT_EQ("/**/", format(" /**/", getLLVMStyleWithColumns(20))); - EXPECT_EQ("/* */", format(" /* */", getLLVMStyleWithColumns(20))); - EXPECT_EQ("/* */", format(" /* */", getLLVMStyleWithColumns(20))); - EXPECT_EQ("//", format(" // ", getLLVMStyleWithColumns(20))); - EXPECT_EQ("///", format(" /// ", getLLVMStyleWithColumns(20))); + verifyFormat("/**/", " /**/", Style20); + verifyFormat("/* */", " /* */", Style20); + verifyFormat("/* */", " /* */", Style20); + verifyFormat("//", " // ", Style20); + verifyFormat("///", " /// ", Style20); } TEST_F(FormatTestComments, ReflowsCommentsPrecise) { + auto Style = getLLVMStyleWithColumns(20); + // FIXME: This assumes we do not continue compressing whitespace once we are // in reflow mode. Consider compressing whitespace. // Test that we stop reflowing precisely at the column limit. // After reflowing, "// reflows into foo" does not fit the column limit, // so we compress the whitespace. - EXPECT_EQ("// some text that\n" - "// reflows into foo", - format("// some text that reflows\n" - "// into foo", - getLLVMStyleWithColumns(20))); + verifyFormat("// some text that\n" + "// reflows into foo", + "// some text that reflows\n" + "// into foo", + Style); + Style.ColumnLimit = 21; // Given one more column, "// reflows into foo" does fit the limit, so we // do not compress the whitespace. - EXPECT_EQ("// some text that\n" - "// reflows into foo", - format("// some text that reflows\n" - "// into foo", - getLLVMStyleWithColumns(21))); + verifyFormat("// some text that\n" + "// reflows into foo", + "// some text that reflows\n" + "// into foo", + Style); // Make sure that we correctly account for the space added in the reflow case // when making the reflowing decision. // First, when the next line ends precisely one column over the limit, do not // reflow. 
- EXPECT_EQ("// some text that\n" - "// reflows\n" - "// into1234567", - format("// some text that reflows\n" - "// into1234567", - getLLVMStyleWithColumns(21))); + verifyFormat("// some text that\n" + "// reflows\n" + "// into1234567", + "// some text that reflows\n" + "// into1234567", + Style); // Secondly, when the next line ends later, but the first word in that line // is precisely one column over the limit, do not reflow. - EXPECT_EQ("// some text that\n" - "// reflows\n" - "// into1234567 f", - format("// some text that reflows\n" - "// into1234567 f", - getLLVMStyleWithColumns(21))); + verifyFormat("// some text that\n" + "// reflows\n" + "// into1234567 f", + "// some text that reflows\n" + "// into1234567 f", + Style); } TEST_F(FormatTestComments, ReflowsCommentsWithExtraWhitespace) { + const auto Style16 = getLLVMStyleWithColumns(16); // Baseline. - EXPECT_EQ("// some text\n" - "// that re flows", - format("// some text that\n" - "// re flows", - getLLVMStyleWithColumns(16))); - EXPECT_EQ("// some text\n" - "// that re flows", - format("// some text that\n" - "// re flows", - getLLVMStyleWithColumns(16))); - EXPECT_EQ("/* some text\n" - " * that re flows\n" - " */", - format("/* some text that\n" - "* re flows\n" - "*/", - getLLVMStyleWithColumns(16))); + verifyFormat("// some text\n" + "// that re flows", + "// some text that\n" + "// re flows", + Style16); + verifyFormat("// some text\n" + "// that re flows", + "// some text that\n" + "// re flows", + Style16); + verifyFormat("/* some text\n" + " * that re flows\n" + " */", + "/* some text that\n" + "* re flows\n" + "*/", + Style16); // FIXME: We do not reflow if the indent of two subsequent lines differs; // given that this is different behavior from block comments, do we want // to keep this? 
- EXPECT_EQ("// some text\n" - "// that\n" - "// re flows", - format("// some text that\n" - "// re flows", - getLLVMStyleWithColumns(16))); + verifyFormat("// some text\n" + "// that\n" + "// re flows", + "// some text that\n" + "// re flows", + Style16); // Space within parts of a line that fit. // FIXME: Use the earliest possible split while reflowing to compress the // whitespace within the line. - EXPECT_EQ("// some text that\n" - "// does re flow\n" - "// more here", - format("// some text that does\n" - "// re flow more here", - getLLVMStyleWithColumns(21))); + verifyFormat("// some text that\n" + "// does re flow\n" + "// more here", + "// some text that does\n" + "// re flow more here", + getLLVMStyleWithColumns(21)); } TEST_F(FormatTestComments, IgnoresIf0Contents) { - EXPECT_EQ("#if 0\n" - "}{)(&*(^%%#%@! fsadj f;ldjs ,:;| <<<>>>][)(][\n" - "#endif\n" - "void f() {}", - format("#if 0\n" - "}{)(&*(^%%#%@! fsadj f;ldjs ,:;| <<<>>>][)(][\n" - "#endif\n" - "void f( ) { }")); - EXPECT_EQ("#if false\n" - "void f( ) { }\n" - "#endif\n" - "void g() {}", - format("#if false\n" - "void f( ) { }\n" - "#endif\n" - "void g( ) { }")); - EXPECT_EQ("enum E {\n" - " One,\n" - " Two,\n" - "#if 0\n" - "Three,\n" - " Four,\n" - "#endif\n" - " Five\n" - "};", - format("enum E {\n" - " One,Two,\n" - "#if 0\n" - "Three,\n" - " Four,\n" - "#endif\n" - " Five};")); - EXPECT_EQ("enum F {\n" - " One,\n" - "#if 1\n" - " Two,\n" - "#if 0\n" - "Three,\n" - " Four,\n" - "#endif\n" - " Five\n" - "#endif\n" - "};", - format("enum F {\n" - "One,\n" - "#if 1\n" - "Two,\n" - "#if 0\n" - "Three,\n" - " Four,\n" - "#endif\n" - "Five\n" - "#endif\n" - "};")); - EXPECT_EQ("enum G {\n" - " One,\n" - "#if 0\n" - "Two,\n" - "#else\n" - " Three,\n" - "#endif\n" - " Four\n" - "};", - format("enum G {\n" - "One,\n" - "#if 0\n" - "Two,\n" - "#else\n" - "Three,\n" - "#endif\n" - "Four\n" - "};")); - EXPECT_EQ("enum H {\n" - " One,\n" - "#if 0\n" - "#ifdef Q\n" - "Two,\n" - "#else\n" - "Three,\n" - 
"#endif\n" - "#endif\n" - " Four\n" - "};", - format("enum H {\n" - "One,\n" - "#if 0\n" - "#ifdef Q\n" - "Two,\n" - "#else\n" - "Three,\n" - "#endif\n" - "#endif\n" - "Four\n" - "};")); - EXPECT_EQ("enum I {\n" - " One,\n" - "#if /* test */ 0 || 1\n" - "Two,\n" - "Three,\n" - "#endif\n" - " Four\n" - "};", - format("enum I {\n" - "One,\n" - "#if /* test */ 0 || 1\n" - "Two,\n" - "Three,\n" - "#endif\n" - "Four\n" - "};")); - EXPECT_EQ("enum J {\n" - " One,\n" - "#if 0\n" - "#if 0\n" - "Two,\n" - "#else\n" - "Three,\n" - "#endif\n" - "Four,\n" - "#endif\n" - " Five\n" - "};", - format("enum J {\n" - "One,\n" - "#if 0\n" - "#if 0\n" - "Two,\n" - "#else\n" - "Three,\n" - "#endif\n" - "Four,\n" - "#endif\n" - "Five\n" - "};")); + verifyFormat("#if 0\n" + "}{)(&*(^%%#%@! fsadj f;ldjs ,:;| <<<>>>][)(][\n" + "#endif\n" + "void f() {}", + "#if 0\n" + "}{)(&*(^%%#%@! fsadj f;ldjs ,:;| <<<>>>][)(][\n" + "#endif\n" + "void f( ) { }"); + verifyFormat("#if false\n" + "void f( ) { }\n" + "#endif\n" + "void g() {}", + "#if false\n" + "void f( ) { }\n" + "#endif\n" + "void g( ) { }"); + verifyFormat("enum E {\n" + " One,\n" + " Two,\n" + "#if 0\n" + "Three,\n" + " Four,\n" + "#endif\n" + " Five\n" + "};", + "enum E {\n" + " One,Two,\n" + "#if 0\n" + "Three,\n" + " Four,\n" + "#endif\n" + " Five};"); + verifyFormat("enum F {\n" + " One,\n" + "#if 1\n" + " Two,\n" + "#if 0\n" + "Three,\n" + " Four,\n" + "#endif\n" + " Five\n" + "#endif\n" + "};", + "enum F {\n" + "One,\n" + "#if 1\n" + "Two,\n" + "#if 0\n" + "Three,\n" + " Four,\n" + "#endif\n" + "Five\n" + "#endif\n" + "};"); + verifyFormat("enum G {\n" + " One,\n" + "#if 0\n" + "Two,\n" + "#else\n" + " Three,\n" + "#endif\n" + " Four\n" + "};", + "enum G {\n" + "One,\n" + "#if 0\n" + "Two,\n" + "#else\n" + "Three,\n" + "#endif\n" + "Four\n" + "};"); + verifyFormat("enum H {\n" + " One,\n" + "#if 0\n" + "#ifdef Q\n" + "Two,\n" + "#else\n" + "Three,\n" + "#endif\n" + "#endif\n" + " Four\n" + "};", + "enum H {\n" + "One,\n" + "#if 
0\n" + "#ifdef Q\n" + "Two,\n" + "#else\n" + "Three,\n" + "#endif\n" + "#endif\n" + "Four\n" + "};"); + verifyFormat("enum I {\n" + " One,\n" + "#if /* test */ 0 || 1\n" + "Two,\n" + "Three,\n" + "#endif\n" + " Four\n" + "};", + "enum I {\n" + "One,\n" + "#if /* test */ 0 || 1\n" + "Two,\n" + "Three,\n" + "#endif\n" + "Four\n" + "};"); + verifyFormat("enum J {\n" + " One,\n" + "#if 0\n" + "#if 0\n" + "Two,\n" + "#else\n" + "Three,\n" + "#endif\n" + "Four,\n" + "#endif\n" + " Five\n" + "};", + "enum J {\n" + "One,\n" + "#if 0\n" + "#if 0\n" + "Two,\n" + "#else\n" + "Three,\n" + "#endif\n" + "Four,\n" + "#endif\n" + "Five\n" + "};"); // Ignore stuff in SWIG-blocks. - EXPECT_EQ("#ifdef SWIG\n" - "}{)(&*(^%%#%@! fsadj f;ldjs ,:;| <<<>>>][)(][\n" - "#endif\n" - "void f() {}", - format("#ifdef SWIG\n" - "}{)(&*(^%%#%@! fsadj f;ldjs ,:;| <<<>>>][)(][\n" - "#endif\n" - "void f( ) { }")); - EXPECT_EQ("#ifndef SWIG\n" - "void f() {}\n" - "#endif", - format("#ifndef SWIG\n" - "void f( ) { }\n" - "#endif")); + verifyFormat("#ifdef SWIG\n" + "}{)(&*(^%%#%@! fsadj f;ldjs ,:;| <<<>>>][)(][\n" + "#endif\n" + "void f() {}", + "#ifdef SWIG\n" + "}{)(&*(^%%#%@! 
fsadj f;ldjs ,:;| <<<>>>][)(][\n" + "#endif\n" + "void f( ) { }"); + verifyFormat("#ifndef SWIG\n" + "void f() {}\n" + "#endif", + "#ifndef SWIG\n" + "void f( ) { }\n" + "#endif"); } TEST_F(FormatTestComments, DontCrashOnBlockComments) { - EXPECT_EQ( + verifyFormat( "int xxxxxxxxx; /* " "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\n" "zzzzzz\n" "0*/", - format("int xxxxxxxxx; /* " - "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy zzzzzz\n" - "0*/")); + "int xxxxxxxxx; /* " + "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy zzzzzz\n" + "0*/"); } TEST_F(FormatTestComments, BlockCommentsInControlLoops) { @@ -2470,225 +2463,229 @@ TEST_F(FormatTestComments, BlockCommentsInControlLoops) { } TEST_F(FormatTestComments, BlockComments) { - EXPECT_EQ("/* */ /* */ /* */\n/* */ /* */ /* */", - format("/* *//* */ /* */\n/* *//* */ /* */")); - EXPECT_EQ("/* */ a /* */ b;", format(" /* */ a/* */ b;")); - EXPECT_EQ("#define A /*123*/ \\\n" - " b\n" - "/* */\n" - "someCall(\n" - " parameter);", - format("#define A /*123*/ b\n" - "/* */\n" - "someCall(parameter);", - getLLVMStyleWithColumns(15))); - - EXPECT_EQ("#define A\n" - "/* */ someCall(\n" - " parameter);", - format("#define A\n" - "/* */someCall(parameter);", - getLLVMStyleWithColumns(15))); + const auto Style10 = getLLVMStyleWithColumns(10); + const auto Style15 = getLLVMStyleWithColumns(15); + + verifyFormat("/* */ /* */ /* */\n/* */ /* */ /* */", + "/* *//* */ /* */\n/* *//* */ /* */"); + verifyFormat("/* */ a /* */ b;", " /* */ a/* */ b;"); + verifyFormat("#define A /*123*/ \\\n" + " b\n" + "/* */\n" + "someCall(\n" + " parameter);", + "#define A /*123*/ b\n" + "/* */\n" + "someCall(parameter);", + Style15); + + verifyFormat("#define A\n" + "/* */ someCall(\n" + " parameter);", + "#define A\n" + "/* */someCall(parameter);", + Style15); verifyNoChange("/*\n**\n*/"); - EXPECT_EQ("/*\n" - " *\n" - " * aaaaaa\n" - " * aaaaaa\n" - " */", - format("/*\n" - "*\n" - " * aaaaaa aaaaaa\n" - "*/", - 
getLLVMStyleWithColumns(10))); - EXPECT_EQ("/*\n" - "**\n" - "* aaaaaa\n" - "* aaaaaa\n" - "*/", - format("/*\n" - "**\n" - "* aaaaaa aaaaaa\n" - "*/", - getLLVMStyleWithColumns(10))); - EXPECT_EQ("int aaaaaaaaaaaaaaaaaaaaaaaaaaaa =\n" - " /* line 1\n" - " bbbbbbbbbbbb */\n" - " bbbbbbbbbbbbbbbbbbbbbbbbbbbb;", - format("int aaaaaaaaaaaaaaaaaaaaaaaaaaaa =\n" - " /* line 1\n" - " bbbbbbbbbbbb */ bbbbbbbbbbbbbbbbbbbbbbbbbbbb;", - getLLVMStyleWithColumns(50))); + verifyFormat("/*\n" + " *\n" + " * aaaaaa\n" + " * aaaaaa\n" + " */", + "/*\n" + "*\n" + " * aaaaaa aaaaaa\n" + "*/", + Style10); + verifyFormat("/*\n" + "**\n" + "* aaaaaa\n" + "* aaaaaa\n" + "*/", + "/*\n" + "**\n" + "* aaaaaa aaaaaa\n" + "*/", + Style10); + verifyFormat("int aaaaaaaaaaaaaaaaaaaaaaaaaaaa =\n" + " /* line 1\n" + " bbbbbbbbbbbb */\n" + " bbbbbbbbbbbbbbbbbbbbbbbbbbbb;", + "int aaaaaaaaaaaaaaaaaaaaaaaaaaaa =\n" + " /* line 1\n" + " bbbbbbbbbbbb */ bbbbbbbbbbbbbbbbbbbbbbbbbbbb;", + getLLVMStyleWithColumns(50)); FormatStyle NoBinPacking = getLLVMStyle(); NoBinPacking.BinPackParameters = FormatStyle::BPPS_OnePerLine; - EXPECT_EQ("someFunction(1, /* comment 1 */\n" - " 2, /* comment 2 */\n" - " 3, /* comment 3 */\n" - " aaaa,\n" - " bbbb);", - format("someFunction (1, /* comment 1 */\n" - " 2, /* comment 2 */ \n" - " 3, /* comment 3 */\n" - "aaaa, bbbb );", - NoBinPacking)); + verifyFormat("someFunction(1, /* comment 1 */\n" + " 2, /* comment 2 */\n" + " 3, /* comment 3 */\n" + " aaaa,\n" + " bbbb);", + "someFunction (1, /* comment 1 */\n" + " 2, /* comment 2 */ \n" + " 3, /* comment 3 */\n" + "aaaa, bbbb );", + NoBinPacking); verifyFormat( "bool aaaaaaaaaaaaa = /* comment: */ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ||\n" " aaaaaaaaaaaaaaaaaaaaaaaaaaaa;"); - EXPECT_EQ( + verifyFormat( "bool aaaaaaaaaaaaa = /* trailing comment */\n" " aaaaaaaaaaaaaaaaaaaaaaaaaaa || aaaaaaaaaaaaaaaaaaaaaaaaa ||\n" " aaaaaaaaaaaaaaaaaaaaaaaaaaaa || aaaaaaaaaaaaaaaaaaaaaaaaaa;", - format( - "bool aaaaaaaaaaaaa = /* trailing 
comment */\n" - " aaaaaaaaaaaaaaaaaaaaaaaaaaa||aaaaaaaaaaaaaaaaaaaaaaaaa ||\n" - " aaaaaaaaaaaaaaaaaaaaaaaaaaaa || aaaaaaaaaaaaaaaaaaaaaaaaaa;")); - EXPECT_EQ( - "int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa; /* comment */\n" - "int bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb; /* comment */\n" - "int cccccccccccccccccccccccccccccc; /* comment */", - format("int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa; /* comment */\n" - "int bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb; /* comment */\n" - "int cccccccccccccccccccccccccccccc; /* comment */")); + "bool aaaaaaaaaaaaa = /* trailing comment */\n" + " aaaaaaaaaaaaaaaaaaaaaaaaaaa||aaaaaaaaaaaaaaaaaaaaaaaaa ||\n" + " aaaaaaaaaaaaaaaaaaaaaaaaaaaa || aaaaaaaaaaaaaaaaaaaaaaaaaa;"); + verifyFormat("int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa; /* comment */\n" + "int bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb; /* comment */\n" + "int cccccccccccccccccccccccccccccc; /* comment */", + "int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa; /* comment */\n" + "int bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb; /* comment */\n" + "int cccccccccccccccccccccccccccccc; /* comment */"); verifyFormat("void f(int * /* unused */) {}"); - EXPECT_EQ("/*\n" - " **\n" - " */", - format("/*\n" - " **\n" - " */")); - EXPECT_EQ("/*\n" - " *q\n" - " */", - format("/*\n" - " *q\n" - " */")); - EXPECT_EQ("/*\n" - " * q\n" - " */", - format("/*\n" - " * q\n" - " */")); - EXPECT_EQ("/*\n" - " **/", - format("/*\n" - " **/")); - EXPECT_EQ("/*\n" - " ***/", - format("/*\n" - " ***/")); + verifyFormat("/*\n" + " **\n" + " */", + "/*\n" + " **\n" + " */"); + verifyFormat("/*\n" + " *q\n" + " */", + "/*\n" + " *q\n" + " */"); + verifyFormat("/*\n" + " * q\n" + " */", + "/*\n" + " * q\n" + " */"); + verifyFormat("/*\n" + " **/", + "/*\n" + " **/"); + verifyFormat("/*\n" + " ***/", + "/*\n" + " ***/"); } TEST_F(FormatTestComments, BlockCommentsInMacros) { - EXPECT_EQ("#define A \\\n" - " { \\\n" - " /* one line */ \\\n" - " someCall();", - format("#define A { \\\n" - " /* one line */ \\\n" - " someCall();", - 
getLLVMStyleWithColumns(20))); - EXPECT_EQ("#define A \\\n" - " { \\\n" - " /* previous */ \\\n" - " /* one line */ \\\n" - " someCall();", - format("#define A { \\\n" - " /* previous */ \\\n" - " /* one line */ \\\n" - " someCall();", - getLLVMStyleWithColumns(20))); + const auto Style20 = getLLVMStyleWithColumns(20); + + verifyFormat("#define A \\\n" + " { \\\n" + " /* one line */ \\\n" + " someCall();", + "#define A { \\\n" + " /* one line */ \\\n" + " someCall();", + Style20); + verifyFormat("#define A \\\n" + " { \\\n" + " /* previous */ \\\n" + " /* one line */ \\\n" + " someCall();", + "#define A { \\\n" + " /* previous */ \\\n" + " /* one line */ \\\n" + " someCall();", + Style20); } TEST_F(FormatTestComments, BlockCommentsAtEndOfLine) { - EXPECT_EQ("a = {\n" - " 1111 /* */\n" - "};", - format("a = {1111 /* */\n" - "};", - getLLVMStyleWithColumns(15))); - EXPECT_EQ("a = {\n" - " 1111 /* */\n" - "};", - format("a = {1111 /* */\n" - "};", - getLLVMStyleWithColumns(15))); - EXPECT_EQ("a = {\n" - " 1111 /* a\n" - " */\n" - "};", - format("a = {1111 /* a */\n" - "};", - getLLVMStyleWithColumns(15))); + const auto Style15 = getLLVMStyleWithColumns(15); + + verifyFormat("a = {\n" + " 1111 /* */\n" + "};", + "a = {1111 /* */\n" + "};", + Style15); + verifyFormat("a = {\n" + " 1111 /* */\n" + "};", + "a = {1111 /* */\n" + "};", + Style15); + verifyFormat("a = {\n" + " 1111 /* a\n" + " */\n" + "};", + "a = {1111 /* a */\n" + "};", + Style15); } TEST_F(FormatTestComments, BreaksAfterMultilineBlockCommentsInParamLists) { - EXPECT_EQ("a = f(/* long\n" - " long */\n" - " a);", - format("a = f(/* long long */ a);", getLLVMStyleWithColumns(16))); - EXPECT_EQ("a = f(\n" - " /* long\n" - " long */\n" - " a);", - format("a = f(/* long long */ a);", getLLVMStyleWithColumns(15))); - - EXPECT_EQ("a = f(/* long\n" - " long\n" - " */\n" - " a);", - format("a = f(/* long\n" - " long\n" - " */a);", - getLLVMStyleWithColumns(16))); - - EXPECT_EQ("a = f(/* long\n" - " long\n" - " 
*/\n" - " a);", - format("a = f(/* long\n" - " long\n" - " */ a);", - getLLVMStyleWithColumns(16))); - - EXPECT_EQ("a = f(/* long\n" - " long\n" - " */\n" - " (1 + 1));", - format("a = f(/* long\n" - " long\n" - " */ (1 + 1));", - getLLVMStyleWithColumns(16))); - - EXPECT_EQ( - "a = f(a,\n" - " /* long\n" - " long */\n" - " b);", - format("a = f(a, /* long long */ b);", getLLVMStyleWithColumns(16))); - - EXPECT_EQ( - "a = f(\n" - " a,\n" - " /* long\n" - " long */\n" - " b);", - format("a = f(a, /* long long */ b);", getLLVMStyleWithColumns(15))); - - EXPECT_EQ("a = f(a,\n" - " /* long\n" - " long */\n" - " (1 + 1));", - format("a = f(a, /* long long */ (1 + 1));", - getLLVMStyleWithColumns(16))); - EXPECT_EQ("a = f(\n" - " a,\n" - " /* long\n" - " long */\n" - " (1 + 1));", - format("a = f(a, /* long long */ (1 + 1));", - getLLVMStyleWithColumns(15))); + const auto Style15 = getLLVMStyleWithColumns(15); + const auto Style16 = getLLVMStyleWithColumns(16); + + verifyFormat("a = f(/* long\n" + " long */\n" + " a);", + "a = f(/* long long */ a);", Style16); + verifyFormat("a = f(\n" + " /* long\n" + " long */\n" + " a);", + "a = f(/* long long */ a);", Style15); + + verifyFormat("a = f(/* long\n" + " long\n" + " */\n" + " a);", + "a = f(/* long\n" + " long\n" + " */a);", + Style16); + + verifyFormat("a = f(/* long\n" + " long\n" + " */\n" + " a);", + "a = f(/* long\n" + " long\n" + " */ a);", + Style16); + + verifyFormat("a = f(/* long\n" + " long\n" + " */\n" + " (1 + 1));", + "a = f(/* long\n" + " long\n" + " */ (1 + 1));", + Style16); + + verifyFormat("a = f(a,\n" + " /* long\n" + " long */\n" + " b);", + "a = f(a, /* long long */ b);", Style16); + + verifyFormat("a = f(\n" + " a,\n" + " /* long\n" + " long */\n" + " b);", + "a = f(a, /* long long */ b);", Style15); + + verifyFormat("a = f(a,\n" + " /* long\n" + " long */\n" + " (1 + 1));", + "a = f(a, /* long long */ (1 + 1));", Style16); + verifyFormat("a = f(\n" + " a,\n" + " /* long\n" + " long */\n" + " (1 + 
1));", + "a = f(a, /* long long */ (1 + 1));", Style15); } TEST_F(FormatTestComments, IndentLineCommentsInStartOfBlockAtEndOfFile) { @@ -2698,229 +2695,224 @@ TEST_F(FormatTestComments, IndentLineCommentsInStartOfBlockAtEndOfFile) { } TEST_F(FormatTestComments, AlignTrailingComments) { - EXPECT_EQ("#define MACRO(V) \\\n" - " V(Rt2) /* one more char */ \\\n" - " V(Rs) /* than here */ \\\n" - "/* comment 3 */\n", - format("#define MACRO(V)\\\n" - "V(Rt2) /* one more char */ \\\n" - "V(Rs) /* than here */ \\\n" - "/* comment 3 */\n", - getLLVMStyleWithColumns(40))); - EXPECT_EQ("int i = f(abc, // line 1\n" - " d, // line 2\n" - " // line 3\n" - " b);", - format("int i = f(abc, // line 1\n" - " d, // line 2\n" - " // line 3\n" - " b);", - getLLVMStyleWithColumns(40))); + const auto Style15 = getLLVMStyleWithColumns(15); + const auto Style40 = getLLVMStyleWithColumns(40); + + verifyFormat("#define MACRO(V) \\\n" + " V(Rt2) /* one more char */ \\\n" + " V(Rs) /* than here */ \\\n" + "/* comment 3 */\n", + "#define MACRO(V)\\\n" + "V(Rt2) /* one more char */ \\\n" + "V(Rs) /* than here */ \\\n" + "/* comment 3 */\n", + Style40); + verifyFormat("int i = f(abc, // line 1\n" + " d, // line 2\n" + " // line 3\n" + " b);", + "int i = f(abc, // line 1\n" + " d, // line 2\n" + " // line 3\n" + " b);", + Style40); // Align newly broken trailing comments. 
- EXPECT_EQ("int ab; // line\n" - "int a; // long\n" - " // long", - format("int ab; // line\n" - "int a; // long long", - getLLVMStyleWithColumns(15))); - EXPECT_EQ("int ab; // line\n" - "int a; // long\n" - " // long\n" - " // long", - format("int ab; // line\n" - "int a; // long long\n" - " // long", - getLLVMStyleWithColumns(15))); - EXPECT_EQ("int ab; // line\n" - "int a; // long\n" - " // long\n" - "pt c; // long", - format("int ab; // line\n" - "int a; // long long\n" - "pt c; // long", - getLLVMStyleWithColumns(15))); - EXPECT_EQ("int ab; // line\n" - "int a; // long\n" - " // long\n" - "\n" - "// long", - format("int ab; // line\n" - "int a; // long long\n" - "\n" - "// long", - getLLVMStyleWithColumns(15))); + verifyFormat("int ab; // line\n" + "int a; // long\n" + " // long", + "int ab; // line\n" + "int a; // long long", + Style15); + verifyFormat("int ab; // line\n" + "int a; // long\n" + " // long\n" + " // long", + "int ab; // line\n" + "int a; // long long\n" + " // long", + Style15); + verifyFormat("int ab; // line\n" + "int a; // long\n" + " // long\n" + "pt c; // long", + "int ab; // line\n" + "int a; // long long\n" + "pt c; // long", + Style15); + verifyFormat("int ab; // line\n" + "int a; // long\n" + " // long\n" + "\n" + "// long", + "int ab; // line\n" + "int a; // long long\n" + "\n" + "// long", + Style15); // Don't align newly broken trailing comments if that would put them over the // column limit. 
- EXPECT_EQ("int i, j; // line 1\n" - "int k; // line longg\n" - " // long", - format("int i, j; // line 1\n" - "int k; // line longg long", - getLLVMStyleWithColumns(20))); + verifyFormat("int i, j; // line 1\n" + "int k; // line longg\n" + " // long", + "int i, j; // line 1\n" + "int k; // line longg long", + getLLVMStyleWithColumns(20)); // Always align if ColumnLimit = 0 - EXPECT_EQ("int i, j; // line 1\n" - "int k; // line longg long", - format("int i, j; // line 1\n" - "int k; // line longg long", - getLLVMStyleWithColumns(0))); + verifyFormat("int i, j; // line 1\n" + "int k; // line longg long", + "int i, j; // line 1\n" + "int k; // line longg long", + getLLVMStyleWithColumns(0)); // Align comment line sections aligned with the next token with the next // token. - EXPECT_EQ("class A {\n" - "public: // public comment\n" - " // comment about a\n" - " int a;\n" - "};", - format("class A {\n" - "public: // public comment\n" - " // comment about a\n" - " int a;\n" - "};", - getLLVMStyleWithColumns(40))); - EXPECT_EQ("class A {\n" - "public: // public comment 1\n" - " // public comment 2\n" - " // comment 1 about a\n" - " // comment 2 about a\n" - " int a;\n" - "};", - format("class A {\n" - "public: // public comment 1\n" - " // public comment 2\n" - " // comment 1 about a\n" - " // comment 2 about a\n" - " int a;\n" - "};", - getLLVMStyleWithColumns(40))); - EXPECT_EQ("int f(int n) { // comment line 1 on f\n" - " // comment line 2 on f\n" - " // comment line 1 before return\n" - " // comment line 2 before return\n" - " return n; // comment line 1 on return\n" - " // comment line 2 on return\n" - " // comment line 1 after return\n" - "}", - format("int f(int n) { // comment line 1 on f\n" - " // comment line 2 on f\n" - " // comment line 1 before return\n" - " // comment line 2 before return\n" - " return n; // comment line 1 on return\n" - " // comment line 2 on return\n" - " // comment line 1 after return\n" - "}", - getLLVMStyleWithColumns(40))); - 
EXPECT_EQ("int f(int n) {\n" - " switch (n) { // comment line 1 on switch\n" - " // comment line 2 on switch\n" - " // comment line 1 before case 1\n" - " // comment line 2 before case 1\n" - " case 1: // comment line 1 on case 1\n" - " // comment line 2 on case 1\n" - " // comment line 1 before return 1\n" - " // comment line 2 before return 1\n" - " return 1; // comment line 1 on return 1\n" - " // comment line 2 on return 1\n" - " // comment line 1 before default\n" - " // comment line 2 before default\n" - " default: // comment line 1 on default\n" - " // comment line 2 on default\n" - " // comment line 1 before return 2\n" - " return 2 * f(n - 1); // comment line 1 on return 2\n" - " // comment line 2 on return 2\n" - " // comment line 1 after return\n" - " // comment line 2 after return\n" - " }\n" - "}", - format("int f(int n) {\n" - " switch (n) { // comment line 1 on switch\n" - " // comment line 2 on switch\n" - " // comment line 1 before case 1\n" - " // comment line 2 before case 1\n" - " case 1: // comment line 1 on case 1\n" - " // comment line 2 on case 1\n" - " // comment line 1 before return 1\n" - " // comment line 2 before return 1\n" - " return 1; // comment line 1 on return 1\n" - " // comment line 2 on return 1\n" - " // comment line 1 before default\n" - " // comment line 2 before default\n" - " default: // comment line 1 on default\n" - " // comment line 2 on default\n" - " // comment line 1 before return 2\n" - " return 2 * f(n - 1); // comment line 1 on return 2\n" - " // comment line 2 on return 2\n" - " // comment line 1 after return\n" - " // comment line 2 after return\n" - " }\n" - "}", - getLLVMStyleWithColumns(80))); + verifyFormat("class A {\n" + "public: // public comment\n" + " // comment about a\n" + " int a;\n" + "};", + "class A {\n" + "public: // public comment\n" + " // comment about a\n" + " int a;\n" + "};", + Style40); + verifyFormat("class A {\n" + "public: // public comment 1\n" + " // public comment 2\n" + " // comment 
1 about a\n" + " // comment 2 about a\n" + " int a;\n" + "};", + "class A {\n" + "public: // public comment 1\n" + " // public comment 2\n" + " // comment 1 about a\n" + " // comment 2 about a\n" + " int a;\n" + "};", + Style40); + verifyFormat("int f(int n) { // comment line 1 on f\n" + " // comment line 2 on f\n" + " // comment line 1 before return\n" + " // comment line 2 before return\n" + " return n; // comment line 1 on return\n" + " // comment line 2 on return\n" + " // comment line 1 after return\n" + "}", + "int f(int n) { // comment line 1 on f\n" + " // comment line 2 on f\n" + " // comment line 1 before return\n" + " // comment line 2 before return\n" + " return n; // comment line 1 on return\n" + " // comment line 2 on return\n" + " // comment line 1 after return\n" + "}", + Style40); + verifyFormat("int f(int n) {\n" + " switch (n) { // comment line 1 on switch\n" + " // comment line 2 on switch\n" + " // comment line 1 before case 1\n" + " // comment line 2 before case 1\n" + " case 1: // comment line 1 on case 1\n" + " // comment line 2 on case 1\n" + " // comment line 1 before return 1\n" + " // comment line 2 before return 1\n" + " return 1; // comment line 1 on return 1\n" + " // comment line 2 on return 1\n" + " // comment line 1 before default\n" + " // comment line 2 before default\n" + " default: // comment line 1 on default\n" + " // comment line 2 on default\n" + " // comment line 1 before return 2\n" + " return 2 * f(n - 1); // comment line 1 on return 2\n" + " // comment line 2 on return 2\n" + " // comment line 1 after return\n" + " // comment line 2 after return\n" + " }\n" + "}", + "int f(int n) {\n" + " switch (n) { // comment line 1 on switch\n" + " // comment line 2 on switch\n" + " // comment line 1 before case 1\n" + " // comment line 2 before case 1\n" + " case 1: // comment line 1 on case 1\n" + " // comment line 2 on case 1\n" + " // comment line 1 before return 1\n" + " // comment line 2 before return 1\n" + " return 1; // 
comment line 1 on return 1\n" + " // comment line 2 on return 1\n" + " // comment line 1 before default\n" + " // comment line 2 before default\n" + " default: // comment line 1 on default\n" + " // comment line 2 on default\n" + " // comment line 1 before return 2\n" + " return 2 * f(n - 1); // comment line 1 on return 2\n" + " // comment line 2 on return 2\n" + " // comment line 1 after return\n" + " // comment line 2 after return\n" + " }\n" + "}"); // If all the lines in a sequence of line comments are aligned with the next // token, the first line belongs to the previous token and the other lines // belong to the next token. - EXPECT_EQ("int a; // line about a\n" - "long b;", - format("int a; // line about a\n" - " long b;", - getLLVMStyleWithColumns(80))); - EXPECT_EQ("int a; // line about a\n" - "// line about b\n" - "long b;", - format("int a; // line about a\n" - " // line about b\n" - " long b;", - getLLVMStyleWithColumns(80))); - EXPECT_EQ("int a; // line about a\n" - "// line 1 about b\n" - "// line 2 about b\n" - "long b;", - format("int a; // line about a\n" - " // line 1 about b\n" - " // line 2 about b\n" - " long b;", - getLLVMStyleWithColumns(80))); + verifyFormat("int a; // line about a\n" + "long b;", + "int a; // line about a\n" + " long b;"); + verifyFormat("int a; // line about a\n" + "// line about b\n" + "long b;", + "int a; // line about a\n" + " // line about b\n" + " long b;"); + verifyFormat("int a; // line about a\n" + "// line 1 about b\n" + "// line 2 about b\n" + "long b;", + "int a; // line about a\n" + " // line 1 about b\n" + " // line 2 about b\n" + " long b;"); // Checks an edge case in preprocessor handling. 
// These comments should *not* be aligned - EXPECT_EQ( - "#if FOO\n" - "#else\n" - "long a; // Line about a\n" - "#endif\n" - "#if BAR\n" - "#else\n" - "long b_long_name; // Line about b\n" - "#endif", - format("#if FOO\n" - "#else\n" - "long a; // Line about a\n" // Previous (bad) behavior - "#endif\n" - "#if BAR\n" - "#else\n" - "long b_long_name; // Line about b\n" - "#endif", - getLLVMStyleWithColumns(80))); + verifyFormat("#if FOO\n" + "#else\n" + "long a; // Line about a\n" + "#endif\n" + "#if BAR\n" + "#else\n" + "long b_long_name; // Line about b\n" + "#endif", + "#if FOO\n" + "#else\n" + "long a; // Line about a\n" // Previous (bad) behavior + "#endif\n" + "#if BAR\n" + "#else\n" + "long b_long_name; // Line about b\n" + "#endif"); // bug 47589 - EXPECT_EQ( - "namespace m {\n\n" - "#define FOO_GLOBAL 0 // Global scope.\n" - "#define FOO_LINKLOCAL 1 // Link-local scope.\n" - "#define FOO_SITELOCAL 2 // Site-local scope (deprecated).\n" - "#define FOO_UNIQUELOCAL 3 // Unique local\n" - "#define FOO_NODELOCAL 4 // Loopback\n\n" - "} // namespace m", - format("namespace m {\n\n" - "#define FOO_GLOBAL 0 // Global scope.\n" - "#define FOO_LINKLOCAL 1 // Link-local scope.\n" - "#define FOO_SITELOCAL 2 // Site-local scope (deprecated).\n" - "#define FOO_UNIQUELOCAL 3 // Unique local\n" - "#define FOO_NODELOCAL 4 // Loopback\n\n" - "} // namespace m", - getLLVMStyleWithColumns(80))); + verifyFormat("namespace m {\n\n" + "#define FOO_GLOBAL 0 // Global scope.\n" + "#define FOO_LINKLOCAL 1 // Link-local scope.\n" + "#define FOO_SITELOCAL 2 // Site-local scope (deprecated).\n" + "#define FOO_UNIQUELOCAL 3 // Unique local\n" + "#define FOO_NODELOCAL 4 // Loopback\n\n" + "} // namespace m", + "namespace m {\n\n" + "#define FOO_GLOBAL 0 // Global scope.\n" + "#define FOO_LINKLOCAL 1 // Link-local scope.\n" + "#define FOO_SITELOCAL 2 // Site-local scope (deprecated).\n" + "#define FOO_UNIQUELOCAL 3 // Unique local\n" + "#define FOO_NODELOCAL 4 // Loopback\n\n" + "} // 
namespace m"); // https://llvm.org/PR53441 verifyFormat("/* */ //\n" @@ -2980,193 +2972,193 @@ TEST_F(FormatTestComments, AlignTrailingCommentsAcrossEmptyLines) { Style.AlignTrailingComments.OverEmptyLines = 2; // Cannot use verifyFormat here // test::messUp removes all new lines which changes the logic - EXPECT_EQ("#include \"a.h\" // comment\n" - "\n" - "\n" - "\n" - "#include \"ab.h\" // comment\n" - "\n" - "\n" - "#include \"abcdefg.h\" // comment", - format("#include \"a.h\" // comment\n" - "\n" - "\n" - "\n" - "#include \"ab.h\" // comment\n" - "\n" - "\n" - "#include \"abcdefg.h\" // comment", - Style)); + verifyFormat("#include \"a.h\" // comment\n" + "\n" + "\n" + "\n" + "#include \"ab.h\" // comment\n" + "\n" + "\n" + "#include \"abcdefg.h\" // comment", + "#include \"a.h\" // comment\n" + "\n" + "\n" + "\n" + "#include \"ab.h\" // comment\n" + "\n" + "\n" + "#include \"abcdefg.h\" // comment", + Style); Style.MaxEmptyLinesToKeep = 1; Style.AlignTrailingComments.OverEmptyLines = 1; // End of testing OverEmptyLines Style.ColumnLimit = 15; - EXPECT_EQ("int ab; // line\n" - "int a; // long\n" - " // long\n" - "\n" - " // long", - format("int ab; // line\n" - "int a; // long long\n" - "\n" - "// long", - Style)); + verifyFormat("int ab; // line\n" + "int a; // long\n" + " // long\n" + "\n" + " // long", + "int ab; // line\n" + "int a; // long long\n" + "\n" + "// long", + Style); Style.ColumnLimit = 15; - EXPECT_EQ("int ab; // line\n" - "\n" - "int a; // long\n" - " // long", - format("int ab; // line\n" - "\n" - "int a; // long long", - Style)); + verifyFormat("int ab; // line\n" + "\n" + "int a; // long\n" + " // long", + "int ab; // line\n" + "\n" + "int a; // long long", + Style); Style.ColumnLimit = 30; - EXPECT_EQ("int foo = 12345; // comment\n" - "int bar =\n" - " 1234; // This is a very\n" - " // long comment\n" - " // which is wrapped\n" - " // arround.\n" - "\n" - "int x = 2; // Is this still\n" - " // aligned?", - format("int foo = 12345; // 
comment\n" - "int bar = 1234; // This is a very long comment\n" - " // which is wrapped arround.\n" - "\n" - "int x = 2; // Is this still aligned?", - Style)); + verifyFormat("int foo = 12345; // comment\n" + "int bar =\n" + " 1234; // This is a very\n" + " // long comment\n" + " // which is wrapped\n" + " // arround.\n" + "\n" + "int x = 2; // Is this still\n" + " // aligned?", + "int foo = 12345; // comment\n" + "int bar = 1234; // This is a very long comment\n" + " // which is wrapped arround.\n" + "\n" + "int x = 2; // Is this still aligned?", + Style); Style.ColumnLimit = 35; - EXPECT_EQ("int foo = 12345; // comment\n" - "int bar =\n" - " 1234; // This is a very long\n" - " // comment which is\n" - " // wrapped arround.\n" - "\n" - "int x =\n" - " 2; // Is this still aligned?", - format("int foo = 12345; // comment\n" - "int bar = 1234; // This is a very long comment\n" - " // which is wrapped arround.\n" - "\n" - "int x = 2; // Is this still aligned?", - Style)); + verifyFormat("int foo = 12345; // comment\n" + "int bar =\n" + " 1234; // This is a very long\n" + " // comment which is\n" + " // wrapped arround.\n" + "\n" + "int x =\n" + " 2; // Is this still aligned?", + "int foo = 12345; // comment\n" + "int bar = 1234; // This is a very long comment\n" + " // which is wrapped arround.\n" + "\n" + "int x = 2; // Is this still aligned?", + Style); Style.ColumnLimit = 40; - EXPECT_EQ("int foo = 12345; // comment\n" - "int bar =\n" - " 1234; // This is a very long comment\n" - " // which is wrapped arround.\n" - "\n" - "int x = 2; // Is this still aligned?", - format("int foo = 12345; // comment\n" - "int bar = 1234; // This is a very long comment\n" - " // which is wrapped arround.\n" - "\n" - "int x = 2; // Is this still aligned?", - Style)); + verifyFormat("int foo = 12345; // comment\n" + "int bar =\n" + " 1234; // This is a very long comment\n" + " // which is wrapped arround.\n" + "\n" + "int x = 2; // Is this still aligned?", + "int foo = 12345; // 
comment\n" + "int bar = 1234; // This is a very long comment\n" + " // which is wrapped arround.\n" + "\n" + "int x = 2; // Is this still aligned?", + Style); Style.ColumnLimit = 45; - EXPECT_EQ("int foo = 12345; // comment\n" - "int bar =\n" - " 1234; // This is a very long comment\n" - " // which is wrapped arround.\n" - "\n" - "int x = 2; // Is this still aligned?", - format("int foo = 12345; // comment\n" - "int bar = 1234; // This is a very long comment\n" - " // which is wrapped arround.\n" - "\n" - "int x = 2; // Is this still aligned?", - Style)); + verifyFormat("int foo = 12345; // comment\n" + "int bar =\n" + " 1234; // This is a very long comment\n" + " // which is wrapped arround.\n" + "\n" + "int x = 2; // Is this still aligned?", + "int foo = 12345; // comment\n" + "int bar = 1234; // This is a very long comment\n" + " // which is wrapped arround.\n" + "\n" + "int x = 2; // Is this still aligned?", + Style); Style.ColumnLimit = 80; - EXPECT_EQ("int a; // line about a\n" - "\n" - "// line about b\n" - "long b;", - format("int a; // line about a\n" - "\n" - " // line about b\n" - " long b;", - Style)); + verifyFormat("int a; // line about a\n" + "\n" + "// line about b\n" + "long b;", + "int a; // line about a\n" + "\n" + " // line about b\n" + " long b;", + Style); Style.ColumnLimit = 80; - EXPECT_EQ("int a; // line about a\n" - "\n" - "// line 1 about b\n" - "// line 2 about b\n" - "long b;", - format("int a; // line about a\n" - "\n" - " // line 1 about b\n" - " // line 2 about b\n" - " long b;", - Style)); + verifyFormat("int a; // line about a\n" + "\n" + "// line 1 about b\n" + "// line 2 about b\n" + "long b;", + "int a; // line about a\n" + "\n" + " // line 1 about b\n" + " // line 2 about b\n" + " long b;", + Style); } -TEST_F(FormatTestComments, AlignTrailingCommentsLeave) { - FormatStyle Style = getLLVMStyle(); - Style.AlignTrailingComments.Kind = FormatStyle::TCAS_Leave; +TEST_F(FormatTestComments, AlignTrailingCommentsLeave) { + FormatStyle 
Style = getLLVMStyle(); + Style.AlignTrailingComments.Kind = FormatStyle::TCAS_Leave; + + verifyFormat("int a;// do not touch\n" + "int b; // any comments\n" + "int c; // comment\n" + "int d; // comment", + "int a;// do not touch\n" + "int b; // any comments\n" + "int c; // comment\n" + "int d; // comment", + Style); + + verifyFormat("int a; // do not touch\n" + "int b; // any comments\n" + "int c; // comment\n" + "int d;// comment", + "int a; // do not touch\n" + "int b; // any comments\n" + "int c; // comment\n" + "int d;// comment", + Style); + + verifyFormat("// do not touch\n" + "int a; // any comments\n" + "\n" + " // comment\n" + "// comment\n" + "\n" + "// comment", + "// do not touch\n" + "int a; // any comments\n" + "\n" + " // comment\n" + "// comment\n" + "\n" + "// comment", + Style); - EXPECT_EQ("int a;// do not touch\n" - "int b; // any comments\n" - "int c; // comment\n" - "int d; // comment", - format("int a;// do not touch\n" - "int b; // any comments\n" - "int c; // comment\n" - "int d; // comment", - Style)); - - EXPECT_EQ("int a; // do not touch\n" - "int b; // any comments\n" - "int c; // comment\n" - "int d;// comment", - format("int a; // do not touch\n" - "int b; // any comments\n" - "int c; // comment\n" - "int d;// comment", - Style)); - - EXPECT_EQ("// do not touch\n" - "int a; // any comments\n" - "\n" - " // comment\n" - "// comment\n" - "\n" - "// comment", - format("// do not touch\n" - "int a; // any comments\n" - "\n" - " // comment\n" - "// comment\n" - "\n" - "// comment", - Style)); - - EXPECT_EQ("// do not touch\n" - "int a; // any comments\n" - "\n" - " // comment\n" - "// comment\n" - "\n" - "// comment", - format("// do not touch\n" - "int a; // any comments\n" - "\n" - "\n" - " // comment\n" - "// comment\n" - "\n" - "\n" - "// comment", - Style)); + verifyFormat("// do not touch\n" + "int a; // any comments\n" + "\n" + " // comment\n" + "// comment\n" + "\n" + "// comment", + "// do not touch\n" + "int a; // any 
comments\n" + "\n" + "\n" + " // comment\n" + "// comment\n" + "\n" + "\n" + "// comment", + Style); verifyFormat("namespace ns {\n" "int i;\n" @@ -3186,36 +3178,36 @@ TEST_F(FormatTestComments, AlignTrailingCommentsLeave) { // Allow to keep 2 empty lines Style.MaxEmptyLinesToKeep = 2; - EXPECT_EQ("// do not touch\n" - "int a; // any comments\n" - "\n" - "\n" - " // comment\n" - "// comment\n" - "\n" - "// comment", - format("// do not touch\n" - "int a; // any comments\n" - "\n" - "\n" - " // comment\n" - "// comment\n" - "\n" - "// comment", - Style)); + verifyFormat("// do not touch\n" + "int a; // any comments\n" + "\n" + "\n" + " // comment\n" + "// comment\n" + "\n" + "// comment", + "// do not touch\n" + "int a; // any comments\n" + "\n" + "\n" + " // comment\n" + "// comment\n" + "\n" + "// comment", + Style); Style.MaxEmptyLinesToKeep = 1; // Just format comments normally when leaving exceeds the column limit Style.ColumnLimit = 35; - EXPECT_EQ("int foo = 12345; // comment\n" - "int bar =\n" - " 1234; // This is a very long\n" - " // comment which is\n" - " // wrapped arround.", - format("int foo = 12345; // comment\n" - "int bar = 1234; // This is a very long comment\n" - " // which is wrapped arround.", - Style)); + verifyFormat("int foo = 12345; // comment\n" + "int bar =\n" + " 1234; // This is a very long\n" + " // comment which is\n" + " // wrapped arround.", + "int foo = 12345; // comment\n" + "int bar = 1234; // This is a very long comment\n" + " // which is wrapped arround.", + Style); Style = getLLVMStyle(); Style.AlignTrailingComments.Kind = FormatStyle::TCAS_Leave; @@ -3502,171 +3494,171 @@ TEST_F(FormatTestComments, DontAlignOverScope) { } TEST_F(FormatTestComments, AlignsBlockCommentDecorations) { - EXPECT_EQ("/*\n" - " */", - format("/*\n" - "*/")); - EXPECT_EQ("/*\n" - " */", - format("/*\n" - " */")); - EXPECT_EQ("/*\n" - " */", - format("/*\n" - " */")); + verifyFormat("/*\n" + " */", + "/*\n" + "*/"); + verifyFormat("/*\n" + " */", + 
"/*\n" + " */"); + verifyFormat("/*\n" + " */", + "/*\n" + " */"); // Align a single line. - EXPECT_EQ("/*\n" - " * line */", - format("/*\n" - "* line */")); - EXPECT_EQ("/*\n" - " * line */", - format("/*\n" - " * line */")); - EXPECT_EQ("/*\n" - " * line */", - format("/*\n" - " * line */")); - EXPECT_EQ("/*\n" - " * line */", - format("/*\n" - " * line */")); - EXPECT_EQ("/**\n" - " * line */", - format("/**\n" - "* line */")); - EXPECT_EQ("/**\n" - " * line */", - format("/**\n" - " * line */")); - EXPECT_EQ("/**\n" - " * line */", - format("/**\n" - " * line */")); - EXPECT_EQ("/**\n" - " * line */", - format("/**\n" - " * line */")); - EXPECT_EQ("/**\n" - " * line */", - format("/**\n" - " * line */")); + verifyFormat("/*\n" + " * line */", + "/*\n" + "* line */"); + verifyFormat("/*\n" + " * line */", + "/*\n" + " * line */"); + verifyFormat("/*\n" + " * line */", + "/*\n" + " * line */"); + verifyFormat("/*\n" + " * line */", + "/*\n" + " * line */"); + verifyFormat("/**\n" + " * line */", + "/**\n" + "* line */"); + verifyFormat("/**\n" + " * line */", + "/**\n" + " * line */"); + verifyFormat("/**\n" + " * line */", + "/**\n" + " * line */"); + verifyFormat("/**\n" + " * line */", + "/**\n" + " * line */"); + verifyFormat("/**\n" + " * line */", + "/**\n" + " * line */"); // Align the end '*/' after a line. - EXPECT_EQ("/*\n" - " * line\n" - " */", - format("/*\n" - "* line\n" - "*/")); - EXPECT_EQ("/*\n" - " * line\n" - " */", - format("/*\n" - " * line\n" - " */")); - EXPECT_EQ("/*\n" - " * line\n" - " */", - format("/*\n" - " * line\n" - " */")); + verifyFormat("/*\n" + " * line\n" + " */", + "/*\n" + "* line\n" + "*/"); + verifyFormat("/*\n" + " * line\n" + " */", + "/*\n" + " * line\n" + " */"); + verifyFormat("/*\n" + " * line\n" + " */", + "/*\n" + " * line\n" + " */"); // Align two lines. 
- EXPECT_EQ("/* line 1\n" - " * line 2 */", - format("/* line 1\n" - " * line 2 */")); - EXPECT_EQ("/* line 1\n" - " * line 2 */", - format("/* line 1\n" - "* line 2 */")); - EXPECT_EQ("/* line 1\n" - " * line 2 */", - format("/* line 1\n" - " * line 2 */")); - EXPECT_EQ("/* line 1\n" - " * line 2 */", - format("/* line 1\n" - " * line 2 */")); - EXPECT_EQ("/* line 1\n" - " * line 2 */", - format("/* line 1\n" - " * line 2 */")); - EXPECT_EQ("int i; /* line 1\n" - " * line 2 */", - format("int i; /* line 1\n" - "* line 2 */")); - EXPECT_EQ("int i; /* line 1\n" - " * line 2 */", - format("int i; /* line 1\n" - " * line 2 */")); - EXPECT_EQ("int i; /* line 1\n" - " * line 2 */", - format("int i; /* line 1\n" - " * line 2 */")); + verifyFormat("/* line 1\n" + " * line 2 */", + "/* line 1\n" + " * line 2 */"); + verifyFormat("/* line 1\n" + " * line 2 */", + "/* line 1\n" + "* line 2 */"); + verifyFormat("/* line 1\n" + " * line 2 */", + "/* line 1\n" + " * line 2 */"); + verifyFormat("/* line 1\n" + " * line 2 */", + "/* line 1\n" + " * line 2 */"); + verifyFormat("/* line 1\n" + " * line 2 */", + "/* line 1\n" + " * line 2 */"); + verifyFormat("int i; /* line 1\n" + " * line 2 */", + "int i; /* line 1\n" + "* line 2 */"); + verifyFormat("int i; /* line 1\n" + " * line 2 */", + "int i; /* line 1\n" + " * line 2 */"); + verifyFormat("int i; /* line 1\n" + " * line 2 */", + "int i; /* line 1\n" + " * line 2 */"); // Align several lines. 
- EXPECT_EQ("/* line 1\n" - " * line 2\n" - " * line 3 */", - format("/* line 1\n" - " * line 2\n" - "* line 3 */")); - EXPECT_EQ("/* line 1\n" - " * line 2\n" - " * line 3 */", - format("/* line 1\n" - " * line 2\n" - "* line 3 */")); - EXPECT_EQ("/*\n" - "** line 1\n" - "** line 2\n" - "*/", - format("/*\n" - "** line 1\n" - " ** line 2\n" - "*/")); + verifyFormat("/* line 1\n" + " * line 2\n" + " * line 3 */", + "/* line 1\n" + " * line 2\n" + "* line 3 */"); + verifyFormat("/* line 1\n" + " * line 2\n" + " * line 3 */", + "/* line 1\n" + " * line 2\n" + "* line 3 */"); + verifyFormat("/*\n" + "** line 1\n" + "** line 2\n" + "*/", + "/*\n" + "** line 1\n" + " ** line 2\n" + "*/"); // Align with different indent after the decorations. - EXPECT_EQ("/*\n" - " * line 1\n" - " * line 2\n" - " * line 3\n" - " * line 4\n" - " */", - format("/*\n" - "* line 1\n" - " * line 2\n" - " * line 3\n" - "* line 4\n" - "*/")); + verifyFormat("/*\n" + " * line 1\n" + " * line 2\n" + " * line 3\n" + " * line 4\n" + " */", + "/*\n" + "* line 1\n" + " * line 2\n" + " * line 3\n" + "* line 4\n" + "*/"); // Align empty or blank lines. - EXPECT_EQ("/**\n" - " *\n" - " *\n" - " *\n" - " */", - format("/**\n" - "* \n" - " * \n" - " *\n" - "*/")); + verifyFormat("/**\n" + " *\n" + " *\n" + " *\n" + " */", + "/**\n" + "* \n" + " * \n" + " *\n" + "*/"); // Align while breaking and reflowing. 
- EXPECT_EQ("/*\n" - " * long long long\n" - " * long long\n" - " *\n" - " * long */", - format("/*\n" - " * long long long long\n" - " * long\n" - " *\n" - "* long */", - getLLVMStyleWithColumns(20))); + verifyFormat("/*\n" + " * long long long\n" + " * long long\n" + " *\n" + " * long */", + "/*\n" + " * long long long long\n" + " * long\n" + " *\n" + "* long */", + getLLVMStyleWithColumns(20)); } TEST_F(FormatTestComments, NoCrash_Bug34236) { @@ -3674,110 +3666,110 @@ TEST_F(FormatTestComments, NoCrash_Bug34236) { // https://bugs.llvm.org/show_bug.cgi?id=34236 // Temporarily disable formatting for readability. // clang-format off - EXPECT_EQ( + verifyFormat( "/* */ /*\n" " * a\n" " * b c d*/", - format( "/* */ /*\n" " * a b\n" -" * c d*/", - getLLVMStyleWithColumns(80))); +" * c d*/"); // clang-format on } TEST_F(FormatTestComments, NonTrailingBlockComments) { - verifyFormat("const /** comment comment */ A = B;", - getLLVMStyleWithColumns(40)); + const auto Style40 = getLLVMStyleWithColumns(40); + + verifyFormat("const /** comment comment */ A = B;", Style40); verifyFormat("const /** comment comment comment */ A =\n" " B;", - getLLVMStyleWithColumns(40)); - - EXPECT_EQ("const /** comment comment comment\n" - " comment */\n" - " A = B;", - format("const /** comment comment comment comment */\n" - " A = B;", - getLLVMStyleWithColumns(40))); + Style40); + + verifyFormat("const /** comment comment comment\n" + " comment */\n" + " A = B;", + "const /** comment comment comment comment */\n" + " A = B;", + Style40); } TEST_F(FormatTestComments, PythonStyleComments) { + const auto ProtoStyle20 = getTextProtoStyleWithColumns(20); // Keeps a space after '#'. 
- EXPECT_EQ("# comment\n" - "key: value", - format("#comment\n" - "key:value", - getTextProtoStyleWithColumns(20))); - EXPECT_EQ("# comment\n" - "key: value", - format("# comment\n" - "key:value", - getTextProtoStyleWithColumns(20))); + verifyFormat("# comment\n" + "key: value", + "#comment\n" + "key:value", + ProtoStyle20); + verifyFormat("# comment\n" + "key: value", + "# comment\n" + "key:value", + ProtoStyle20); // Breaks long comment. - EXPECT_EQ("# comment comment\n" - "# comment\n" - "key: value", - format("# comment comment comment\n" - "key:value", - getTextProtoStyleWithColumns(20))); + verifyFormat("# comment comment\n" + "# comment\n" + "key: value", + "# comment comment comment\n" + "key:value", + ProtoStyle20); // Indents comments. - EXPECT_EQ("data {\n" - " # comment comment\n" - " # comment\n" - " key: value\n" - "}", - format("data {\n" - "# comment comment comment\n" - "key: value}", - getTextProtoStyleWithColumns(20))); - EXPECT_EQ("data {\n" - " # comment comment\n" - " # comment\n" - " key: value\n" - "}", - format("data {# comment comment comment\n" - "key: value}", - getTextProtoStyleWithColumns(20))); + verifyFormat("data {\n" + " # comment comment\n" + " # comment\n" + " key: value\n" + "}", + "data {\n" + "# comment comment comment\n" + "key: value}", + ProtoStyle20); + verifyFormat("data {\n" + " # comment comment\n" + " # comment\n" + " key: value\n" + "}", + "data {# comment comment comment\n" + "key: value}", + ProtoStyle20); // Reflows long comments. - EXPECT_EQ("# comment comment\n" - "# comment comment\n" - "key: value", - format("# comment comment comment\n" - "# comment\n" - "key:value", - getTextProtoStyleWithColumns(20))); + verifyFormat("# comment comment\n" + "# comment comment\n" + "key: value", + "# comment comment comment\n" + "# comment\n" + "key:value", + ProtoStyle20); // Breaks trailing comments. 
- EXPECT_EQ("k: val # comment\n" - " # comment\n" - "a: 1", - format("k:val#comment comment\n" - "a:1", - getTextProtoStyleWithColumns(20))); - EXPECT_EQ("id {\n" - " k: val # comment\n" - " # comment\n" - " # line line\n" - " a: 1\n" - "}", - format("id {k:val#comment comment\n" - "# line line\n" - "a:1}", - getTextProtoStyleWithColumns(20))); + verifyFormat("k: val # comment\n" + " # comment\n" + "a: 1", + "k:val#comment comment\n" + "a:1", + ProtoStyle20); + verifyFormat("id {\n" + " k: val # comment\n" + " # comment\n" + " # line line\n" + " a: 1\n" + "}", + "id {k:val#comment comment\n" + "# line line\n" + "a:1}", + ProtoStyle20); // Aligns trailing comments. - EXPECT_EQ("k: val # commen1\n" - " # commen2\n" - " # commen3\n" - "# commen4\n" - "a: 1 # commen5\n" - " # commen6\n" - " # commen7", - format("k:val#commen1 commen2\n" - " #commen3\n" - "# commen4\n" - "a:1#commen5 commen6\n" - " #commen7", - getTextProtoStyleWithColumns(20))); + verifyFormat("k: val # commen1\n" + " # commen2\n" + " # commen3\n" + "# commen4\n" + "a: 1 # commen5\n" + " # commen6\n" + " # commen7", + "k:val#commen1 commen2\n" + " #commen3\n" + "# commen4\n" + "a:1#commen5 commen6\n" + " #commen7", + ProtoStyle20); } TEST_F(FormatTestComments, BreaksBeforeTrailingUnbreakableSequence) { @@ -3791,16 +3783,15 @@ TEST_F(FormatTestComments, BreaksBeforeTrailingUnbreakableSequence) { TEST_F(FormatTestComments, ReflowBackslashCrash) { // clang-format off - EXPECT_EQ( + verifyFormat( "// How to run:\n" "// bbbbb run \\\n" "// rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr\n" "// \\ -- --output_directory=\"\"", - format( "// How to run:\n" "// bbbbb run \\\n" "// rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr \\\n" -"// -- --output_directory=\"\"")); +"// -- --output_directory=\"\""); // clang-format on } @@ -3809,136 +3800,135 @@ TEST_F(FormatTestComments, IndentsLongJavadocAnnotatedLines) { Style.ColumnLimit = 60; FormatStyle 
Style20 = getGoogleStyle(FormatStyle::LK_Java); Style20.ColumnLimit = 20; - EXPECT_EQ( - "/**\n" - " * @param x long long long long long long long long long\n" - " * long\n" - " */", - format("/**\n" - " * @param x long long long long long long long long long long\n" - " */", - Style)); - EXPECT_EQ("/**\n" - " * @param x long long long long long long long long long\n" - " * long long long long long long long long long long\n" - " */", - format("/**\n" - " * @param x long long long long long long long long long " - "long long long long long long long long long long\n" - " */", - Style)); - EXPECT_EQ("/**\n" - " * @param x long long long long long long long long long\n" - " * long long long long long long long long long long\n" - " * long\n" - " */", - format("/**\n" - " * @param x long long long long long long long long long " - "long long long long long long long long long long long\n" - " */", - Style)); - EXPECT_EQ("/**\n" - " * Sentence that\n" - " * should be broken.\n" - " * @param short\n" - " * keep indentation\n" - " */", - format("/**\n" - " * Sentence that should be broken.\n" - " * @param short\n" - " * keep indentation\n" - " */", - Style20)); - - EXPECT_EQ("/**\n" - " * @param l1 long1\n" - " * to break\n" - " * @param l2 long2\n" - " * to break\n" - " */", - format("/**\n" - " * @param l1 long1 to break\n" - " * @param l2 long2 to break\n" - " */", - Style20)); - - EXPECT_EQ("/**\n" - " * @param xx to\n" - " * break\n" - " * no reflow\n" - " */", - format("/**\n" - " * @param xx to break\n" - " * no reflow\n" - " */", - Style20)); - - EXPECT_EQ("/**\n" - " * @param xx to\n" - " * break yes\n" - " * reflow\n" - " */", - format("/**\n" - " * @param xx to break\n" - " * yes reflow\n" - " */", - Style20)); + verifyFormat("/**\n" + " * @param x long long long long long long long long long\n" + " * long\n" + " */", + "/**\n" + " * @param x long long long long long long long long long long\n" + " */", + Style); + verifyFormat("/**\n" + " * @param x long long 
long long long long long long long\n" + " * long long long long long long long long long long\n" + " */", + "/**\n" + " * @param x long long long long long long long long long " + "long long long long long long long long long long\n" + " */", + Style); + verifyFormat("/**\n" + " * @param x long long long long long long long long long\n" + " * long long long long long long long long long long\n" + " * long\n" + " */", + "/**\n" + " * @param x long long long long long long long long long " + "long long long long long long long long long long long\n" + " */", + Style); + verifyFormat("/**\n" + " * Sentence that\n" + " * should be broken.\n" + " * @param short\n" + " * keep indentation\n" + " */", + "/**\n" + " * Sentence that should be broken.\n" + " * @param short\n" + " * keep indentation\n" + " */", + Style20); + + verifyFormat("/**\n" + " * @param l1 long1\n" + " * to break\n" + " * @param l2 long2\n" + " * to break\n" + " */", + "/**\n" + " * @param l1 long1 to break\n" + " * @param l2 long2 to break\n" + " */", + Style20); + + verifyFormat("/**\n" + " * @param xx to\n" + " * break\n" + " * no reflow\n" + " */", + "/**\n" + " * @param xx to break\n" + " * no reflow\n" + " */", + Style20); + + verifyFormat("/**\n" + " * @param xx to\n" + " * break yes\n" + " * reflow\n" + " */", + "/**\n" + " * @param xx to break\n" + " * yes reflow\n" + " */", + Style20); FormatStyle JSStyle20 = getGoogleStyle(FormatStyle::LK_JavaScript); JSStyle20.ColumnLimit = 20; - EXPECT_EQ("/**\n" - " * @param l1 long1\n" - " * to break\n" - " */", - format("/**\n" - " * @param l1 long1 to break\n" - " */", - JSStyle20)); - EXPECT_EQ("/**\n" - " * @param {l1 long1\n" - " * to break}\n" - " */", - format("/**\n" - " * @param {l1 long1 to break}\n" - " */", - JSStyle20)); + verifyFormat("/**\n" + " * @param l1 long1\n" + " * to break\n" + " */", + "/**\n" + " * @param l1 long1 to break\n" + " */", + JSStyle20); + verifyFormat("/**\n" + " * @param {l1 long1\n" + " * to break}\n" + " */", + 
"/**\n" + " * @param {l1 long1 to break}\n" + " */", + JSStyle20); } TEST_F(FormatTestComments, SpaceAtLineCommentBegin) { FormatStyle Style = getLLVMStyle(); - StringRef NoTextInComment = " // \n" - "\n" - "void foo() {// \n" - "// \n" - "}"; - - EXPECT_EQ("//\n" - "\n" - "void foo() { //\n" - " //\n" - "}", - format(NoTextInComment, Style)); + constexpr StringRef NoTextInComment(" // \n" + "\n" + "void foo() {// \n" + "// \n" + "}"); + + verifyFormat("//\n" + "\n" + "void foo() { //\n" + " //\n" + "}", + NoTextInComment, Style); Style.SpacesInLineCommentPrefix.Minimum = 0; verifyFormat("//#comment", Style); - EXPECT_EQ("//\n" - "\n" - "void foo() { //\n" - " //\n" - "}", - format(NoTextInComment, Style)); + verifyFormat("//\n" + "\n" + "void foo() { //\n" + " //\n" + "}", + NoTextInComment, Style); Style.SpacesInLineCommentPrefix.Minimum = 5; - EXPECT_EQ("// #comment", format("//#comment", Style)); - EXPECT_EQ("//\n" - "\n" - "void foo() { //\n" - " //\n" - "}", - format(NoTextInComment, Style)); + verifyFormat("// #comment", "//#comment", Style); + verifyFormat("//\n" + "\n" + "void foo() { //\n" + " //\n" + "}", + NoTextInComment, Style); Style = getLLVMStyle(); - StringRef Code = + constexpr StringRef Code( "//Free comment without space\n" "\n" "// Free comment with 3 spaces\n" @@ -4008,226 +3998,232 @@ TEST_F(FormatTestComments, SpaceAtLineCommentBegin) { "//} will not move\n" "\n" "//vv will only move\n" - "//} if the line above does"; - - EXPECT_EQ("// Free comment without space\n" - "\n" - "// Free comment with 3 spaces\n" - "\n" - "/// Free Doxygen without space\n" - "\n" - "/// Free Doxygen with 3 spaces\n" - "\n" - "// 🐉 A nice dragon\n" - "\n" - "//\t abccba\n" - "\n" - "//\\t deffed\n" - "\n" - "// 🐉 Another nice dragon\n" - "\n" - "// \t Three leading spaces following tab\n" - "\n" - "// \\t Three leading spaces following backslash\n" - "\n" - "/// A Doxygen Comment with a nested list:\n" - "/// - Foo\n" - "/// - Bar\n" - "/// - Baz\n" - "/// - 
End\n" - "/// of the inner list\n" - "/// .\n" - "/// .\n" - "\n" - "namespace Foo {\n" - "bool bar(bool b) {\n" - " bool ret1 = true; ///< Doxygenstyle without space\n" - " bool ret2 = true; ///< Doxygenstyle with 3 spaces\n" - " if (b) {\n" - " // Foo\n" - "\n" - " // In function comment\n" - " ret2 = false;\n" - " } // End of if\n" - "\n" - " // if (ret1) {\n" - " // return ret2;\n" - " // }\n" - "\n" - " // if (ret1) {\n" - " // return ret2;\n" - " // }\n" - "\n" - " return ret1 && ret2;\n" - "}\n" - "} // namespace Foo\n" - "\n" - "namespace Bar {\n" - "int foo();\n" - "} // namespace Bar\n" - "//@Nothing added because of the non ascii char\n" - "\n" - "//@ Nothing removed because of the non ascii char\n" - "\n" - "// Comment to move to the left\n" - "// But not this?\n" - "// @but this\n" - "\n" - "// Comment to move to the right\n" - "//@ this stays\n" - "\n" - "//} will not move\n" - "\n" - "// vv will only move\n" - "// } if the line above does", - format(Code, Style)); + "//} if the line above does"); + + constexpr StringRef Code2( + "// Free comment without space\n" + "\n" + "// Free comment with 3 spaces\n" + "\n" + "/// Free Doxygen without space\n" + "\n" + "/// Free Doxygen with 3 spaces\n" + "\n" + "// 🐉 A nice dragon\n" + "\n" + "//\t abccba\n" + "\n" + "//\\t deffed\n" + "\n" + "// 🐉 Another nice dragon\n" + "\n" + "// \t Three leading spaces following tab\n" + "\n" + "// \\t Three leading spaces following backslash\n" + "\n" + "/// A Doxygen Comment with a nested list:\n" + "/// - Foo\n" + "/// - Bar\n" + "/// - Baz\n" + "/// - End\n" + "/// of the inner list\n" + "/// .\n" + "/// .\n" + "\n" + "namespace Foo {\n" + "bool bar(bool b) {\n" + " bool ret1 = true; ///< Doxygenstyle without space\n" + " bool ret2 = true; ///< Doxygenstyle with 3 spaces\n" + " if (b) {\n" + " // Foo\n" + "\n" + " // In function comment\n" + " ret2 = false;\n" + " } // End of if\n" + "\n" + " // if (ret1) {\n" + " // return ret2;\n" + " // }\n" + "\n" + " // if (ret1) 
{\n" + " // return ret2;\n" + " // }\n" + "\n" + " return ret1 && ret2;\n" + "}\n" + "} // namespace Foo\n" + "\n" + "namespace Bar {\n" + "int foo();\n" + "} // namespace Bar\n" + "//@Nothing added because of the non ascii char\n" + "\n" + "//@ Nothing removed because of the non ascii char\n" + "\n" + "// Comment to move to the left\n" + "// But not this?\n" + "// @but this\n" + "\n" + "// Comment to move to the right\n" + "//@ this stays\n" + "\n" + "//} will not move\n" + "\n" + "// vv will only move\n" + "// } if the line above does"); + + constexpr StringRef Code3( + "//Free comment without space\n" + "\n" + "//Free comment with 3 spaces\n" + "\n" + "///Free Doxygen without space\n" + "\n" + "///Free Doxygen with 3 spaces\n" + "\n" + "//🐉 A nice dragon\n" + "\n" + "//\t abccba\n" + "\n" + "//\\t deffed\n" + "\n" + "//🐉 Another nice dragon\n" + "\n" + "//\t Three leading spaces following tab\n" + "\n" + "//\\t Three leading spaces following backslash\n" + "\n" + "///A Doxygen Comment with a nested list:\n" + "///- Foo\n" + "///- Bar\n" + "/// - Baz\n" // Here we keep the relative indentation + "/// - End\n" + "/// of the inner list\n" + "/// .\n" + "///.\n" + "\n" + "namespace Foo {\n" + "bool bar(bool b) {\n" + " bool ret1 = true; /// Date: Wed, 29 Oct 2025 16:38:41 -0500 Subject: [PATCH 146/539] [dfsan] Fix getShadowAddress computation (#162864) Fix getShadowAddress computation by adding ShadowBase if it is not zero. 
Co-authored-by: anoopkg6 --- llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 5ba2167859490..cc53ec2c0f2f3 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -1957,8 +1957,12 @@ Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Value *DataFlowSanitizer::getShadowAddress(Value *Addr, BasicBlock::iterator Pos) { IRBuilder<> IRB(Pos->getParent(), Pos); - Value *ShadowOffset = getShadowOffset(Addr, IRB); - return getShadowAddress(Addr, Pos, ShadowOffset); + Value *ShadowAddr = getShadowOffset(Addr, IRB); + uint64_t ShadowBase = MapParams->ShadowBase; + if (ShadowBase != 0) + ShadowAddr = + IRB.CreateAdd(ShadowAddr, ConstantInt::get(IntptrTy, ShadowBase)); + return getShadowAddress(Addr, Pos, ShadowAddr); } Value *DFSanFunction::combineShadowsThenConvert(Type *T, Value *V1, Value *V2, From 529057584c6796d6facc77d2035e85df23dc1dd9 Mon Sep 17 00:00:00 2001 From: Erik Enikeev <47039011+Varnike@users.noreply.github.com> Date: Thu, 30 Oct 2025 00:43:43 +0300 Subject: [PATCH 147/539] [ARM] Add instruction selection for strict FP (#160696) This consists of marking the various strict opcodes as legal, and adjusting instruction selection patterns so that 'op' is 'any_op'. The changes are similar to those in D114946 for AArch64. Custom lowering and promotion are set for some FP16 strict ops to work correctly. This PR is part of the work on adding strict FP support in ARM, which was previously discussed in #137101. 
--- llvm/lib/Target/ARM/ARMISelLowering.cpp | 86 +- llvm/lib/Target/ARM/ARMInstrInfo.td | 6 +- llvm/lib/Target/ARM/ARMInstrVFP.td | 175 ++-- llvm/test/CodeGen/ARM/fp-intrinsics.ll | 169 ++- llvm/test/CodeGen/ARM/fp16-fullfp16.ll | 968 +++++++++++++++++- .../test/CodeGen/ARM/strict-fp-int-promote.ll | 159 +++ llvm/test/CodeGen/ARM/strict-fp-ops.ll | 202 ++++ .../CodeGen/ARM/strictfp_f16_abi_promote.ll | 270 +++++ 8 files changed, 1896 insertions(+), 139 deletions(-) create mode 100644 llvm/test/CodeGen/ARM/strict-fp-int-promote.ll create mode 100644 llvm/test/CodeGen/ARM/strict-fp-ops.ll create mode 100644 llvm/test/CodeGen/ARM/strictfp_f16_abi_promote.ll diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index fdba45461377d..a4d3d62e9f487 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -601,10 +601,20 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); - if (!Subtarget->hasVFP2Base()) + if (!Subtarget->hasVFP2Base()) { setAllExpand(MVT::f32); - if (!Subtarget->hasFP64()) + } else { + for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, + ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) + setOperationAction(Op, MVT::f32, Legal); + } + if (!Subtarget->hasFP64()) { setAllExpand(MVT::f64); + } else { + for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, + ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) + setOperationAction(Op, MVT::f64, Legal); + } } if (Subtarget->hasFullFP16()) { @@ -1281,12 +1291,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) { setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); + setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, 
LibCall); + setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, LibCall); } // fp16 is a special v7 extension that adds f16 <-> f32 conversions. if (!Subtarget->hasFP16()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); + setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, LibCall); + setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, LibCall); } // Strict floating-point comparisons need custom lowering. @@ -1333,31 +1347,42 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, } // FP16 often need to be promoted to call lib functions + // clang-format off if (Subtarget->hasFullFP16()) { - setOperationAction(ISD::FREM, MVT::f16, Promote); - setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); - setOperationAction(ISD::FSIN, MVT::f16, Promote); - setOperationAction(ISD::FCOS, MVT::f16, Promote); - setOperationAction(ISD::FTAN, MVT::f16, Promote); - setOperationAction(ISD::FSINCOS, MVT::f16, Promote); - setOperationAction(ISD::FPOWI, MVT::f16, Promote); - setOperationAction(ISD::FPOW, MVT::f16, Promote); - setOperationAction(ISD::FEXP, MVT::f16, Promote); - setOperationAction(ISD::FEXP2, MVT::f16, Promote); - setOperationAction(ISD::FEXP10, MVT::f16, Promote); - setOperationAction(ISD::FLOG, MVT::f16, Promote); - setOperationAction(ISD::FLOG10, MVT::f16, Promote); - setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::LRINT, MVT::f16, Expand); setOperationAction(ISD::LROUND, MVT::f16, Expand); - - setOperationAction(ISD::FROUND, MVT::f16, Legal); - setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); - setOperationAction(ISD::FTRUNC, MVT::f16, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal); - setOperationAction(ISD::FRINT, MVT::f16, Legal); - setOperationAction(ISD::FFLOOR, MVT::f16, Legal); - setOperationAction(ISD::FCEIL, MVT::f16, Legal); + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); + + for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI, 
+ ISD::FCOS, ISD::FSIN, ISD::FSINCOS, + ISD::FSINCOSPI, ISD::FMODF, ISD::FACOS, + ISD::FASIN, ISD::FATAN, ISD::FATAN2, + ISD::FCOSH, ISD::FSINH, ISD::FTANH, + ISD::FTAN, ISD::FEXP, ISD::FEXP2, + ISD::FEXP10, ISD::FLOG, ISD::FLOG2, + ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW, + ISD::STRICT_FPOWI, ISD::STRICT_FCOS, ISD::STRICT_FSIN, + ISD::STRICT_FACOS, ISD::STRICT_FASIN, ISD::STRICT_FATAN, + ISD::STRICT_FATAN2, ISD::STRICT_FCOSH, ISD::STRICT_FSINH, + ISD::STRICT_FTANH, ISD::STRICT_FEXP, ISD::STRICT_FEXP2, + ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10, + ISD::STRICT_FTAN}) { + setOperationAction(Op, MVT::f16, Promote); + } + + // Round-to-integer need custom lowering for fp16, as Promote doesn't work + // because the result type is integer. + for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) + setOperationAction(Op, MVT::f16, Custom); + + for (auto Op : {ISD::FROUND, ISD::FROUNDEVEN, ISD::FTRUNC, + ISD::FNEARBYINT, ISD::FRINT, ISD::FFLOOR, + ISD::FCEIL, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN, + ISD::STRICT_FTRUNC, ISD::STRICT_FNEARBYINT, ISD::STRICT_FRINT, + ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL}) { + setOperationAction(Op, MVT::f16, Legal); + } + // clang-format on } if (Subtarget->hasNEON()) { @@ -10725,6 +10750,19 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerCMP(Op, DAG); case ISD::ABS: return LowerABS(Op, DAG); + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: { + assert((Op.getOperand(1).getValueType() == MVT::f16 || + Op.getOperand(1).getValueType() == MVT::bf16) && + "Expected custom lowering of rounding operations only for f16"); + SDLoc DL(Op); + SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1)}); + return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other}, + {Ext.getValue(1), Ext.getValue(0)}); + } } } 
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 10d4cd5dd96c1..f7176a65d8163 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -473,15 +473,15 @@ def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs)>; // An 'fmul' node with a single use. let HasOneUse = 1 in -def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs)>; +def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (any_fmul node:$lhs, node:$rhs)>; // An 'fadd' node which checks for single non-hazardous use. -def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{ +def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(any_fadd node:$lhs, node:$rhs),[{ return hasNoVMLxHazardUse(N); }]>; // An 'fsub' node which checks for single non-hazardous use. -def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{ +def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(any_fsub node:$lhs, node:$rhs),[{ return hasNoVMLxHazardUse(N); }]>; diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index 6771106ef2d89..e2cc97b7b4634 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -439,14 +439,14 @@ let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FP def VADDD : ADbI<0b11100, 0b11, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fadd DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPALU64]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VADDS : ASbIn<0b11100, 0b11, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]>, + [(set SPR:$Sd, (any_fadd SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPALU32]> { // Some single precision VFP 
instructions may be executed on both NEON and // VFP pipelines on A8. @@ -457,21 +457,21 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP def VADDH : AHbI<0b11100, 0b11, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (any_fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPALU32]>; let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSUBD : ADbI<0b11100, 0b11, 1, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fsub DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPALU64]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSUBS : ASbIn<0b11100, 0b11, 1, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]>, + [(set SPR:$Sd, (any_fsub SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPALU32]>{ // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
@@ -482,42 +482,42 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP def VSUBH : AHbI<0b11100, 0b11, 1, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (any_fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPALU32]>; let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVD : ADbI<0b11101, 0b00, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fdiv DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPDIV64]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVS : ASbI<0b11101, 0b00, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>, + [(set SPR:$Sd, (any_fdiv SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPDIV32]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VDIVH : AHbI<0b11101, 0b00, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (any_fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPDIV32]>; let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMULD : ADbI<0b11100, 0b10, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fmul DPR:$Dn, (f64 DPR:$Dm)))]>, Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VMULS : ASbIn<0b11100, 0b10, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm", - 
[(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]>, + [(set SPR:$Sd, (any_fmul SPR:$Sn, SPR:$Sm))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. @@ -528,21 +528,21 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP def VMULH : AHbI<0b11100, 0b10, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (any_fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>; let TwoOperandAliasConstraint = "$Dn = $Dd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMULD : ADbI<0b11100, 0b10, 1, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>, + [(set DPR:$Dd, (fneg (any_fmul DPR:$Dn, (f64 DPR:$Dm))))]>, Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>; let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VNMULS : ASbI<0b11100, 0b10, 1, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]>, + [(set SPR:$Sd, (fneg (any_fmul SPR:$Sn, SPR:$Sm)))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
@@ -553,7 +553,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd", mayRaiseFPException = 1, Uses = [FP def VNMULH : AHbI<0b11100, 0b10, 1, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm", - [(set (f16 HPR:$Sd), (fneg (fmul (f16 HPR:$Sn), (f16 HPR:$Sm))))]>, + [(set (f16 HPR:$Sd), (fneg (any_fmul (f16 HPR:$Sn), (f16 HPR:$Sm))))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>; multiclass vsel_inst opc, int CC> { @@ -587,7 +587,7 @@ defm VSELGE : vsel_inst<"ge", 0b10, 10>; defm VSELEQ : vsel_inst<"eq", 0b00, 0>; defm VSELVS : vsel_inst<"vs", 0b01, 6>; -multiclass vmaxmin_inst { +multiclass vmaxmin_inst { let DecoderNamespace = "VFPV8", PostEncoderMethod = "", isUnpredicable = 1, mayRaiseFPException = 1 in { def H : AHbInp<0b11101, 0b00, opc, @@ -610,8 +610,8 @@ multiclass vmaxmin_inst { } } -defm VFP_VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>; -defm VFP_VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>; +defm VFP_VMAXNM : vmaxmin_inst<"vmaxnm", 0, any_fmaxnum>; +defm VFP_VMINNM : vmaxmin_inst<"vminnm", 1, any_fminnum>; // Match reassociated forms only if not sign dependent rounding. def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)), @@ -746,7 +746,7 @@ let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, (outs DPR:$Dd), (ins SPR:$Sm), IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm", "", - [(set DPR:$Dd, (fpextend SPR:$Sm))]>, + [(set DPR:$Dd, (any_fpextend SPR:$Sm))]>, Sched<[WriteFPCVT]> { // Instruction operands. bits<5> Dd; @@ -766,7 +766,7 @@ def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm", "", - [(set SPR:$Sd, (fpround DPR:$Dm))]>, + [(set SPR:$Sd, (any_fpround DPR:$Dm))]>, Sched<[WriteFPCVT]> { // Instruction operands. 
bits<5> Sd; @@ -796,7 +796,7 @@ def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(f32 (fpextend (f16 HPR:$Sm))), +def : FP16Pat<(f32 (any_fpextend (f16 HPR:$Sm))), (VCVTBHS (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>; def : FP16Pat<(f16_to_fp GPR:$a), (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; @@ -808,16 +808,16 @@ def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sda, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(f16 (fpround SPR:$Sm)), +def : FP16Pat<(f16 (any_fpround SPR:$Sm)), (COPY_TO_REGCLASS (VCVTBSH (IMPLICIT_DEF), SPR:$Sm), HPR)>; def : FP16Pat<(fp_to_f16 SPR:$a), (i32 (COPY_TO_REGCLASS (VCVTBSH (IMPLICIT_DEF), SPR:$a), GPR))>; -def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane), +def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_even:$lane), (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTBSH (EXTRACT_SUBREG (v8f16 MQPR:$src1), (SSubReg_f16_reg imm:$lane)), SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; -def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane), +def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_even:$lane), (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTBSH (EXTRACT_SUBREG (v4f16 DPR:$src1), (SSubReg_f16_reg imm:$lane)), SPR:$src2), @@ -830,9 +830,9 @@ def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(f32 (fpextend (extractelt (v8f16 MQPR:$src), imm_odd:$lane))), +def : FP16Pat<(f32 (any_fpextend (extractelt (v8f16 MQPR:$src), imm_odd:$lane))), (VCVTTHS (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane)))>; -def : FP16Pat<(f32 (fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))), +def : FP16Pat<(f32 (any_fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))), (VCVTTHS 
(EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)), (SSubReg_f16_reg imm_odd:$lane)))>; @@ -844,12 +844,12 @@ def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sda, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane), +def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_odd:$lane), (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTTSH (EXTRACT_SUBREG (v8f16 MQPR:$src1), (SSubReg_f16_reg imm:$lane)), SPR:$src2), (SSubReg_f16_reg imm:$lane)))>; -def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane), +def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (any_fpround (f32 SPR:$src2))), imm_odd:$lane), (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTTSH (EXTRACT_SUBREG (v4f16 DPR:$src1), (SSubReg_f16_reg imm:$lane)), SPR:$src2), @@ -872,7 +872,7 @@ def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, let hasSideEffects = 0; } -def : FullFP16Pat<(f64 (fpextend (f16 HPR:$Sm))), +def : FullFP16Pat<(f64 (any_fpextend (f16 HPR:$Sm))), (VCVTBHD (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>, Requires<[HasFPARMv8, HasDPVFP]>; def : FP16Pat<(f64 (f16_to_fp GPR:$a)), @@ -898,7 +898,7 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0, let hasSideEffects = 0; } -def : FullFP16Pat<(f16 (fpround DPR:$Dm)), +def : FullFP16Pat<(f16 (any_fpround DPR:$Dm)), (COPY_TO_REGCLASS (VCVTBDH (IMPLICIT_DEF), DPR:$Dm), HPR)>, Requires<[HasFPARMv8, HasDPVFP]>; def : FP16Pat<(fp_to_f16 (f64 DPR:$a)), @@ -1007,41 +1007,41 @@ multiclass vcvt_inst rm, let Predicates = [HasFPARMv8] in { let Predicates = [HasFullFP16] in { - def : Pat<(i32 (fp_to_sint (node (f16 HPR:$a)))), + def : Pat<(i32 (any_fp_to_sint (node (f16 HPR:$a)))), (COPY_TO_REGCLASS (!cast(NAME#"SH") (f16 HPR:$a)), GPR)>; - def : Pat<(i32 (fp_to_uint (node (f16 HPR:$a)))), + def : Pat<(i32 (any_fp_to_uint (node (f16 HPR:$a)))), (COPY_TO_REGCLASS 
(!cast(NAME#"UH") (f16 HPR:$a)), GPR)>; } - def : Pat<(i32 (fp_to_sint (node SPR:$a))), + def : Pat<(i32 (any_fp_to_sint (node SPR:$a))), (COPY_TO_REGCLASS (!cast(NAME#"SS") SPR:$a), GPR)>; - def : Pat<(i32 (fp_to_uint (node SPR:$a))), + def : Pat<(i32 (any_fp_to_uint (node SPR:$a))), (COPY_TO_REGCLASS (!cast(NAME#"US") SPR:$a), GPR)>; } let Predicates = [HasFPARMv8, HasDPVFP] in { - def : Pat<(i32 (fp_to_sint (node (f64 DPR:$a)))), + def : Pat<(i32 (any_fp_to_sint (node (f64 DPR:$a)))), (COPY_TO_REGCLASS (!cast(NAME#"SD") DPR:$a), GPR)>; - def : Pat<(i32 (fp_to_uint (node (f64 DPR:$a)))), + def : Pat<(i32 (any_fp_to_uint (node (f64 DPR:$a)))), (COPY_TO_REGCLASS (!cast(NAME#"UD") DPR:$a), GPR)>; } } -defm VCVTA : vcvt_inst<"a", 0b00, fround>; +defm VCVTA : vcvt_inst<"a", 0b00, any_fround>; defm VCVTN : vcvt_inst<"n", 0b01>; -defm VCVTP : vcvt_inst<"p", 0b10, fceil>; -defm VCVTM : vcvt_inst<"m", 0b11, ffloor>; +defm VCVTP : vcvt_inst<"p", 0b10, any_fceil>; +defm VCVTM : vcvt_inst<"m", 0b11, any_ffloor>; def VNEGD : ADuI<0b11101, 0b11, 0b0001, 0b01, 0, (outs DPR:$Dd), (ins DPR:$Dm), @@ -1103,9 +1103,9 @@ multiclass vrint_inst_zrx; } -defm VRINTZ : vrint_inst_zrx<"z", 0, 1, ftrunc, [], 0>; -defm VRINTR : vrint_inst_zrx<"r", 0, 0, fnearbyint, [FPSCR_RM], 0>; -defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint, [FPSCR_RM], 1>; +defm VRINTZ : vrint_inst_zrx<"z", 0, 1, any_ftrunc, [], 0>; +defm VRINTR : vrint_inst_zrx<"r", 0, 0, any_fnearbyint, [FPSCR_RM], 0>; +defm VRINTX : vrint_inst_zrx<"x", 1, 0, any_frint, [FPSCR_RM], 1>; multiclass vrint_inst_anpm rm, SDPatternOperator node = null_frag> { @@ -1145,30 +1145,31 @@ multiclass vrint_inst_anpm rm, Requires<[HasFPARMv8,HasDPVFP]>; } -defm VRINTA : vrint_inst_anpm<"a", 0b00, fround>; -defm VRINTN : vrint_inst_anpm<"n", 0b01, froundeven>; -defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>; -defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>; +defm VRINTA : vrint_inst_anpm<"a", 0b00, any_fround>; +defm VRINTN : vrint_inst_anpm<"n", 
0b01, any_froundeven>; +defm VRINTP : vrint_inst_anpm<"p", 0b10, any_fceil>; +defm VRINTM : vrint_inst_anpm<"m", 0b11, any_ffloor>; + let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs DPR:$Dd), (ins DPR:$Dm), IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm", "", - [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>, + [(set DPR:$Dd, (any_fsqrt (f64 DPR:$Dm)))]>, Sched<[WriteFPSQRT64]>; let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm", "", - [(set SPR:$Sd, (fsqrt SPR:$Sm))]>, + [(set SPR:$Sd, (any_fsqrt SPR:$Sm))]>, Sched<[WriteFPSQRT32]>; let mayRaiseFPException = 1, Uses = [FPSCR_RM] in def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs HPR:$Sd), (ins HPR:$Sm), IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm", - [(set (f16 HPR:$Sd), (fsqrt (f16 HPR:$Sm)))]>; + [(set (f16 HPR:$Sd), (any_fsqrt (f16 HPR:$Sm)))]>; let hasSideEffects = 0 in { let isMoveReg = 1 in { @@ -1509,10 +1510,10 @@ def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, } let Predicates=[HasVFP2, HasDPVFP] in { - def : VFPPat<(f64 (sint_to_fp GPR:$a)), + def : VFPPat<(f64 (any_sint_to_fp GPR:$a)), (VSITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; - def : VFPPat<(f64 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), + def : VFPPat<(f64 (any_sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOD (VLDRS addrmode5:$a))>; } @@ -1529,10 +1530,10 @@ def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, let D = VFPNeonA8Domain; } -def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)), +def : VFPNoNEONPat<(f32 (any_sint_to_fp GPR:$a)), (VSITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), +def : VFPNoNEONPat<(f32 (any_sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOS (VLDRS addrmode5:$a))>; let mayRaiseFPException = 1 in @@ -1545,7 +1546,7 @@ def VSITOH : AVConv1IHs_Encode<0b11101, 
0b11, 0b1000, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)), +def : VFPNoNEONPat<(f16 (any_sint_to_fp GPR:$a)), (VSITOH (COPY_TO_REGCLASS GPR:$a, SPR))>; let mayRaiseFPException = 1 in @@ -1558,10 +1559,10 @@ def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, } let Predicates=[HasVFP2, HasDPVFP] in { - def : VFPPat<(f64 (uint_to_fp GPR:$a)), + def : VFPPat<(f64 (any_uint_to_fp GPR:$a)), (VUITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; - def : VFPPat<(f64 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), + def : VFPPat<(f64 (any_uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOD (VLDRS addrmode5:$a))>; } @@ -1578,10 +1579,10 @@ def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, let D = VFPNeonA8Domain; } -def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)), +def : VFPNoNEONPat<(f32 (any_uint_to_fp GPR:$a)), (VUITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), +def : VFPNoNEONPat<(f32 (any_uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOS (VLDRS addrmode5:$a))>; let mayRaiseFPException = 1 in @@ -1594,7 +1595,7 @@ def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(f16 (uint_to_fp GPR:$a)), +def : VFPNoNEONPat<(f16 (any_uint_to_fp GPR:$a)), (VUITOH (COPY_TO_REGCLASS GPR:$a, SPR))>; // FP -> Int: @@ -1669,12 +1670,12 @@ def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011, } let Predicates=[HasVFP2, HasDPVFP] in { - def : VFPPat<(i32 (fp_to_sint (f64 DPR:$a))), + def : VFPPat<(i32 (any_fp_to_sint (f64 DPR:$a))), (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>; def : VFPPat<(i32 (fp_to_sint_sat (f64 DPR:$a), i32)), (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>; - def : VFPPat<(alignedstore32 (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr), + def : VFPPat<(alignedstore32 (i32 (any_fp_to_sint (f64 DPR:$a))), addrmode5:$ptr), (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>; def : VFPPat<(alignedstore32 
(i32 (fp_to_sint_sat (f64 DPR:$a), i32)), addrmode5:$ptr), (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>; @@ -1693,12 +1694,12 @@ def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, let D = VFPNeonA8Domain; } -def : VFPNoNEONPat<(i32 (fp_to_sint SPR:$a)), +def : VFPNoNEONPat<(i32 (any_fp_to_sint SPR:$a)), (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>; def : VFPPat<(i32 (fp_to_sint_sat SPR:$a, i32)), (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>; -def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))), +def : VFPNoNEONPat<(alignedstore32 (i32 (any_fp_to_sint (f32 SPR:$a))), addrmode5:$ptr), (VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>; def : VFPPat<(alignedstore32 (i32 (fp_to_sint_sat (f32 SPR:$a), i32)), @@ -1715,7 +1716,7 @@ def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(i32 (fp_to_sint (f16 HPR:$a))), +def : VFPNoNEONPat<(i32 (any_fp_to_sint (f16 HPR:$a))), (COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>; def : VFPPat<(i32 (fp_to_sint_sat (f16 HPR:$a), i32)), (COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>; @@ -1730,12 +1731,12 @@ def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, } let Predicates=[HasVFP2, HasDPVFP] in { - def : VFPPat<(i32 (fp_to_uint (f64 DPR:$a))), + def : VFPPat<(i32 (any_fp_to_uint (f64 DPR:$a))), (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>; def : VFPPat<(i32 (fp_to_uint_sat (f64 DPR:$a), i32)), (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>; - def : VFPPat<(alignedstore32 (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr), + def : VFPPat<(alignedstore32 (i32 (any_fp_to_uint (f64 DPR:$a))), addrmode5:$ptr), (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>; def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f64 DPR:$a), i32)), addrmode5:$ptr), (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>; @@ -1754,12 +1755,12 @@ def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, let D = VFPNeonA8Domain; } -def : VFPNoNEONPat<(i32 (fp_to_uint SPR:$a)), +def : VFPNoNEONPat<(i32 
(any_fp_to_uint SPR:$a)), (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>; def : VFPPat<(i32 (fp_to_uint_sat SPR:$a, i32)), (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>; -def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))), +def : VFPNoNEONPat<(alignedstore32 (i32 (any_fp_to_uint (f32 SPR:$a))), addrmode5:$ptr), (VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>; def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f32 SPR:$a), i32)), @@ -1776,7 +1777,7 @@ def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(i32 (fp_to_uint (f16 HPR:$a))), +def : VFPNoNEONPat<(i32 (any_fp_to_uint (f16 HPR:$a))), (COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>; def : VFPPat<(i32 (fp_to_uint_sat (f16 HPR:$a), i32)), (COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>; @@ -2320,13 +2321,13 @@ def : Pat<(fadd_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)), // Match @llvm.fma.* intrinsics // (fma x, y, z) -> (vfms z, x, y) -def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)), +def : Pat<(f64 (any_fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)), (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)), +def : Pat<(f32 (any_fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)), (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))), +def : Pat<(f16 (any_fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))), (VFMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; @@ -2375,13 +2376,13 @@ def : Pat<(fsub_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)), // Match @llvm.fma.* intrinsics // (fma (fneg x), y, z) -> (vfms z, x, y) -def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)), +def : Pat<(f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)), (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)), +def : Pat<(f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)), (VFMSS 
SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))), +def : Pat<(f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))), (VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; @@ -2427,23 +2428,23 @@ def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), // Match @llvm.fma.* intrinsics // (fneg (fma x, y, z)) -> (vfnma z, x, y) -def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))), +def : Pat<(fneg (any_fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))), (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))), +def : Pat<(fneg (any_fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))), (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 (f16 HPR:$Sdin)))), +def : Pat<(fneg (any_fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 (f16 HPR:$Sdin)))), (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; // (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y) -def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))), +def : Pat<(f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))), (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))), +def : Pat<(f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))), (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), +def : Pat<(f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; @@ -2488,23 +2489,23 @@ def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), // Match @llvm.fma.* intrinsics // (fma x, y, (fneg z)) -> (vfnms z, x, y)) -def : Pat<(f64 
(fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))), +def : Pat<(f64 (any_fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))), (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))), +def : Pat<(f32 (any_fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), +def : Pat<(f16 (any_fma (f16 HPR:$Sn), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; // (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y) -def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))), +def : Pat<(fneg (f64 (any_fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))), (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4,HasDPVFP]>; -def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))), +def : Pat<(fneg (f32 (any_fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))), +def : Pat<(fneg (f16 (any_fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))), (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; diff --git a/llvm/test/CodeGen/ARM/fp-intrinsics.ll b/llvm/test/CodeGen/ARM/fp-intrinsics.ll index 93b6a58a22b6c..cb87508d53342 100644 --- a/llvm/test/CodeGen/ARM/fp-intrinsics.ll +++ b/llvm/test/CodeGen/ARM/fp-intrinsics.ll @@ -76,7 +76,6 @@ define i32 @fptosi_f32(float %x) #0 { ; CHECK-NOSP: bl __aeabi_f2iz ; CHECK-NOSP: bl __aeabi_f2iz ; CHECK-SP: vcvt.s32.f32 -; FIXME-CHECK-SP: vcvt.s32.f32 define void @fptosi_f32_twice(float %arg, ptr %ptr) #0 { entry: %conv = call i32 @llvm.experimental.constrained.fptosi.i32.f32(float %arg, metadata !"fpexcept.strict") #0 @@ -146,6 +145,80 @@ define float @tan_f32(float %x) #0 { ret float %val } +; CHECK-LABEL: acos_f32: +; CHECK: bl acosf +define 
float @acos_f32(float %x, float %y) #0 { + %val = call float @llvm.experimental.constrained.acos.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: asin_f32: +; CHECK: bl asinf +define float @asin_f32(float %x, float %y) #0 { + %val = call float @llvm.experimental.constrained.asin.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: atan_f32: +; CHECK: bl atanf +define float @atan_f32(float %x, float %y) #0 { + %val = call float @llvm.experimental.constrained.atan.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: cosh_f32: +; CHECK: bl coshf +define float @cosh_f32(float %x, float %y) #0 { + %val = call float @llvm.experimental.constrained.cosh.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: sinh_f32: +; CHECK: bl sinhf +define float @sinh_f32(float %x, float %y) #0 { + %val = call float @llvm.experimental.constrained.sinh.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: tanh_f32: +; CHECK: bl tanhf +define float @tanh_f32(float %x, float %y) #0 { + %val = call float @llvm.experimental.constrained.tanh.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: fmuladd_f32: +; CHECK-SP: vfma.f32 +; CHECK-NOSP: bl __aeabi_fmul +; CHECK-NOSP: bl __aeabi_fadd +define float @fmuladd_f32(float %x, float %y, float %z) #0 { + %val = call float @llvm.experimental.constrained.fmuladd.f32(float %x, float %y, float %z, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: ldexp_f32: +; CHECK: bl ldexpf +define float @ldexp_f32(float %x, i32 %y) #0 { + %val = call float @llvm.experimental.constrained.ldexp.f32.i32(float %x, i32 %y, metadata !"round.tonearest", metadata 
!"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: roundeven_f32: +; CHECK-SP-V8: vrintn.f32 +; CHECK-NOSP: bl roundevenf +define float @roundeven_f32(float %x) #0 { + %val = call float @llvm.experimental.constrained.roundeven.f32(float %x, metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: uitofp_f32_i32: +; CHECK-NOSP: bl __aeabi_ui2f +; FIXME-CHECK-SP: vcvt.f32.f64 +define float @uitofp_f32_i32(i32 %x) #0 { + %val = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + ; CHECK-LABEL: atan2_f32: ; CHECK: bl atan2f define float @atan2_f32(float %x, float %y) #0 { @@ -617,6 +690,80 @@ define double @tan_f64(double %x) #0 { ret double %val } +; CHECK-LABEL: acos_f64: +; CHECK: bl acos +define double @acos_f64(double %x, double %y) #0 { + %val = call double @llvm.experimental.constrained.acos.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: asin_f64: +; CHECK: bl asin +define double @asin_f64(double %x, double %y) #0 { + %val = call double @llvm.experimental.constrained.asin.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: atan_f64: +; CHECK: bl atan +define double @atan_f64(double %x, double %y) #0 { + %val = call double @llvm.experimental.constrained.atan.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: cosh_f64: +; CHECK: bl cosh +define double @cosh_f64(double %x, double %y) #0 { + %val = call double @llvm.experimental.constrained.cosh.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: sinh_f64: +; CHECK: bl sinh +define double @sinh_f64(double %x, double %y) #0 { + %val = call double @llvm.experimental.constrained.sinh.f64(double %x, metadata !"round.tonearest", metadata 
!"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: tanh_f64: +; CHECK: bl tanh +define double @tanh_f64(double %x, double %y) #0 { + %val = call double @llvm.experimental.constrained.tanh.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: fmuladd_f64: +; CHECK-DP: vfma.f64 +; CHECK-NODP: bl __aeabi_dmul +; CHECK-NODP: bl __aeabi_dadd +define double @fmuladd_f64(double %x, double %y, double %z) #0 { + %val = call double @llvm.experimental.constrained.fmuladd.f64(double %x, double %y, double %z, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: ldexp_f64: +; CHECK: bl ldexp +define double @ldexp_f64(double %x, i32 %y) #0 { + %val = call double @llvm.experimental.constrained.ldexp.f64.i32(double %x, i32 %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: roundeven_f64: +; CHECK-DP-V8: vrintn.f64 +; CHECK-NODP: bl roundeven +define double @roundeven_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.roundeven.f64(double %x, metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: uitofp_f64_i32: +; CHECK-NOSP: bl __aeabi_ui2d +; FIXME-CHECK-SP: vsub.f64 +define double @uitofp_f64_i32(i32 %x) #0 { + %val = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + ; CHECK-LABEL: atan2_f64: ; CHECK: bl atan2 define double @atan2_f64(double %x, double %y) #0 { @@ -1052,6 +1199,16 @@ declare float @llvm.experimental.constrained.powi.f32(float, i32, metadata, meta declare float @llvm.experimental.constrained.sin.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.cos.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.acos.f32(float, metadata, 
metadata) +declare float @llvm.experimental.constrained.asin.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.atan.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.cosh.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.sinh.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.tanh.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata) +declare float @llvm.experimental.constrained.ldexp.f32.i32(float, i32, metadata, metadata) +declare float @llvm.experimental.constrained.roundeven.f32(float, metadata) +declare float @llvm.experimental.constrained.uitofp.f32.i32(i32, metadata, metadata) declare float @llvm.experimental.constrained.atan2.f32(float, float, metadata, metadata) declare float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata) declare float @llvm.experimental.constrained.log.f32(float, metadata, metadata) @@ -1087,6 +1244,16 @@ declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, me declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.acos.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.asin.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.atan.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.cosh.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.sinh.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.tanh.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata) +declare 
double @llvm.experimental.constrained.ldexp.f64.i32(double, i32, metadata, metadata) +declare double @llvm.experimental.constrained.roundeven.f64(double, metadata) +declare double @llvm.experimental.constrained.uitofp.f64.i32(i32, metadata, metadata) declare double @llvm.experimental.constrained.atan2.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata) diff --git a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll index 200b14bae56ed..b4060d5fdb574 100644 --- a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll +++ b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll @@ -98,12 +98,18 @@ define i32 @test_fptosi_i32(ptr %p) { ret i32 %r } -; FIXME -;define i64 @test_fptosi_i64(ptr %p) { -; %a = load half, ptr %p, align 2 -; %r = fptosi half %a to i64 -; ret i64 %r -;} +define i64 @test_fptosi_i64(ptr %p) { +; CHECK-LABEL: test_fptosi_i64: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: bl __fixhfdi +; CHECK-NEXT: pop {r11, pc} + %a = load half, ptr %p, align 2 + %r = fptosi half %a to i64 + ret i64 %r +} define i32 @test_fptoui_i32(ptr %p) { ; CHECK-LABEL: test_fptoui_i32: @@ -116,12 +122,18 @@ define i32 @test_fptoui_i32(ptr %p) { ret i32 %r } -; FIXME -;define i64 @test_fptoui_i64(ptr %p) { -; %a = load half, ptr %p, align 2 -; %r = fptoui half %a to i64 -; ret i64 %r -;} +define i64 @test_fptoui_i64(ptr %p) { +; CHECK-LABEL: test_fptoui_i64: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: bl __fixunshfdi +; CHECK-NEXT: pop {r11, pc} + %a = load half, ptr %p, align 2 + %r = fptoui half %a to i64 + ret i64 %r +} define void @test_sitofp_i32(i32 %a, ptr %p) { ; CHECK-LABEL: test_sitofp_i32: @@ -145,19 +157,31 @@ define void @test_uitofp_i32(i32 %a, 
ptr %p) { ret void } -; FIXME -;define void @test_sitofp_i64(i64 %a, ptr %p) { -; %r = sitofp i64 %a to half -; store half %r, ptr %p -; ret void -;} +define void @test_sitofp_i64(i64 %a, ptr %p) { +; CHECK-LABEL: test_sitofp_i64: +; CHECK: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: bl __floatdihf +; CHECK-NEXT: vstr.16 s0, [r4] +; CHECK-NEXT: pop {r4, pc} + %r = sitofp i64 %a to half + store half %r, ptr %p + ret void +} -; FIXME -;define void @test_uitofp_i64(i64 %a, ptr %p) { -; %r = uitofp i64 %a to half -; store half %r, ptr %p -; ret void -;} +define void @test_uitofp_i64(i64 %a, ptr %p) { +; CHECK-LABEL: test_uitofp_i64: +; CHECK: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: bl __floatundihf +; CHECK-NEXT: vstr.16 s0, [r4] +; CHECK-NEXT: pop {r4, pc} + %r = uitofp i64 %a to half + store half %r, ptr %p + ret void +} define void @test_fptrunc_float(float %f, ptr %p) { ; CHECK-LABEL: test_fptrunc_float: @@ -613,6 +637,902 @@ define void @test_fmuladd(ptr %p, ptr %q, ptr %r) { ret void } +; Half-precision intrinsics + +define half @add_f16(half %x, half %y) #0 { +; CHECK-LABEL: add_f16: +; CHECK: vadd.f16 s0, s0, s1 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @sub_f16(half %x, half %y) #0 { +; CHECK-LABEL: sub_f16: +; CHECK: vsub.f16 s0, s0, s1 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @mul_f16(half %x, half %y) #0 { +; CHECK-LABEL: mul_f16: +; CHECK: vmul.f16 s0, s0, s1 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @div_f16(half %x, half %y) 
#0 { +; CHECK-LABEL: div_f16: +; CHECK: vdiv.f16 s0, s0, s1 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.fdiv.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @frem_f16(half %x, half %y) #0 { +; CHECK-LABEL: frem_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtb.f32.f16 s1, s1 +; CHECK-NEXT: bl fmodf +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.frem.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @fma_f16(half %x, half %y, half %z) #0 { +; CHECK-LABEL: fma_f16: +; CHECK: vfma.f16 s2, s0, s1 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @fmuladd_f16(half %x, half %y, half %z) #0 { +; CHECK-LABEL: fmuladd_f16: +; CHECK: vfma.f16 s2, s0, s1 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.fmuladd.f16(half %x, half %y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define i32 @fptosi_i32_f16(half %x) #0 { +; CHECK-LABEL: fptosi_i32_f16: +; CHECK: vcvt.s32.f16 s0, s0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr + %val = call i32 @llvm.experimental.constrained.fptosi.i32.f16(half %x, metadata !"fpexcept.strict") #0 + ret i32 %val +} + +define i32 @fptoui_i32_f16(half %x) #0 { +; CHECK-LABEL: fptoui_i32_f16: +; CHECK: vcvt.s32.f16 s0, s0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr + %val = call i32 @llvm.experimental.constrained.fptoui.i32.f16(half %x, metadata !"fpexcept.strict") #0 + ret i32 %val +} + +define i64 @fptosi_i64_f16(half %x) #0 { +; CHECK-LABEL: fptosi_i64_f16: +; CHECK: .save 
{r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vmov.f16 r0, s0 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: bl __fixhfdi +; CHECK-NEXT: pop {r11, pc} + %val = call i64 @llvm.experimental.constrained.fptosi.i64.f16(half %x, metadata !"fpexcept.strict") #0 + ret i64 %val +} + +define i64 @fptoui_i64_f16(half %x) #0 { +; CHECK-LABEL: fptoui_i64_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vmov.f16 r0, s0 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: bl __fixunshfdi +; CHECK-NEXT: pop {r11, pc} + %val = call i64 @llvm.experimental.constrained.fptoui.i64.f16(half %x, metadata !"fpexcept.strict") #0 + ret i64 %val +} + +define half @sitofp_f16_i32(i32 %x) #0 { +; CHECK-LABEL: sitofp_f16_i32: +; CHECK: .pad #8 +; CHECK-NEXT: sub sp, sp, #8 +; CHECK-NEXT: movw r1, #0 +; CHECK-NEXT: eor r0, r0, #-2147483648 +; CHECK-NEXT: movt r1, #17200 +; CHECK-NEXT: str r0, [sp] +; CHECK-NEXT: str r1, [sp, #4] +; CHECK-NEXT: vldr d16, .LCPI57_0 +; CHECK-NEXT: vldr d17, [sp] +; CHECK-NEXT: vsub.f64 d16, d17, d16 +; CHECK-NEXT: vcvtb.f16.f64 s0, d16 +; CHECK-NEXT: add sp, sp, #8 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 3 +; CHECK-NEXT: .LCPI57_0: +; CHECK-NEXT: .long 2147483648 +; CHECK-NEXT: .long 1127219200 + %val = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @uitofp_f16_i32(i32 %x) #0 { +; CHECK-LABEL: uitofp_f16_i32: +; CHECK: .pad #8 +; CHECK-NEXT: sub sp, sp, #8 +; CHECK-NEXT: movw r1, #0 +; CHECK-NEXT: str r0, [sp] +; CHECK-NEXT: movt r1, #17200 +; CHECK-NEXT: vldr d16, .LCPI58_0 +; CHECK-NEXT: str r1, [sp, #4] +; CHECK-NEXT: vldr d17, [sp] +; CHECK-NEXT: vsub.f64 d16, d17, d16 +; CHECK-NEXT: vcvtb.f16.f64 s0, d16 +; CHECK-NEXT: add sp, sp, #8 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 3 +; CHECK-NEXT: .LCPI58_0: +; CHECK-NEXT: .long 0 +; CHECK-NEXT: .long 1127219200 + %val = call half 
@llvm.experimental.constrained.uitofp.f16.i32(i32 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @sitofp_f16_i64(i64 %x) #0 { +; CHECK-LABEL: sitofp_f16_i64: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl __floatdihf +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @uitofp_f16_i64(i64 %x) #0 { +; CHECK-LABEL: uitofp_f16_i64: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl __floatundihf +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.uitofp.f16.i64(i64 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @sitofp_f16_i128(i128 %x) #0 { +; CHECK-LABEL: sitofp_f16_i128: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl __floattihf +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.sitofp.f16.i128(i128 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @uitofp_f16_i128(i128 %x) #0 { +; CHECK-LABEL: uitofp_f16_i128: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl __floatuntihf +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.uitofp.f16.i128(i128 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @sqrt_f16(half %x) #0 { +; CHECK-LABEL: sqrt_f16: +; CHECK: vsqrt.f16 s0, s0 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.sqrt.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @powi_f16(half %x, i32 %y) #0 { +; CHECK-LABEL: powi_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl __powisf2 +; CHECK-NEXT: 
vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.powi.f16(half %x, i32 %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @sin_f16(half %x) #0 { +; CHECK-LABEL: sin_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl sinf +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.sin.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @cos_f16(half %x) #0 { +; CHECK-LABEL: cos_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl cosf +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.cos.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @tan_f16(half %x) #0 { +; CHECK-LABEL: tan_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl tanf +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.tan.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @asin_f16(half %x) #0 { +; CHECK-LABEL: asin_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl asinf +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.asin.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @acos_f16(half %x) #0 { +; CHECK-LABEL: acos_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl acosf +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; 
CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.acos.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @atan_f16(half %x) #0 { +; CHECK-LABEL: atan_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl atanf +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.atan.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @atan2_f16(half %x, half %y) #0 { +; CHECK-LABEL: atan2_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtb.f32.f16 s1, s1 +; CHECK-NEXT: bl atan2f +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.atan2.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @sinh_f16(half %x) #0 { +; CHECK-LABEL: sinh_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl sinhf +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.sinh.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @cosh_f16(half %x) #0 { +; CHECK-LABEL: cosh_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl coshf +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.cosh.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @tanh_f16(half %x) #0 { +; CHECK-LABEL: tanh_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl tanhf +; 
CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.tanh.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @pow_f16(half %x, half %y) #0 { +; CHECK-LABEL: pow_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtb.f32.f16 s1, s1 +; CHECK-NEXT: bl powf +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.pow.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @log_f16(half %x) #0 { +; CHECK-LABEL: log_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl logf +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.log.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @log10_f16(half %x) #0 { +; CHECK-LABEL: log10_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl log10f +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.log10.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @log2_f16(half %x) #0 { +; CHECK-LABEL: log2_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl log2f +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.log2.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @exp_f16(half %x) #0 { +; CHECK-LABEL: exp_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; 
CHECK-NEXT: bl expf +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.exp.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @exp2_f16(half %x) #0 { +; CHECK-LABEL: exp2_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl exp2f +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.exp2.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @rint_f16(half %x) #0 { +; CHECK-LABEL: rint_f16: +; CHECK: vrintx.f16 s0, s0 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.rint.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @nearbyint_f16(half %x) #0 { +; CHECK-LABEL: nearbyint_f16: +; CHECK: vrintr.f16 s0, s0 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.nearbyint.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define i32 @lrint_f16(half %x) #0 { +; CHECK-LABEL: lrint_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl lrintf +; CHECK-NEXT: pop {r11, pc} + %val = call i32 @llvm.experimental.constrained.lrint.i32.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret i32 %val +} + +define i64 @llrint_f16(half %x) #0 { +; CHECK-LABEL: llrint_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl llrintf +; CHECK-NEXT: pop {r11, pc} + %val = call i64 @llvm.experimental.constrained.llrint.i64.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret i64 %val +} + +define half @maxnum_f16(half %x, half %y) #0 { +; CHECK-LABEL: maxnum_f16: +; CHECK: vmaxnm.f16 s0, 
s0, s1 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.maxnum.f16(half %x, half %y, metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @minnum_f16(half %x, half %y) #0 { +; CHECK-LABEL: minnum_f16: +; CHECK: vminnm.f16 s0, s0, s1 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.minnum.f16(half %x, half %y, metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @ceil_f16(half %x) #0 { +; CHECK-LABEL: ceil_f16: +; CHECK: vrintp.f16 s0, s0 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.ceil.f16(half %x, metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @floor_f16(half %x) #0 { +; CHECK-LABEL: floor_f16: +; CHECK: vrintm.f16 s0, s0 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.floor.f16(half %x, metadata !"fpexcept.strict") #0 + ret half %val +} + +define i32 @lround_f16(half %x) #0 { +; CHECK-LABEL: lround_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl lroundf +; CHECK-NEXT: pop {r11, pc} + %val = call i32 @llvm.experimental.constrained.lround.i32.f16(half %x, metadata !"fpexcept.strict") #0 + ret i32 %val +} + +define i64 @llround_f16(half %x) #0 { +; CHECK-LABEL: llround_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl llroundf +; CHECK-NEXT: pop {r11, pc} + %val = call i64 @llvm.experimental.constrained.llround.i64.f16(half %x, metadata !"fpexcept.strict") #0 + ret i64 %val +} + +define half @round_f16(half %x) #0 { +; CHECK-LABEL: round_f16: +; CHECK: vrinta.f16 s0, s0 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.round.f16(half %x, metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @roundeven_f16(half %x) #0 { +; CHECK-LABEL: roundeven_f16: +; CHECK: vrintn.f16 s0, s0 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.roundeven.f16(half 
%x, metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @trunc_f16(half %x) #0 { +; CHECK-LABEL: trunc_f16: +; CHECK: vrintz.f16 s0, s0 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.trunc.f16(half %x, metadata !"fpexcept.strict") #0 + ret half %val +} + +define half @ldexp_f16(half %x, i32 %y) #0 { +; CHECK-LABEL: ldexp_f16: +; CHECK: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bl ldexpf +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: pop {r11, pc} + %val = call half @llvm.experimental.constrained.ldexp.f16.i32(half %x, i32 %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define i32 @fcmp_olt_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmp_olt_f16: +; CHECK: vcmp.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwmi r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"olt", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmp_ole_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmp_ole_f16: +; CHECK: vcmp.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwls r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ole", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmp_ogt_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmp_ogt_f16: +; CHECK: vcmp.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwgt r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ogt", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmp_oge_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmp_oge_f16: +; CHECK: vcmp.f16 s0, s1 +; 
CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwge r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"oge", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmp_oeq_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmp_oeq_f16: +; CHECK: vcmp.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movweq r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"oeq", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmp_one_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmp_one_f16: +; CHECK: vcmp.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwmi r0, #1 +; CHECK-NEXT: movwgt r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"one", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmp_ult_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmp_ult_f16: +; CHECK: vcmp.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwlt r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ult", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmp_ule_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmp_ule_f16: +; CHECK: vcmp.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwle r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ule", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmp_ugt_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmp_ugt_f16: +; CHECK: vcmp.f16 s0, s1 +; 
CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwhi r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ugt", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmp_uge_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmp_uge_f16: +; CHECK: vcmp.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwpl r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"uge", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmp_ueq_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmp_ueq_f16: +; CHECK: vcmp.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movweq r0, #1 +; CHECK-NEXT: movwvs r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ueq", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmp_une_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmp_une_f16: +; CHECK: vcmp.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwne r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"une", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmps_olt_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmps_olt_f16: +; CHECK: vcmpe.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwmi r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"olt", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmps_ole_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmps_ole_f16: +; CHECK: vcmpe.f16 s0, s1 
+; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwls r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"ole", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmps_ogt_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmps_ogt_f16: +; CHECK: vcmpe.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwgt r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"ogt", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmps_oge_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmps_oge_f16: +; CHECK: vcmpe.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwge r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"oge", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmps_oeq_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmps_oeq_f16: +; CHECK: vcmpe.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movweq r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"oeq", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmps_one_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmps_one_f16: +; CHECK: vcmpe.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwmi r0, #1 +; CHECK-NEXT: movwgt r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"one", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmps_ult_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmps_ult_f16: +; CHECK: 
vcmpe.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwlt r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"ult", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmps_ule_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmps_ule_f16: +; CHECK: vcmpe.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwle r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"ule", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmps_ugt_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmps_ugt_f16: +; CHECK: vcmpe.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwhi r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"ugt", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmps_uge_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmps_uge_f16: +; CHECK: vcmpe.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwpl r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"uge", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmps_ueq_f16(half %a, half %b) #0 { +; CHECK-LABEL: fcmps_ueq_f16: +; CHECK: vcmpe.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movweq r0, #1 +; CHECK-NEXT: movwvs r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"ueq", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @fcmps_une_f16(half %a, half %b) #0 { +; CHECK-LABEL: 
fcmps_une_f16: +; CHECK: vcmpe.f16 s0, s1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: movwne r0, #1 +; CHECK-NEXT: bx lr + %cmp = call i1 @llvm.experimental.constrained.fcmps.f16(half %a, half %b, metadata !"une", metadata !"fpexcept.strict") #0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + + +; Intrinsics to convert between floating-point types + +define half @fptrunc_f16_f32(float %x) #0 { +; CHECK-LABEL: fptrunc_f16_f32: +; CHECK: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: bx lr + %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + +define float @fpext_f32_f16(half %x) #0 { +; CHECK-LABEL: fpext_f32_f16: +; CHECK: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: bx lr + %val = call float @llvm.experimental.constrained.fpext.f32.f16(half %x, metadata !"fpexcept.strict") #0 + ret float %val +} + + +attributes #0 = { strictfp } + +declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata) +declare half @llvm.experimental.constrained.fsub.f16(half, half, metadata, metadata) +declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata) +declare half @llvm.experimental.constrained.fdiv.f16(half, half, metadata, metadata) +declare half @llvm.experimental.constrained.frem.f16(half, half, metadata, metadata) +declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata) +declare half @llvm.experimental.constrained.fmuladd.f16(half, half, half, metadata, metadata) +declare i32 @llvm.experimental.constrained.fptosi.i32.f16(half, metadata) +declare i32 @llvm.experimental.constrained.fptoui.i32.f16(half, metadata) +declare i64 @llvm.experimental.constrained.fptosi.i64.f16(half, metadata) +declare i64 @llvm.experimental.constrained.fptoui.i64.f16(half, metadata) +declare half @llvm.experimental.constrained.sitofp.f16.i32(i32, metadata, metadata) +declare half 
@llvm.experimental.constrained.uitofp.f16.i32(i32, metadata, metadata) +declare half @llvm.experimental.constrained.sitofp.f16.i64(i64, metadata, metadata) +declare half @llvm.experimental.constrained.uitofp.f16.i64(i64, metadata, metadata) +declare half @llvm.experimental.constrained.sitofp.f16.i128(i128, metadata, metadata) +declare half @llvm.experimental.constrained.uitofp.f16.i128(i128, metadata, metadata) +declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata) +declare half @llvm.experimental.constrained.powi.f16(half, i32, metadata, metadata) +declare half @llvm.experimental.constrained.sin.f16(half, metadata, metadata) +declare half @llvm.experimental.constrained.cos.f16(half, metadata, metadata) +declare half @llvm.experimental.constrained.tan.f16(half, metadata, metadata) +declare half @llvm.experimental.constrained.pow.f16(half, half, metadata, metadata) +declare half @llvm.experimental.constrained.log.f16(half, metadata, metadata) +declare half @llvm.experimental.constrained.log10.f16(half, metadata, metadata) +declare half @llvm.experimental.constrained.log2.f16(half, metadata, metadata) +declare half @llvm.experimental.constrained.exp.f16(half, metadata, metadata) +declare half @llvm.experimental.constrained.exp2.f16(half, metadata, metadata) +declare half @llvm.experimental.constrained.rint.f16(half, metadata, metadata) +declare half @llvm.experimental.constrained.nearbyint.f16(half, metadata, metadata) +declare i32 @llvm.experimental.constrained.lrint.i32.f16(half, metadata, metadata) +declare i64 @llvm.experimental.constrained.llrint.i64.f16(half, metadata, metadata) +declare half @llvm.experimental.constrained.maxnum.f16(half, half, metadata) +declare half @llvm.experimental.constrained.minnum.f16(half, half, metadata) +declare half @llvm.experimental.constrained.ceil.f16(half, metadata) +declare half @llvm.experimental.constrained.floor.f16(half, metadata) +declare i32 @llvm.experimental.constrained.lround.i32.f16(half, 
metadata) +declare i64 @llvm.experimental.constrained.llround.i64.f16(half, metadata) +declare half @llvm.experimental.constrained.round.f16(half, metadata) +declare half @llvm.experimental.constrained.roundeven.f16(half, metadata) +declare half @llvm.experimental.constrained.trunc.f16(half, metadata) +declare i1 @llvm.experimental.constrained.fcmps.f16(half, half, metadata, metadata) +declare i1 @llvm.experimental.constrained.fcmp.f16(half, half, metadata, metadata) + +declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) + + declare half @llvm.sqrt.f16(half %a) declare half @llvm.powi.f16.i32(half %a, i32 %b) declare half @llvm.sin.f16(half %a) diff --git a/llvm/test/CodeGen/ARM/strict-fp-int-promote.ll b/llvm/test/CodeGen/ARM/strict-fp-int-promote.ll new file mode 100644 index 0000000000000..6e5b58974fc50 --- /dev/null +++ b/llvm/test/CodeGen/ARM/strict-fp-int-promote.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple armv7-- -mattr=+vfp4 -O0 -o - %s | FileCheck %s +; RUN: llc -mtriple armv7-- -mattr=+vfp4 -O3 -o - %s | FileCheck %s --check-prefix=CHECK-O3 + +declare float @llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata) +declare float @llvm.experimental.constrained.sitofp.f32.i16(i16, metadata, metadata) +declare i1 @llvm.experimental.constrained.fcmp.f32(float, float, metadata, metadata) +declare float @llvm.experimental.constrained.uitofp.f32.i16(i16, metadata, metadata) + +define i32 @test(i32 %a, i16 %b) #0 { +; CHECK-LABEL: test: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: sxth r0, r1 +; CHECK-NEXT: movw r1, #0 +; CHECK-NEXT: movt r1, #17200 +; CHECK-NEXT: str r1, [sp, #4] +; CHECK-NEXT: eor r2, r2, #-2147483648 +; CHECK-NEXT: str r2, [sp] +; CHECK-NEXT: vldr d16, [sp] +; 
CHECK-NEXT: vldr d17, .LCPI0_0 +; CHECK-NEXT: vsub.f64 d16, d16, d17 +; CHECK-NEXT: vcvt.f32.f64 s0, d16 +; CHECK-NEXT: str r1, [sp, #12] +; CHECK-NEXT: eor r0, r0, #-2147483648 +; CHECK-NEXT: str r0, [sp, #8] +; CHECK-NEXT: vldr d16, [sp, #8] +; CHECK-NEXT: vsub.f64 d16, d16, d17 +; CHECK-NEXT: vcvt.f32.f64 s2, d16 +; CHECK-NEXT: vcmp.f32 s0, s2 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: movweq r0, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 3 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI0_0: +; CHECK-NEXT: .long 2147483648 @ double 4503601774854144 +; CHECK-NEXT: .long 1127219200 +; +; CHECK-O3-LABEL: test: +; CHECK-O3: @ %bb.0: @ %entry +; CHECK-O3-NEXT: sub sp, sp, #16 +; CHECK-O3-NEXT: sxth r1, r1 +; CHECK-O3-NEXT: movw r2, #0 +; CHECK-O3-NEXT: movt r2, #17200 +; CHECK-O3-NEXT: str r2, [sp, #4] +; CHECK-O3-NEXT: eor r0, r0, #-2147483648 +; CHECK-O3-NEXT: str r0, [sp] +; CHECK-O3-NEXT: vldr d16, [sp] +; CHECK-O3-NEXT: vldr d17, .LCPI0_0 +; CHECK-O3-NEXT: vsub.f64 d16, d16, d17 +; CHECK-O3-NEXT: vcvt.f32.f64 s0, d16 +; CHECK-O3-NEXT: str r2, [sp, #12] +; CHECK-O3-NEXT: eor r0, r1, #-2147483648 +; CHECK-O3-NEXT: str r0, [sp, #8] +; CHECK-O3-NEXT: vldr d16, [sp, #8] +; CHECK-O3-NEXT: vsub.f64 d16, d16, d17 +; CHECK-O3-NEXT: vcvt.f32.f64 s2, d16 +; CHECK-O3-NEXT: vcmp.f32 s0, s2 +; CHECK-O3-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-O3-NEXT: mov r0, #0 +; CHECK-O3-NEXT: movweq r0, #1 +; CHECK-O3-NEXT: add sp, sp, #16 +; CHECK-O3-NEXT: bx lr +; CHECK-O3-NEXT: .p2align 3 +; CHECK-O3-NEXT: @ %bb.1: +; CHECK-O3-NEXT: .LCPI0_0: +; CHECK-O3-NEXT: .long 2147483648 @ double 4503601774854144 +; CHECK-O3-NEXT: .long 1127219200 +entry: + %conv = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + %conv1 = call float @llvm.experimental.constrained.sitofp.f32.i16(i16 %b, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + %cmp = 
call i1 @llvm.experimental.constrained.fcmp.f32(float %conv, float %conv1, metadata !"oeq", metadata !"fpexcept.strict") #1 + %conv2 = zext i1 %cmp to i32 + ret i32 %conv2 +} + +define i32 @test2(i32 %a, i16 %b) #0 { +; CHECK-LABEL: test2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: uxth r0, r1 +; CHECK-NEXT: movw r1, #0 +; CHECK-NEXT: movt r1, #17200 +; CHECK-NEXT: str r1, [sp, #4] +; CHECK-NEXT: eor r2, r2, #-2147483648 +; CHECK-NEXT: str r2, [sp] +; CHECK-NEXT: vldr d16, [sp] +; CHECK-NEXT: vldr d17, .LCPI1_0 +; CHECK-NEXT: vsub.f64 d16, d16, d17 +; CHECK-NEXT: vcvt.f32.f64 s0, d16 +; CHECK-NEXT: str r1, [sp, #12] +; CHECK-NEXT: str r0, [sp, #8] +; CHECK-NEXT: vldr d16, [sp, #8] +; CHECK-NEXT: vldr d17, .LCPI1_1 +; CHECK-NEXT: vsub.f64 d16, d16, d17 +; CHECK-NEXT: vcvt.f32.f64 s2, d16 +; CHECK-NEXT: vcmp.f32 s0, s2 +; CHECK-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: movweq r0, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 3 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI1_0: +; CHECK-NEXT: .long 2147483648 @ double 4503601774854144 +; CHECK-NEXT: .long 1127219200 +; CHECK-NEXT: .LCPI1_1: +; CHECK-NEXT: .long 0 @ double 4503599627370496 +; CHECK-NEXT: .long 1127219200 +; +; CHECK-O3-LABEL: test2: +; CHECK-O3: @ %bb.0: @ %entry +; CHECK-O3-NEXT: sub sp, sp, #16 +; CHECK-O3-NEXT: uxth r1, r1 +; CHECK-O3-NEXT: movw r2, #0 +; CHECK-O3-NEXT: movt r2, #17200 +; CHECK-O3-NEXT: str r2, [sp, #4] +; CHECK-O3-NEXT: eor r0, r0, #-2147483648 +; CHECK-O3-NEXT: str r0, [sp] +; CHECK-O3-NEXT: vldr d16, [sp] +; CHECK-O3-NEXT: vldr d17, .LCPI1_0 +; CHECK-O3-NEXT: vsub.f64 d16, d16, d17 +; CHECK-O3-NEXT: vcvt.f32.f64 s0, d16 +; CHECK-O3-NEXT: str r2, [sp, #12] +; CHECK-O3-NEXT: str r1, [sp, #8] +; CHECK-O3-NEXT: vldr d16, [sp, #8] +; CHECK-O3-NEXT: vldr d17, .LCPI1_1 +; CHECK-O3-NEXT: vsub.f64 d16, d16, d17 +; CHECK-O3-NEXT: vcvt.f32.f64 s2, d16 +; CHECK-O3-NEXT: vcmp.f32 
s0, s2 +; CHECK-O3-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-O3-NEXT: mov r0, #0 +; CHECK-O3-NEXT: movweq r0, #1 +; CHECK-O3-NEXT: add sp, sp, #16 +; CHECK-O3-NEXT: bx lr +; CHECK-O3-NEXT: .p2align 3 +; CHECK-O3-NEXT: @ %bb.1: +; CHECK-O3-NEXT: .LCPI1_0: +; CHECK-O3-NEXT: .long 2147483648 @ double 4503601774854144 +; CHECK-O3-NEXT: .long 1127219200 +; CHECK-O3-NEXT: .LCPI1_1: +; CHECK-O3-NEXT: .long 0 @ double 4503599627370496 +; CHECK-O3-NEXT: .long 1127219200 +entry: + %conv = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + %conv1 = call float @llvm.experimental.constrained.uitofp.f32.i16(i16 %b, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + %cmp = call i1 @llvm.experimental.constrained.fcmp.f32(float %conv, float %conv1, metadata !"oeq", metadata !"fpexcept.strict") #1 + %conv2 = zext i1 %cmp to i32 + ret i32 %conv2 +} + +attributes #0 = { strictfp noinline optnone } +attributes #1 = { strictfp } diff --git a/llvm/test/CodeGen/ARM/strict-fp-ops.ll b/llvm/test/CodeGen/ARM/strict-fp-ops.ll new file mode 100644 index 0000000000000..608ab0716e0df --- /dev/null +++ b/llvm/test/CodeGen/ARM/strict-fp-ops.ll @@ -0,0 +1,202 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple armv7-- -mattr=+vfp4 %s -o - | FileCheck %s + + +; Div whose result is unused should be removed unless we have strict exceptions + +define void @unused_div(float %x, float %y) { +; CHECK-LABEL: unused_div: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %add = fdiv float %x, %y + ret void +} + +define void @unused_div_fpexcept_strict(float %x, float %y) #0 { +; CHECK-LABEL: unused_div_fpexcept_strict: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vdiv.f32 s0, s2, s0 +; CHECK-NEXT: bx lr +entry: + %add = call float @llvm.experimental.constrained.fdiv.f32(float %x, float %y, 
metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret void +} + +define void @unused_div_round_dynamic(float %x, float %y) #0 { +; CHECK-LABEL: unused_div_round_dynamic: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %add = call float @llvm.experimental.constrained.fdiv.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + ret void +} + + +; Machine CSE should eliminate the second add unless we have strict exceptions + +define float @add_twice(float %x, float %y, i32 %n) { +; CHECK-LABEL: add_twice: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vadd.f32 s0, s2, s0 +; CHECK-NEXT: vmul.f32 s2, s0, s0 +; CHECK-NEXT: vmoveq.f32 s2, s0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: bx lr +entry: + %add = fadd float %x, %y + %tobool.not = icmp eq i32 %n, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %add1 = fadd float %x, %y + %mul = fmul float %add, %add1 + br label %if.end + +if.end: + %a.0 = phi float [ %mul, %if.then ], [ %add, %entry ] + ret float %a.0 +} + +define float @add_twice_fpexcept_strict(float %x, float %y, i32 %n) #0 { +; CHECK-LABEL: add_twice_fpexcept_strict: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s2, r1 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: vmov s4, r0 +; CHECK-NEXT: vadd.f32 s0, s4, s2 +; CHECK-NEXT: vaddne.f32 s2, s4, s2 +; CHECK-NEXT: vmulne.f32 s0, s0, s2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +entry: + %add = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + %tobool.not = icmp eq i32 %n, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %add1 = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + %mul = call float @llvm.experimental.constrained.fmul.f32(float %add, float %add1, metadata 
!"round.tonearest", metadata !"fpexcept.strict") #0 + br label %if.end + +if.end: + %a.0 = phi float [ %mul, %if.then ], [ %add, %entry ] + ret float %a.0 +} + +define float @add_twice_round_dynamic(float %x, float %y, i32 %n) #0 { +; CHECK-LABEL: add_twice_round_dynamic: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vadd.f32 s0, s2, s0 +; CHECK-NEXT: vmulne.f32 s0, s0, s0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +entry: + %add = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %tobool.not = icmp eq i32 %n, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %add1 = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %mul = call float @llvm.experimental.constrained.fmul.f32(float %add, float %add1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + br label %if.end + +if.end: + %a.0 = phi float [ %mul, %if.then ], [ %add, %entry ] + ret float %a.0 +} + +; Two adds separated by llvm.set.rounding should be preserved when rounding is +; dynamic (as they may give different results) or when we have strict exceptions +; (the llvm.set.rounding is irrelevant, but both could trap). 
+ +define float @set_rounding(float %x, float %y) { +; CHECK-LABEL: set_rounding: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmrs r2, fpscr +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: vadd.f32 s0, s2, s0 +; CHECK-NEXT: vsub.f32 s0, s0, s0 +; CHECK-NEXT: orr r0, r2, #12582912 +; CHECK-NEXT: vmsr fpscr, r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmrs r1, fpscr +; CHECK-NEXT: bic r1, r1, #12582912 +; CHECK-NEXT: vmsr fpscr, r1 +; CHECK-NEXT: bx lr +entry: + %add1 = fadd float %x, %y + call void @llvm.set.rounding(i32 0) + %add2 = fadd float %x, %y + call void @llvm.set.rounding(i32 1) + %sub = fsub float %add1, %add2 + ret float %sub +} + +define float @set_rounding_fpexcept_strict(float %x, float %y) #0 { +; CHECK-LABEL: set_rounding_fpexcept_strict: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vadd.f32 s4, s2, s0 +; CHECK-NEXT: vmrs r0, fpscr +; CHECK-NEXT: orr r0, r0, #12582912 +; CHECK-NEXT: vmsr fpscr, r0 +; CHECK-NEXT: vadd.f32 s0, s2, s0 +; CHECK-NEXT: vmrs r0, fpscr +; CHECK-NEXT: bic r0, r0, #12582912 +; CHECK-NEXT: vmsr fpscr, r0 +; CHECK-NEXT: vsub.f32 s0, s4, s0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +entry: + %add1 = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + call void @llvm.set.rounding(i32 0) #0 + %add2 = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + call void @llvm.set.rounding(i32 1) #0 + %sub = call float @llvm.experimental.constrained.fsub.f32(float %add1, float %add2, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %sub +} + +define float @set_rounding_round_dynamic(float %x, float %y) #0 { +; CHECK-LABEL: set_rounding_round_dynamic: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vmrs r0, fpscr +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: 
vadd.f32 s4, s2, s0 +; CHECK-NEXT: orr r0, r0, #12582912 +; CHECK-NEXT: vmsr fpscr, r0 +; CHECK-NEXT: vmrs r0, fpscr +; CHECK-NEXT: vadd.f32 s0, s2, s0 +; CHECK-NEXT: bic r0, r0, #12582912 +; CHECK-NEXT: vmsr fpscr, r0 +; CHECK-NEXT: vsub.f32 s0, s4, s0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +entry: + %add1 = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + call void @llvm.set.rounding(i32 0) #0 + %add2 = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + call void @llvm.set.rounding(i32 1) #0 + %sub = call float @llvm.experimental.constrained.fsub.f32(float %add1, float %add2, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + ret float %sub +} + +declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) +declare float @llvm.experimental.constrained.fsub.f32(float, float, metadata, metadata) +declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata) +declare float @llvm.experimental.constrained.fdiv.f32(float, float, metadata, metadata) +declare i32 @llvm.get.rounding() +declare void @llvm.set.rounding(i32) + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/ARM/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/ARM/strictfp_f16_abi_promote.ll new file mode 100644 index 0000000000000..5906c796d2751 --- /dev/null +++ b/llvm/test/CodeGen/ARM/strictfp_f16_abi_promote.ll @@ -0,0 +1,270 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=armv7-- < %s | FileCheck -check-prefix=NOFP16 %s + +declare void @f16_user(half) +declare half @f16_result() + +declare void @v2f16_user(<2 x half>) +declare <2 x half> @v2f16_result() + +declare void @v4f16_user(<4 x half>) +declare <4 x half> @v4f16_result() + +declare void @v8f16_user(<8 x half>) +declare <8 x 
half> @v8f16_result() + +define void @f16_arg(half %arg, ptr %ptr) #0 { +; NOFP16-LABEL: f16_arg: +; NOFP16: @ %bb.0: +; NOFP16-NEXT: push {r4, lr} +; NOFP16-NEXT: uxth r0, r0 +; NOFP16-NEXT: mov r4, r1 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: str r0, [r4] +; NOFP16-NEXT: pop {r4, pc} + %fpext = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict") + store float %fpext, ptr %ptr + ret void +} + +define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 { +; NOFP16-LABEL: v2f16_arg: +; NOFP16: @ %bb.0: +; NOFP16-NEXT: push {r4, r5, r11, lr} +; NOFP16-NEXT: vpush {d8} +; NOFP16-NEXT: mov r5, r0 +; NOFP16-NEXT: uxth r0, r1 +; NOFP16-NEXT: mov r4, r2 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: uxth r1, r5 +; NOFP16-NEXT: vmov s17, r0 +; NOFP16-NEXT: mov r0, r1 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: vmov s16, r0 +; NOFP16-NEXT: vstr d8, [r4] +; NOFP16-NEXT: vpop {d8} +; NOFP16-NEXT: pop {r4, r5, r11, pc} + %fpext = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict") + store <2 x float> %fpext, ptr %ptr + ret void +} + +define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 { +; NOFP16-LABEL: v3f16_arg: +; NOFP16: @ %bb.0: +; NOFP16-NEXT: push {r4, r5, r6, lr} +; NOFP16-NEXT: vpush {d8} +; NOFP16-NEXT: mov r6, r0 +; NOFP16-NEXT: uxth r0, r1 +; NOFP16-NEXT: mov r4, r3 +; NOFP16-NEXT: mov r5, r2 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: uxth r1, r6 +; NOFP16-NEXT: vmov s17, r0 +; NOFP16-NEXT: mov r0, r1 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: vmov s16, r0 +; NOFP16-NEXT: uxth r0, r5 +; NOFP16-NEXT: vst1.32 {d8}, [r4:64]! 
+; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: str r0, [r4] +; NOFP16-NEXT: vpop {d8} +; NOFP16-NEXT: pop {r4, r5, r6, pc} + %fpext = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict") + store <3 x float> %fpext, ptr %ptr + ret void +} + +define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { +; NOFP16-LABEL: v4f16_arg: +; NOFP16: @ %bb.0: +; NOFP16-NEXT: push {r4, r5, r6, r7, r11, lr} +; NOFP16-NEXT: vpush {d8, d9} +; NOFP16-NEXT: mov r6, r0 +; NOFP16-NEXT: uxth r0, r1 +; NOFP16-NEXT: mov r4, r3 +; NOFP16-NEXT: mov r5, r2 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov r7, r0 +; NOFP16-NEXT: uxth r0, r4 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: vmov s19, r0 +; NOFP16-NEXT: uxth r0, r5 +; NOFP16-NEXT: ldr r4, [sp, #40] +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: vmov s18, r0 +; NOFP16-NEXT: uxth r0, r6 +; NOFP16-NEXT: vmov s17, r7 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: vmov s16, r0 +; NOFP16-NEXT: vst1.64 {d8, d9}, [r4] +; NOFP16-NEXT: vpop {d8, d9} +; NOFP16-NEXT: pop {r4, r5, r6, r7, r11, pc} + %fpext = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %arg, metadata !"fpexcept.strict") + store <4 x float> %fpext, ptr %ptr + ret void +} + + define half @f16_return(float %arg) #0 { +; NOFP16-LABEL: f16_return: +; NOFP16: @ %bb.0: +; NOFP16-NEXT: push {r11, lr} +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: pop {r11, pc} + %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret half %fptrunc + } + + define <2 x half> @v2f16_return(<2 x float> %arg) #0 { +; NOFP16-LABEL: v2f16_return: +; NOFP16: @ %bb.0: +; NOFP16-NEXT: push {r11, lr} +; NOFP16-NEXT: vpush {d8} +; NOFP16-NEXT: sub sp, sp, #8 +; NOFP16-NEXT: vmov d8, r0, r1 +; NOFP16-NEXT: vmov r0, s17 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: vmov r1, s16 +; NOFP16-NEXT: strh r0, [sp, #6] +; 
NOFP16-NEXT: mov r0, r1 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh r0, [sp, #4] +; NOFP16-NEXT: add r0, sp, #4 +; NOFP16-NEXT: vld1.32 {d16[0]}, [r0:32] +; NOFP16-NEXT: vmovl.u16 q8, d16 +; NOFP16-NEXT: vmov.32 r0, d16[0] +; NOFP16-NEXT: vmov.32 r1, d16[1] +; NOFP16-NEXT: add sp, sp, #8 +; NOFP16-NEXT: vpop {d8} +; NOFP16-NEXT: pop {r11, pc} + %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret <2 x half> %fptrunc + } + + define <3 x half> @v3f16_return(<3 x float> %arg) #0 { +; NOFP16-LABEL: v3f16_return: +; NOFP16: @ %bb.0: +; NOFP16-NEXT: push {r4, r5, r6, lr} +; NOFP16-NEXT: vmov d1, r2, r3 +; NOFP16-NEXT: mov r5, r0 +; NOFP16-NEXT: vmov d0, r0, r1 +; NOFP16-NEXT: mov r4, r1 +; NOFP16-NEXT: vmov r0, s2 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: uxth r6, r0 +; NOFP16-NEXT: mov r0, r4 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov r4, r0 +; NOFP16-NEXT: mov r0, r5 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: pkhbt r0, r0, r4, lsl #16 +; NOFP16-NEXT: vmov d16, r0, r6 +; NOFP16-NEXT: vmov.u16 r0, d16[0] +; NOFP16-NEXT: vmov.u16 r1, d16[1] +; NOFP16-NEXT: vmov.u16 r2, d16[2] +; NOFP16-NEXT: pop {r4, r5, r6, pc} + %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret <3 x half> %fptrunc + } + + define <4 x half> @v4f16_return(<4 x float> %arg) #0 { +; NOFP16-LABEL: v4f16_return: +; NOFP16: @ %bb.0: +; NOFP16-NEXT: push {r4, r5, r11, lr} +; NOFP16-NEXT: vpush {d8, d9} +; NOFP16-NEXT: vmov d8, r2, r3 +; NOFP16-NEXT: vmov d9, r0, r1 +; NOFP16-NEXT: vmov r2, s17 +; NOFP16-NEXT: mov r0, r2 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov r4, r0 +; NOFP16-NEXT: vmov r0, s16 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: vmov r1, s19 +; NOFP16-NEXT: pkhbt r5, r0, r4, lsl #16 +; NOFP16-NEXT: mov r0, r1 +; NOFP16-NEXT: bl 
__gnu_f2h_ieee +; NOFP16-NEXT: mov r4, r0 +; NOFP16-NEXT: vmov r0, s18 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: pkhbt r0, r0, r4, lsl #16 +; NOFP16-NEXT: vmov d16, r0, r5 +; NOFP16-NEXT: vmov.u16 r0, d16[0] +; NOFP16-NEXT: vmov.u16 r1, d16[1] +; NOFP16-NEXT: vmov.u16 r2, d16[2] +; NOFP16-NEXT: vmov.u16 r3, d16[3] +; NOFP16-NEXT: vpop {d8, d9} +; NOFP16-NEXT: pop {r4, r5, r11, pc} + %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret <4 x half> %fptrunc + } + +define void @outgoing_v4f16_return(ptr %ptr) #0 { +; NOFP16-LABEL: outgoing_v4f16_return: +; NOFP16: @ %bb.0: +; NOFP16-NEXT: push {r4, lr} +; NOFP16-NEXT: mov r4, r0 +; NOFP16-NEXT: bl v4f16_result +; NOFP16-NEXT: strh r3, [r4, #6] +; NOFP16-NEXT: strh r2, [r4, #4] +; NOFP16-NEXT: strh r1, [r4, #2] +; NOFP16-NEXT: strh r0, [r4] +; NOFP16-NEXT: pop {r4, pc} + %val = call <4 x half> @v4f16_result() #0 + store <4 x half> %val, ptr %ptr + ret void +} + +define void @outgoing_v8f16_return(ptr %ptr) #0 { +; NOFP16-LABEL: outgoing_v8f16_return: +; NOFP16: @ %bb.0: +; NOFP16-NEXT: push {r4, r10, r11, lr} +; NOFP16-NEXT: add r11, sp, #8 +; NOFP16-NEXT: sub sp, sp, #16 +; NOFP16-NEXT: bfc sp, #0, #4 +; NOFP16-NEXT: mov r4, r0 +; NOFP16-NEXT: mov r0, sp +; NOFP16-NEXT: bl v8f16_result +; NOFP16-NEXT: ldm sp, {r0, r1, r2, r3} +; NOFP16-NEXT: stm r4, {r0, r1, r2, r3} +; NOFP16-NEXT: sub sp, r11, #8 +; NOFP16-NEXT: pop {r4, r10, r11, pc} + %val = call <8 x half> @v8f16_result() #0 + store <8 x half> %val, ptr %ptr + ret void +} + +define half @call_split_type_used_outside_block_v8f16() #0 { +; NOFP16-LABEL: call_split_type_used_outside_block_v8f16: +; NOFP16: @ %bb.0: @ %bb0 +; NOFP16-NEXT: push {r4, r10, r11, lr} +; NOFP16-NEXT: add r11, sp, #8 +; NOFP16-NEXT: sub sp, sp, #16 +; NOFP16-NEXT: bfc sp, #0, #4 +; NOFP16-NEXT: mov r4, sp +; NOFP16-NEXT: mov r0, r4 +; NOFP16-NEXT: bl v8f16_result +; NOFP16-NEXT: 
vld1.32 {d16[0]}, [r4:32] +; NOFP16-NEXT: vmov.u16 r0, d16[0] +; NOFP16-NEXT: sub sp, r11, #8 +; NOFP16-NEXT: pop {r4, r10, r11, pc} +bb0: + %split.ret.type = call <8 x half> @v8f16_result() #0 + br label %bb1 + +bb1: + %extract = extractelement <8 x half> %split.ret.type, i32 0 + ret half %extract +} + +declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0 +declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0 +declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>, metadata) #0 +declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) #0 + +declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0 +declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0 +declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float>, metadata, metadata) #0 +declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float>, metadata, metadata) #0 + +attributes #0 = { strictfp } From 6397aad852e4db8e284bfa5b4f5c4a37c3826f67 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Wed, 29 Oct 2025 21:50:12 +0000 Subject: [PATCH 148/539] [HashRecognize] Forbid optz when data.next has exit-block user (#165574) The CRC optimization relies on stripping the auxiliary data completely, and should hence be forbidden when it has a user in the exit-block. Forbid this case, fixing a miscompile. Fixes #165382. 
--- llvm/lib/Analysis/HashRecognize.cpp | 5 +- .../HashRecognize/cyclic-redundancy-check.ll | 81 +++++++++++++++++++ .../LoopIdiom/cyclic-redundancy-check.ll | 46 +++++++++++ 3 files changed, 131 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/HashRecognize.cpp b/llvm/lib/Analysis/HashRecognize.cpp index 4529123508a7c..8974ce5734b13 100644 --- a/llvm/lib/Analysis/HashRecognize.cpp +++ b/llvm/lib/Analysis/HashRecognize.cpp @@ -468,8 +468,11 @@ std::variant HashRecognize::recognizeCRC() const { // Ensure that the PHIs have exactly two uses: // the bit-shift, and the XOR (or a cast feeding into the XOR). + // Also ensure that the SimpleRecurrence's evolution doesn't have stray + // users. if (!ConditionalRecurrence.Phi->hasNUses(2) || - !SimpleRecurrence.Phi->hasNUses(2)) + !SimpleRecurrence.Phi->hasNUses(2) || + SimpleRecurrence.BO->getUniqueUndroppableUser() != SimpleRecurrence.Phi) return "Recurrences have stray uses"; // Check that the SelectInst ConditionalRecurrence.Step is conditional on diff --git a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll index 7dec2f8f96906..78b4139d21982 100644 --- a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll +++ b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll @@ -1448,4 +1448,85 @@ exit: ; preds = %loop ret i16 %crc.next } +define i16 @not.crc.data.next.outside.user(i16 %crc.init, i16 %data.init) { +; CHECK-LABEL: 'not.crc.data.next.outside.user' +; CHECK-NEXT: Did not find a hash algorithm +; CHECK-NEXT: Reason: Recurrences have stray uses +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %crc = phi i16 [ %crc.init, %entry ], [ %crc.next, %loop ] + %data = phi i16 [ %data.init, %entry ], [ %data.next, %loop ] + %xor.crc.data = xor i16 %data, %crc + %crc.shl = shl i16 %crc, 1 + %crc.xor = xor i16 %crc.shl, 3 + %check.sb = icmp slt i16 %xor.crc.data, 0 + %crc.next = select i1 
%check.sb, i16 %crc.xor, i16 %crc.shl + %data.next = shl i16 %data, 1 + %iv.next = add nuw nsw i32 %iv, 1 + %exit.cond = icmp samesign ult i32 %iv, 7 + br i1 %exit.cond, label %loop, label %exit + +exit: + %ret = xor i16 %data.next, %crc.next + ret i16 %ret +} + +define i16 @not.crc.data.phi.outside.user(i16 %crc.init, i16 %data.init) { +; CHECK-LABEL: 'not.crc.data.phi.outside.user' +; CHECK-NEXT: Did not find a hash algorithm +; CHECK-NEXT: Reason: Recurrences have stray uses +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %crc = phi i16 [ %crc.init, %entry ], [ %crc.next, %loop ] + %data = phi i16 [ %data.init, %entry ], [ %data.next, %loop ] + %xor.crc.data = xor i16 %data, %crc + %crc.shl = shl i16 %crc, 1 + %crc.xor = xor i16 %crc.shl, 3 + %check.sb = icmp slt i16 %xor.crc.data, 0 + %crc.next = select i1 %check.sb, i16 %crc.xor, i16 %crc.shl + %data.next = shl i16 %data, 1 + %iv.next = add nuw nsw i32 %iv, 1 + %exit.cond = icmp samesign ult i32 %iv, 7 + br i1 %exit.cond, label %loop, label %exit + +exit: + %ret = xor i16 %data, %crc.next + ret i16 %ret +} + +define i16 @not.crc.crc.phi.outside.user(i16 %crc.init, i16 %data.init) { +; CHECK-LABEL: 'not.crc.crc.phi.outside.user' +; CHECK-NEXT: Did not find a hash algorithm +; CHECK-NEXT: Reason: Recurrences have stray uses +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %crc = phi i16 [ %crc.init, %entry ], [ %crc.next, %loop ] + %data = phi i16 [ %data.init, %entry ], [ %data.next, %loop ] + %xor.crc.data = xor i16 %data, %crc + %crc.shl = shl i16 %crc, 1 + %crc.xor = xor i16 %crc.shl, 3 + %check.sb = icmp slt i16 %xor.crc.data, 0 + %crc.next = select i1 %check.sb, i16 %crc.xor, i16 %crc.shl + %data.next = shl i16 %data, 1 + %iv.next = add nuw nsw i32 %iv, 1 + %exit.cond = icmp samesign ult i32 %iv, 7 + br i1 %exit.cond, label %loop, label %exit + +exit: + %ret = xor i16 %crc, %crc.next + ret i16 %ret +} + declare i16 
@side.effect() diff --git a/llvm/test/Transforms/LoopIdiom/cyclic-redundancy-check.ll b/llvm/test/Transforms/LoopIdiom/cyclic-redundancy-check.ll index b2ec53ca405d4..90995a0257721 100644 --- a/llvm/test/Transforms/LoopIdiom/cyclic-redundancy-check.ll +++ b/llvm/test/Transforms/LoopIdiom/cyclic-redundancy-check.ll @@ -537,6 +537,52 @@ exit: ; preds = %loop %ret = and i32 %unrelated.next, %crc.next ret i32 %ret } + +define i16 @not.crc.data.next.outside.user(i16 %crc.init, i16 %data.init) { +; CHECK-LABEL: define i16 @not.crc.data.next.outside.user( +; CHECK-SAME: i16 [[CRC_INIT:%.*]], i16 [[DATA_INIT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[TBL_LD:%.*]] = phi i16 [ [[CRC_INIT]], %[[ENTRY]] ], [ [[CRC_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[CRC_BE_SHIFT:%.*]] = phi i16 [ [[DATA_INIT]], %[[ENTRY]] ], [ [[DATA_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[CRC_NEXT3:%.*]] = xor i16 [[CRC_BE_SHIFT]], [[TBL_LD]] +; CHECK-NEXT: [[CRC_SHL:%.*]] = shl i16 [[TBL_LD]], 1 +; CHECK-NEXT: [[CRC_XOR:%.*]] = xor i16 [[CRC_SHL]], 3 +; CHECK-NEXT: [[CHECK_SB:%.*]] = icmp slt i16 [[CRC_NEXT3]], 0 +; CHECK-NEXT: [[CRC_NEXT]] = select i1 [[CHECK_SB]], i16 [[CRC_XOR]], i16 [[CRC_SHL]] +; CHECK-NEXT: [[DATA_NEXT]] = shl i16 [[CRC_BE_SHIFT]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp samesign ult i32 [[IV]], 7 +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[CRC_NEXT_LCSSA:%.*]] = phi i16 [ [[CRC_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA_NEXT_LCSSA:%.*]] = phi i16 [ [[DATA_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[RET:%.*]] = xor i16 [[DATA_NEXT_LCSSA]], [[CRC_NEXT_LCSSA]] +; CHECK-NEXT: ret i16 [[RET]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %crc = phi i16 [ %crc.init, %entry ], [ 
%crc.next, %loop ] + %data = phi i16 [ %data.init, %entry ], [ %data.next, %loop ] + %xor.crc.data = xor i16 %data, %crc + %crc.shl = shl i16 %crc, 1 + %crc.xor = xor i16 %crc.shl, 3 + %check.sb = icmp slt i16 %xor.crc.data, 0 + %crc.next = select i1 %check.sb, i16 %crc.xor, i16 %crc.shl + %data.next = shl i16 %data, 1 + %iv.next = add nuw nsw i32 %iv, 1 + %exit.cond = icmp samesign ult i32 %iv, 7 + br i1 %exit.cond, label %loop, label %exit + +exit: + %ret = xor i16 %data.next, %crc.next + ret i16 %ret +} ;. ; CHECK: attributes #[[ATTR0]] = { optsize } ;. From e8e2668fa40a442af553e84684490e029cc1a581 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 29 Oct 2025 11:52:06 -1000 Subject: [PATCH 149/539] [flang][rt] Add install target for header files (#165610) --- flang-rt/CMakeLists.txt | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/flang-rt/CMakeLists.txt b/flang-rt/CMakeLists.txt index cad39d0c71016..50b8e834776fb 100644 --- a/flang-rt/CMakeLists.txt +++ b/flang-rt/CMakeLists.txt @@ -330,3 +330,19 @@ if (FLANG_RT_INCLUDE_TESTS) else () add_custom_target(check-flang-rt) endif() + +################### +# Install headers # +################### + +if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY) + add_llvm_install_targets(install-flang-rt-headers COMPONENT flang-rt-headers) + + install(DIRECTORY include/flang-rt/runtime + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/flang-rt" + COMPONENT flang-rt-headers + FILES_MATCHING + PATTERN "*.h" + PATTERN ".git" EXCLUDE + PATTERN "CMakeFiles" EXCLUDE) +endif() From 8cf7ec974f1d50f88c764fe4a6767b067f1d4ebb Mon Sep 17 00:00:00 2001 From: lb90 Date: Wed, 29 Oct 2025 23:28:49 +0100 Subject: [PATCH 150/539] [LLDB][Windows]: Don't pass duplicate HANDLEs to CreateProcess (#165281) CreateProcess fails with ERROR_INVALID_PARAMETER when duplicate HANDLEs are passed via 
`PROC_THREAD_ATTRIBUTE_HANDLE_LIST`. This can happen, for example, if stdout and stdin are the same device (e.g. a bidirectional named pipe), or if stdout and stderr are the same device. Fixes https://github.com/msys2/MINGW-packages/issues/26030 --- .../Host/windows/ProcessLauncherWindows.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/lldb/source/Host/windows/ProcessLauncherWindows.cpp b/lldb/source/Host/windows/ProcessLauncherWindows.cpp index f5adadaf061bf..e1b4b7e48c5a6 100644 --- a/lldb/source/Host/windows/ProcessLauncherWindows.cpp +++ b/lldb/source/Host/windows/ProcessLauncherWindows.cpp @@ -16,6 +16,7 @@ #include "llvm/Support/Program.h" #include +#include #include using namespace lldb; @@ -91,13 +92,13 @@ ProcessLauncherWindows::LaunchProcess(const ProcessLaunchInfo &launch_info, startupinfo.hStdOutput = stdout_handle ? stdout_handle : ::GetStdHandle(STD_OUTPUT_HANDLE); - std::vector inherited_handles; + std::unordered_set inherited_handles; if (startupinfo.hStdError) - inherited_handles.push_back(startupinfo.hStdError); + inherited_handles.insert(startupinfo.hStdError); if (startupinfo.hStdInput) - inherited_handles.push_back(startupinfo.hStdInput); + inherited_handles.insert(startupinfo.hStdInput); if (startupinfo.hStdOutput) - inherited_handles.push_back(startupinfo.hStdOutput); + inherited_handles.insert(startupinfo.hStdOutput); SIZE_T attributelist_size = 0; InitializeProcThreadAttributeList(/*lpAttributeList=*/nullptr, @@ -120,13 +121,15 @@ ProcessLauncherWindows::LaunchProcess(const ProcessLaunchInfo &launch_info, const FileAction *act = launch_info.GetFileActionAtIndex(i); if (act->GetAction() == FileAction::eFileActionDuplicate && act->GetFD() == act->GetActionArgument()) - inherited_handles.push_back(reinterpret_cast(act->GetFD())); + inherited_handles.insert(reinterpret_cast(act->GetFD())); } if (!inherited_handles.empty()) { + std::vector handles(inherited_handles.begin(), + inherited_handles.end()); if 
(!UpdateProcThreadAttribute( startupinfoex.lpAttributeList, /*dwFlags=*/0, - PROC_THREAD_ATTRIBUTE_HANDLE_LIST, inherited_handles.data(), - inherited_handles.size() * sizeof(HANDLE), + PROC_THREAD_ATTRIBUTE_HANDLE_LIST, handles.data(), + handles.size() * sizeof(HANDLE), /*lpPreviousValue=*/nullptr, /*lpReturnSize=*/nullptr)) { error = Status(::GetLastError(), eErrorTypeWin32); return HostProcess(); From 1f07875d4c2ea7226ea55b6a28e58dc022d4581e Mon Sep 17 00:00:00 2001 From: Razvan Lupusoru Date: Wed, 29 Oct 2025 16:09:33 -0700 Subject: [PATCH 151/539] [acc] Expand OpenACCSupport to provide getRecipeName and emitNYI (#165628) Extends OpenACCSupport utilities to include recipe name generation and error reporting for unsupported features, providing foundation for variable privatization handling. Changes: - Add RecipeKind enum (private, firstprivate, reduction) for APIs that request a specific kind of recipe - Add getRecipeName() API to OpenACCSupport and OpenACCUtils that generates recipe names from types (e.g., "privatization_memref_5x10xf32_") - Add emitNYI() API to OpenACCSupport for graceful handling of not-yet-implemented cases - Generalize MemRefPointerLikeModel template to support UnrankedMemRefType - Add unit tests and integration tests for new APIs --- .../Dialect/OpenACC/Analysis/OpenACCSupport.h | 36 ++++++++ .../mlir/Dialect/OpenACC/OpenACCOps.td | 20 +++++ .../mlir/Dialect/OpenACC/OpenACCUtils.h | 4 + .../OpenACC/Analysis/OpenACCSupport.cpp | 19 +++++ mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 13 +-- .../Dialect/OpenACC/Utils/OpenACCUtils.cpp | 39 +++++++++ .../OpenACC/support-analysis-recipename.mlir | 78 +++++++++++++++++ .../OpenACC/support-analysis-unsupported.mlir | 18 ++++ .../Dialect/OpenACC/TestOpenACCSupport.cpp | 22 +++++ .../Dialect/OpenACC/OpenACCUtilsTest.cpp | 85 +++++++++++++++++++ 10 files changed, 329 insertions(+), 5 deletions(-) create mode 100644 mlir/test/Dialect/OpenACC/support-analysis-recipename.mlir create mode 100644 
mlir/test/Dialect/OpenACC/support-analysis-unsupported.mlir diff --git a/mlir/include/mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h b/mlir/include/mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h index 0833462ea0509..d9b2646b753f3 100644 --- a/mlir/include/mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h +++ b/mlir/include/mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h @@ -58,6 +58,9 @@ namespace mlir { namespace acc { +// Forward declaration for RecipeKind enum +enum class RecipeKind : uint32_t; + namespace detail { /// This class contains internal trait classes used by OpenACCSupport. /// It follows the Concept-Model pattern used throughout MLIR (e.g., in @@ -69,6 +72,13 @@ struct OpenACCSupportTraits { /// Get the variable name for a given MLIR value. virtual std::string getVariableName(Value v) = 0; + + /// Get the recipe name for a given kind, type and value. + virtual std::string getRecipeName(RecipeKind kind, Type type, + Value var) = 0; + + // Used to report a case that is not supported by the implementation. + virtual InFlightDiagnostic emitNYI(Location loc, const Twine &message) = 0; }; /// This class wraps a concrete OpenACCSupport implementation and forwards @@ -84,6 +94,14 @@ struct OpenACCSupportTraits { return impl.getVariableName(v); } + std::string getRecipeName(RecipeKind kind, Type type, Value var) final { + return impl.getRecipeName(kind, type, var); + } + + InFlightDiagnostic emitNYI(Location loc, const Twine &message) final { + return impl.emitNYI(loc, message); + } + private: ImplT impl; }; @@ -118,6 +136,24 @@ class OpenACCSupport { /// \return The variable name, or an empty string if unavailable. std::string getVariableName(Value v); + /// Get the recipe name for a given type and value. + /// + /// \param kind The kind of recipe to get the name for. + /// \param type The type to get the recipe name for. Can be null if the + /// var is provided instead. + /// \param var The MLIR value to get the recipe name for. 
Can be null if + /// the type is provided instead. + /// \return The recipe name, or an empty string if not available. + std::string getRecipeName(RecipeKind kind, Type type, Value var); + + /// Report a case that is not yet supported by the implementation. + /// + /// \param loc The location to report the unsupported case at. + /// \param message The message to report. + /// \return An in-flight diagnostic object that can be used to report the + /// unsupported case. + InFlightDiagnostic emitNYI(Location loc, const Twine &message); + /// Signal that this analysis should always be preserved so that /// underlying implementation registration is not lost. bool isInvalidated(const AnalysisManager::PreservedAnalyses &pa) { diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index a18c18af8a753..2f4517ddfe754 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -152,6 +152,26 @@ def OpenACC_LoopParMode : I32EnumAttr< let genSpecializedAttr = 0; } +def OpenACC_PrivateRecipe : I32EnumAttrCase<"private_recipe", 0>; +def OpenACC_FirstprivateRecipe : I32EnumAttrCase<"firstprivate_recipe", 1>; +def OpenACC_ReductionRecipe : I32EnumAttrCase<"reduction_recipe", 2>; + +def OpenACC_RecipeKind : I32EnumAttr< + "RecipeKind", + "Encodes the options for kinds of recipes availabie in acc dialect", + [ + OpenACC_PrivateRecipe, OpenACC_FirstprivateRecipe, + OpenACC_ReductionRecipe]> { + let cppNamespace = "::mlir::acc"; + let genSpecializedAttr = 0; +} + +def OpenACC_RecipeKindAttr : EnumAttr { + let assemblyFormat = [{ ```<` $value `>` }]; +} + // Type used in operation below. 
def IntOrIndex : AnyTypeOf<[AnyInteger, Index]>; diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h index 0ee88c6f47b67..563c1e0099fc0 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h @@ -43,6 +43,10 @@ mlir::acc::VariableTypeCategory getTypeCategory(mlir::Value var); /// empty string if no name is found. std::string getVariableName(mlir::Value v); +/// Get the recipe name for a given recipe kind and type. +/// Returns an empty string if not possible to generate a recipe name. +std::string getRecipeName(mlir::acc::RecipeKind kind, mlir::Type type); + } // namespace acc } // namespace mlir diff --git a/mlir/lib/Dialect/OpenACC/Analysis/OpenACCSupport.cpp b/mlir/lib/Dialect/OpenACC/Analysis/OpenACCSupport.cpp index f6b4534794eaf..40e769e7068cf 100644 --- a/mlir/lib/Dialect/OpenACC/Analysis/OpenACCSupport.cpp +++ b/mlir/lib/Dialect/OpenACC/Analysis/OpenACCSupport.cpp @@ -22,5 +22,24 @@ std::string OpenACCSupport::getVariableName(Value v) { return acc::getVariableName(v); } +std::string OpenACCSupport::getRecipeName(RecipeKind kind, Type type, + Value var) { + if (impl) + return impl->getRecipeName(kind, type, var); + // The default implementation assumes that only type matters + // and the actual instance of variable is not relevant. + auto recipeName = acc::getRecipeName(kind, type); + if (recipeName.empty()) + emitNYI(var ? 
var.getLoc() : UnknownLoc::get(type.getContext()), + "variable privatization (incomplete recipe name handling)"); + return recipeName; +} + +InFlightDiagnostic OpenACCSupport::emitNYI(Location loc, const Twine &message) { + if (impl) + return impl->emitNYI(loc, message); + return mlir::emitError(loc, "not yet implemented: " + message); +} + } // namespace acc } // namespace mlir diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index ca46629919dba..35eba724a9059 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -50,11 +50,11 @@ static void attachVarNameAttr(Operation *op, OpBuilder &builder, } } +template struct MemRefPointerLikeModel - : public PointerLikeType::ExternalModel { + : public PointerLikeType::ExternalModel, T> { Type getElementType(Type pointer) const { - return cast(pointer).getElementType(); + return cast(pointer).getElementType(); } mlir::acc::VariableTypeCategory @@ -63,7 +63,7 @@ struct MemRefPointerLikeModel if (auto mappableTy = dyn_cast(varType)) { return mappableTy.getTypeCategory(varPtr); } - auto memrefTy = cast(pointer); + auto memrefTy = cast(pointer); if (!memrefTy.hasRank()) { // This memref is unranked - aka it could have any rank, including a // rank of 0 which could mean scalar. For now, return uncategorized. @@ -296,7 +296,10 @@ void OpenACCDialect::initialize() { // By attaching interfaces here, we make the OpenACC dialect dependent on // the other dialects. This is probably better than having dialects like LLVM // and memref be dependent on OpenACC. 
- MemRefType::attachInterface(*getContext()); + MemRefType::attachInterface>( + *getContext()); + UnrankedMemRefType::attachInterface< + MemRefPointerLikeModel>(*getContext()); LLVM::LLVMPointerType::attachInterface( *getContext()); } diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp index 89adda82646e6..660c3138af0ec 100644 --- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp +++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp @@ -11,6 +11,7 @@ #include "mlir/Dialect/OpenACC/OpenACC.h" #include "mlir/Interfaces/ViewLikeInterface.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/Casting.h" mlir::Operation *mlir::acc::getEnclosingComputeOp(mlir::Region ®ion) { mlir::Operation *parentOp = region.getParentOp(); @@ -106,3 +107,41 @@ std::string mlir::acc::getVariableName(mlir::Value v) { return ""; } + +std::string mlir::acc::getRecipeName(mlir::acc::RecipeKind kind, + mlir::Type type) { + assert(kind == mlir::acc::RecipeKind::private_recipe || + kind == mlir::acc::RecipeKind::firstprivate_recipe || + kind == mlir::acc::RecipeKind::reduction_recipe); + if (!llvm::isa(type)) + return ""; + + std::string recipeName; + llvm::raw_string_ostream ss(recipeName); + ss << (kind == mlir::acc::RecipeKind::private_recipe ? "privatization_" + : kind == mlir::acc::RecipeKind::firstprivate_recipe + ? "firstprivatization_" + : "reduction_"); + + // Print the type using its dialect-defined textual format. + type.print(ss); + ss.flush(); + + // Replace invalid characters (anything that's not a letter, number, or + // period) since this needs to be a valid MLIR identifier. + for (char &c : recipeName) { + if (!std::isalnum(static_cast(c)) && c != '.' 
&& c != '_') { + if (c == '?') + c = 'U'; + else if (c == '*') + c = 'Z'; + else if (c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || + c == '}' || c == '<' || c == '>') + c = '_'; + else + c = 'X'; + } + } + + return recipeName; +} diff --git a/mlir/test/Dialect/OpenACC/support-analysis-recipename.mlir b/mlir/test/Dialect/OpenACC/support-analysis-recipename.mlir new file mode 100644 index 0000000000000..8ea53b5d0f4d4 --- /dev/null +++ b/mlir/test/Dialect/OpenACC/support-analysis-recipename.mlir @@ -0,0 +1,78 @@ +// RUN: mlir-opt %s -split-input-file -test-acc-support | FileCheck %s + +// Test private recipe with 2D memref +func.func @test_private_2d_memref() { + // Create a 2D memref + %0 = memref.alloca() {test.recipe_name = #acc.recipe_kind} : memref<5x10xf32> + + // CHECK: op=%{{.*}} = memref.alloca() {test.recipe_name = #acc.recipe_kind} : memref<5x10xf32> + // CHECK-NEXT: getRecipeName(kind=private_recipe, type=memref<5x10xf32>)="privatization_memref_5x10xf32_" + + return +} + +// ----- + +// Test firstprivate recipe with 2D memref +func.func @test_firstprivate_2d_memref() { + // Create a 2D memref + %0 = memref.alloca() {test.recipe_name = #acc.recipe_kind} : memref<8x16xf64> + + // CHECK: op=%{{.*}} = memref.alloca() {test.recipe_name = #acc.recipe_kind} : memref<8x16xf64> + // CHECK-NEXT: getRecipeName(kind=firstprivate_recipe, type=memref<8x16xf64>)="firstprivatization_memref_8x16xf64_" + + return +} + +// ----- + +// Test reduction recipe with 2D memref +func.func @test_reduction_2d_memref() { + // Create a 2D memref + %0 = memref.alloca() {test.recipe_name = #acc.recipe_kind} : memref<4x8xi32> + + // CHECK: op=%{{.*}} = memref.alloca() {test.recipe_name = #acc.recipe_kind} : memref<4x8xi32> + // CHECK-NEXT: getRecipeName(kind=reduction_recipe, type=memref<4x8xi32>)="reduction_memref_4x8xi32_" + + return +} + +// ----- + +// Test private recipe with dynamic memref +func.func @test_private_dynamic_memref(%arg0: memref<5x10xi32>) { + // Cast to 
dynamic dimensions + %0 = memref.cast %arg0 {test.recipe_name = #acc.recipe_kind} : memref<5x10xi32> to memref + + // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.recipe_name = #acc.recipe_kind} : memref<5x10xi32> to memref + // CHECK-NEXT: getRecipeName(kind=private_recipe, type=memref)="privatization_memref_Ux10xi32_" + + return +} + +// ----- + +// Test private recipe with scalar memref +func.func @test_private_scalar_memref() { + // Create a scalar memref (no dimensions) + %0 = memref.alloca() {test.recipe_name = #acc.recipe_kind} : memref + + // CHECK: op=%{{.*}} = memref.alloca() {test.recipe_name = #acc.recipe_kind} : memref + // CHECK-NEXT: getRecipeName(kind=private_recipe, type=memref)="privatization_memref_i32_" + + return +} + +// ----- + +// Test private recipe with unranked memref +func.func @test_private_unranked_memref(%arg0: memref<10xi32>) { + // Cast to unranked memref + %0 = memref.cast %arg0 {test.recipe_name = #acc.recipe_kind} : memref<10xi32> to memref<*xi32> + + // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.recipe_name = #acc.recipe_kind} : memref<10xi32> to memref<*xi32> + // CHECK-NEXT: getRecipeName(kind=private_recipe, type=memref<*xi32>)="privatization_memref_Zxi32_" + + return +} + diff --git a/mlir/test/Dialect/OpenACC/support-analysis-unsupported.mlir b/mlir/test/Dialect/OpenACC/support-analysis-unsupported.mlir new file mode 100644 index 0000000000000..c4d5b81a1380a --- /dev/null +++ b/mlir/test/Dialect/OpenACC/support-analysis-unsupported.mlir @@ -0,0 +1,18 @@ +// RUN: mlir-opt %s --split-input-file -test-acc-support -verify-diagnostics + +// Test emitNYI with a simple message +func.func @test_emit_nyi() { + // expected-error @below {{not yet implemented: Unsupported feature in OpenACC}} + %0 = memref.alloca() {test.emit_nyi = "Unsupported feature in OpenACC"} : memref<10xi32> + return +} + +// ----- + +// Test recipe name on load operation from scalar memref +func.func @test_recipe_load_scalar() { + %0 = memref.alloca() : 
memref + // expected-error @below {{not yet implemented: variable privatization (incomplete recipe name handling)}} + %1 = memref.load %0[] {test.recipe_name = #acc.recipe_kind} : memref + return +} diff --git a/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp b/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp index 8bf984bdc2632..7c8b08489c62e 100644 --- a/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp +++ b/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp @@ -57,6 +57,28 @@ void TestOpenACCSupportPass::runOnOperation() { << "\"\n"; } } + + // Check for test.recipe_name attribute. This is the marker used to identify + // the operations that need to be tested for getRecipeName. + if (auto recipeAttr = + op->getAttrOfType("test.recipe_name")) { + RecipeKind kind = recipeAttr.getValue(); + // Get the type from the first result if available + if (op->getNumResults() > 0) { + Type type = op->getResult(0).getType(); + std::string recipeName = + support.getRecipeName(kind, type, op->getResult(0)); + llvm::outs() << "op=" << *op + << "\n\tgetRecipeName(kind=" << stringifyRecipeKind(kind) + << ", type=" << type << ")=\"" << recipeName << "\"\n"; + } + } + + // Check for test.emit_nyi attribute. This is the marker used to + // test whether the not yet implemented case is reported correctly. 
+ if (auto messageAttr = op->getAttrOfType("test.emit_nyi")) { + support.emitNYI(op->getLoc(), messageAttr.getValue()); + } }); } diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp index 3fbbcc90a67c9..f1fe53c15a6f5 100644 --- a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp +++ b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp @@ -485,3 +485,88 @@ TEST_F(OpenACCUtilsTest, getVariableNameFromCopyin) { std::string varName = getVariableName(copyinOp->getAccVar()); EXPECT_EQ(varName, name); } + +//===----------------------------------------------------------------------===// +// getRecipeName Tests +//===----------------------------------------------------------------------===// + +TEST_F(OpenACCUtilsTest, getRecipeNamePrivateScalarMemref) { + // Create a scalar memref type + auto scalarMemrefTy = MemRefType::get({}, b.getI32Type()); + + // Test private recipe with scalar memref + std::string recipeName = + getRecipeName(RecipeKind::private_recipe, scalarMemrefTy); + EXPECT_EQ(recipeName, "privatization_memref_i32_"); +} + +TEST_F(OpenACCUtilsTest, getRecipeNameFirstprivateScalarMemref) { + // Create a scalar memref type + auto scalarMemrefTy = MemRefType::get({}, b.getF32Type()); + + // Test firstprivate recipe with scalar memref + std::string recipeName = + getRecipeName(RecipeKind::firstprivate_recipe, scalarMemrefTy); + EXPECT_EQ(recipeName, "firstprivatization_memref_f32_"); +} + +TEST_F(OpenACCUtilsTest, getRecipeNameReductionScalarMemref) { + // Create a scalar memref type + auto scalarMemrefTy = MemRefType::get({}, b.getI64Type()); + + // Test reduction recipe with scalar memref + std::string recipeName = + getRecipeName(RecipeKind::reduction_recipe, scalarMemrefTy); + EXPECT_EQ(recipeName, "reduction_memref_i64_"); +} + +TEST_F(OpenACCUtilsTest, getRecipeNamePrivate2DMemref) { + // Create a 2D memref type + auto memref2DTy = MemRefType::get({5, 10}, b.getF32Type()); + + // Test 
private recipe with 2D memref + std::string recipeName = + getRecipeName(RecipeKind::private_recipe, memref2DTy); + EXPECT_EQ(recipeName, "privatization_memref_5x10xf32_"); +} + +TEST_F(OpenACCUtilsTest, getRecipeNameFirstprivate2DMemref) { + // Create a 2D memref type + auto memref2DTy = MemRefType::get({8, 16}, b.getF64Type()); + + // Test firstprivate recipe with 2D memref + std::string recipeName = + getRecipeName(RecipeKind::firstprivate_recipe, memref2DTy); + EXPECT_EQ(recipeName, "firstprivatization_memref_8x16xf64_"); +} + +TEST_F(OpenACCUtilsTest, getRecipeNameReduction2DMemref) { + // Create a 2D memref type + auto memref2DTy = MemRefType::get({4, 8}, b.getI32Type()); + + // Test reduction recipe with 2D memref + std::string recipeName = + getRecipeName(RecipeKind::reduction_recipe, memref2DTy); + EXPECT_EQ(recipeName, "reduction_memref_4x8xi32_"); +} + +TEST_F(OpenACCUtilsTest, getRecipeNamePrivateDynamicMemref) { + // Create a memref with dynamic dimensions + auto dynamicMemrefTy = + MemRefType::get({ShapedType::kDynamic, 10}, b.getI32Type()); + + // Test private recipe with dynamic memref + std::string recipeName = + getRecipeName(RecipeKind::private_recipe, dynamicMemrefTy); + EXPECT_EQ(recipeName, "privatization_memref_Ux10xi32_"); +} + +TEST_F(OpenACCUtilsTest, getRecipeNamePrivateUnrankedMemref) { + // Create an unranked memref type + auto unrankedMemrefTy = UnrankedMemRefType::get(b.getI32Type(), 0); + + // Test private recipe with unranked memref + std::string recipeName = + getRecipeName(RecipeKind::private_recipe, unrankedMemrefTy); + EXPECT_EQ(recipeName, "privatization_memref_Zxi32_"); +} From 6ae23a985bbd912f0c7ab4139d7a856bd2ceade3 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 29 Oct 2025 16:33:25 -0700 Subject: [PATCH 152/539] [MC] Remove a duplicate #include (NFC) (#165507) Identified with readability-duplicate-include. 
--- llvm/lib/MC/MCParser/AsmLexer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp index a6188f0676937..1af4a297babaa 100644 --- a/llvm/lib/MC/MCParser/AsmLexer.cpp +++ b/llvm/lib/MC/MCParser/AsmLexer.cpp @@ -16,7 +16,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCParser/AsmLexer.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/SaveAndRestore.h" From 637f3cef5faa249c24a9295995134e3bb48c6bcf Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 29 Oct 2025 16:33:33 -0700 Subject: [PATCH 153/539] [WebAssembly] Remove a redundant cast (NFC) (#165508) Local is already of type unsigned. --- llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp index 27f7e1ada1250..5a1779c2c80fb 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp @@ -81,7 +81,7 @@ WebAssemblyFrameLowering::getLocalForStackObject(MachineFunction &MF, // Abuse object size to record number of WebAssembly locals allocated to // this object. 
MFI.setObjectSize(FrameIndex, ValueVTs.size()); - return static_cast(Local); + return Local; } /// We need a base pointer in the case of having items on the stack that From 7163c3c0c0c8354542ff15e98322ae5e12ea5611 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 29 Oct 2025 16:33:41 -0700 Subject: [PATCH 154/539] [llvm] Proofread HowToSubmitABug.rst (#165511) --- llvm/docs/HowToSubmitABug.rst | 46 +++++++++++++++++------------------ 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/llvm/docs/HowToSubmitABug.rst b/llvm/docs/HowToSubmitABug.rst index 002087cc55e0a..d62391b5da745 100644 --- a/llvm/docs/HowToSubmitABug.rst +++ b/llvm/docs/HowToSubmitABug.rst @@ -6,26 +6,26 @@ Introduction - Got bugs? ======================== -If you're working with LLVM and run into a bug, we definitely want to know +If you're working with LLVM and encounter a bug, we definitely want to know about it. This document describes what you can do to increase the odds of getting it fixed quickly. 🔒 If you believe that the bug is security related, please follow :ref:`report-security-issue`. 🔒 -Basically you have to do two things at a minimum. First, decide whether the +Basically, you have to do two things at a minimum. First, decide whether the bug `crashes the compiler`_ or if the compiler is `miscompiling`_ the program (i.e., the compiler successfully produces an executable, but it doesn't run right). Based on what type of bug it is, follow the instructions in the linked section to narrow down the bug so that the person who fixes it will be able to find the problem more easily. -Once you have a reduced test-case, go to `the LLVM Bug Tracking System +Once you have a reduced test case, go to `the LLVM Bug Tracking System `_ and fill out the form with the necessary details (note that you don't need to pick a label, just use if you're not sure). The bug description should contain the following information: * All information necessary to reproduce the problem. 
-* The reduced test-case that triggers the bug. +* The reduced test case that triggers the bug. * The location where you obtained LLVM (if not from our Git repository). @@ -39,10 +39,10 @@ Crashing Bugs More often than not, bugs in the compiler cause it to crash---often due to an assertion failure of some sort. The most important piece of the puzzle is to figure out if it is crashing in the Clang front-end or if it is one of -the LLVM libraries (e.g. the optimizer or code generator) that has +the LLVM libraries (e.g., the optimizer or code generator) that has problems. -To figure out which component is crashing (the front-end, middle-end +To identify the crashing component (the front-end, middle-end optimizer, or backend code generator), run the ``clang`` command line as you were when the crash occurred, but with the following extra command line options: @@ -53,7 +53,7 @@ options: `. * ``-emit-llvm``: If ``clang`` crashes with this option (which disables - the code generator), you found a middle-end optimizer bug. Jump ahead to + the code generator), you've found a middle-end optimizer bug. Jump ahead to :ref:`middle-end bugs `. * Otherwise, you have a backend code generator crash. Jump ahead to :ref:`code @@ -102,19 +102,19 @@ functions. Then run: If this doesn't crash, please follow the instructions for a :ref:`front-end bug `. -If this does crash, then you should be able to debug this with the following +If this does crash, then you can debug this with the following :doc:`bugpoint ` command: .. code-block:: bash bugpoint foo.bc -O3 -Run this, then file a bug with the instructions and reduced .bc +Run this, then file a bug with the instructions and reduced ``.bc`` files that bugpoint emits. If bugpoint doesn't reproduce the crash, :doc:`llvm-reduce ` is an alternative way to reduce -LLVM IR. Create a script that repros the crash and run: +LLVM IR. Create a script that reproduces the crash and run: .. 
code-block:: bash @@ -137,16 +137,16 @@ Backend code generator bugs --------------------------- If you find a bug that crashes clang in the code generator, compile your -source file to a .bc file by passing "``-emit-llvm -c -o foo.bc``" to -clang (in addition to the options you already pass). Once your have -foo.bc, one of the following commands should fail: +source file to a ``.bc`` file by passing "``-emit-llvm -c -o foo.bc``" to +clang (in addition to the options you already pass). Once you have +``foo.bc``, one of the following commands should fail: #. ``llc foo.bc`` #. ``llc foo.bc -relocation-model=pic`` #. ``llc foo.bc -relocation-model=static`` If none of these crash, please follow the instructions for a :ref:`front-end -bug`. If one of these do crash, you should be able to reduce +bug`. If one of these crashes, you should be able to reduce this with one of the following :doc:`bugpoint ` command lines (use the one corresponding to the command above that failed): @@ -154,9 +154,9 @@ the one corresponding to the command above that failed): #. ``bugpoint -run-llc foo.bc --tool-args -relocation-model=pic`` #. ``bugpoint -run-llc foo.bc --tool-args -relocation-model=static`` -Please run this, then file a bug with the instructions and reduced .bc file +Please run this, then file a bug with the instructions and reduced ``.bc`` file that bugpoint emits. If something goes wrong with bugpoint, please submit -the "foo.bc" file and the option that llc crashes with. +the ``foo.bc`` file and the option that llc crashes with. LTO bugs --------------------------- @@ -174,7 +174,7 @@ in addition to your existing compilation options: These options enable LTO and save temporary files generated during compilation for later analysis. -On Windows, you should be using lld-link as the linker. Adjust your compilation +On Windows, use lld-link as the linker. Adjust your compilation flags as follows: * Add ``/lldsavetemps`` to the linker flags. 
* When linking from the compiler driver, add ``/link /lldsavetemps`` in order to forward that flag to the linker. @@ -199,7 +199,7 @@ command line (use the bc file corresponding to the command above that failed): llvm-reduce --test reduce.sh a.out.0.2.internalize.bc -Example of reduce.sh script +Example of ``reduce.sh`` script .. code-block:: bash @@ -209,9 +209,9 @@ Example of reduce.sh script path/to/not --crash path/to/opt "-passes=lto" $1 -o temp.bc 2> err.log grep -q "It->second == &Insn" err.log -Here we have grepped the failed assert message. +Here we have grepped for the failed assert message. -Please run this, then file a bug with the instructions and reduced .bc file +Please run this, then file a bug with the instructions and reduced ``.bc`` file that llvm-reduce emits. .. _miscompiling: @@ -221,16 +221,16 @@ Miscompilations If clang successfully produces an executable, but that executable doesn't run right, this is either a bug in the code or a bug in the compiler. The first -thing to check is to make sure it is not using undefined behavior (e.g. +thing to check is to make sure it is not using undefined behavior (e.g., reading a variable before it is defined). In particular, check to see if the program is clean under various `sanitizers -`_ (e.g. ``clang +`_ (e.g., ``clang -fsanitize=undefined,address``) and `valgrind `_. Many "LLVM bugs" that we have chased down ended up being bugs in the program being compiled, not LLVM. Once you determine that the program itself is not buggy, you should choose -which code generator you wish to compile the program with (e.g. LLC or the JIT) +which code generator you wish to compile the program with (e.g., LLC or the JIT) and optionally a series of LLVM passes to run. For example: .. 
code-block:: bash From a7911207528222daa9c89202f85d2e2544b74983 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Wed, 29 Oct 2025 20:55:15 -0400 Subject: [PATCH 155/539] [clang-shlib] Fix linking libclang-cpp on Haiku (#156401) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Haiku requires linking in libnetwork. Co-authored-by: Jérôme Duval --- clang/tools/clang-shlib/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/tools/clang-shlib/CMakeLists.txt b/clang/tools/clang-shlib/CMakeLists.txt index 945076e1ad810..a4d0aa5779a7e 100644 --- a/clang/tools/clang-shlib/CMakeLists.txt +++ b/clang/tools/clang-shlib/CMakeLists.txt @@ -41,6 +41,10 @@ if (CLANG_LINK_CLANG_DYLIB) set(INSTALL_WITH_TOOLCHAIN INSTALL_WITH_TOOLCHAIN) endif() +if (HAIKU) + list(APPEND _DEPS network) +endif() + add_clang_library(clang-cpp SHARED ${INSTALL_WITH_TOOLCHAIN} From 9aded609a97808e0aed18cc09738d8192d2c272d Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Wed, 29 Oct 2025 18:12:40 -0700 Subject: [PATCH 156/539] [mlir][CF] Add structural type conversion patterns (#165629) Add structural type conversion patterns for CF dialect ops. These patterns are similar to the SCF structural type conversion patterns. This commit adds missing functionality and is in preparation of #165180, which changes the way blocks are converted. (Only entry blocks are converted.) 
--- .../Transforms/StructuralTypeConversions.h | 48 +++++ .../ControlFlow/Transforms/CMakeLists.txt | 1 + .../Transforms/StructuralTypeConversions.cpp | 169 ++++++++++++++++++ .../test-legalize-type-conversion.mlir | 22 +++ mlir/test/lib/Dialect/Test/CMakeLists.txt | 1 + mlir/test/lib/Dialect/Test/TestPatterns.cpp | 7 + 6 files changed, 248 insertions(+) create mode 100644 mlir/include/mlir/Dialect/ControlFlow/Transforms/StructuralTypeConversions.h create mode 100644 mlir/lib/Dialect/ControlFlow/Transforms/StructuralTypeConversions.cpp diff --git a/mlir/include/mlir/Dialect/ControlFlow/Transforms/StructuralTypeConversions.h b/mlir/include/mlir/Dialect/ControlFlow/Transforms/StructuralTypeConversions.h new file mode 100644 index 0000000000000..a32d9e2025c76 --- /dev/null +++ b/mlir/include/mlir/Dialect/ControlFlow/Transforms/StructuralTypeConversions.h @@ -0,0 +1,48 @@ +//===- StructuralTypeConversions.h - CF Type Conversions --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_CONTROL_FLOW_TRANSFORMS_STRUCTURAL_TYPE_CONVERSIONS_H +#define MLIR_DIALECT_CONTROL_FLOW_TRANSFORMS_STRUCTURAL_TYPE_CONVERSIONS_H + +#include "mlir/IR/PatternMatch.h" + +namespace mlir { + +class ConversionTarget; +class TypeConverter; + +namespace cf { + +/// Populates patterns for CF structural type conversions and sets up the +/// provided ConversionTarget with the appropriate legality configuration for +/// the ops to get converted properly. +/// +/// A "structural" type conversion is one where the underlying ops are +/// completely agnostic to the actual types involved and simply need to update +/// their types. 
An example of this is cf.br -- the cf.br op needs to update +/// its types accordingly to the TypeConverter, but otherwise does not care +/// what type conversions are happening. +void populateCFStructuralTypeConversionsAndLegality( + const TypeConverter &typeConverter, RewritePatternSet &patterns, + ConversionTarget &target, PatternBenefit benefit = 1); + +/// Similar to `populateCFStructuralTypeConversionsAndLegality` but does not +/// populate the conversion target. +void populateCFStructuralTypeConversions(const TypeConverter &typeConverter, + RewritePatternSet &patterns, + PatternBenefit benefit = 1); + +/// Updates the ConversionTarget with dynamic legality of CF operations based +/// on the provided type converter. +void populateCFStructuralTypeConversionTarget( + const TypeConverter &typeConverter, ConversionTarget &target); + +} // namespace cf +} // namespace mlir + +#endif // MLIR_DIALECT_CONTROL_FLOW_TRANSFORMS_STRUCTURAL_TYPE_CONVERSIONS_H diff --git a/mlir/lib/Dialect/ControlFlow/Transforms/CMakeLists.txt b/mlir/lib/Dialect/ControlFlow/Transforms/CMakeLists.txt index 47740d31844f4..e9da135ed46f9 100644 --- a/mlir/lib/Dialect/ControlFlow/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/ControlFlow/Transforms/CMakeLists.txt @@ -1,6 +1,7 @@ add_mlir_dialect_library(MLIRControlFlowTransforms BufferDeallocationOpInterfaceImpl.cpp BufferizableOpInterfaceImpl.cpp + StructuralTypeConversions.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/ControlFlow/Transforms diff --git a/mlir/lib/Dialect/ControlFlow/Transforms/StructuralTypeConversions.cpp b/mlir/lib/Dialect/ControlFlow/Transforms/StructuralTypeConversions.cpp new file mode 100644 index 0000000000000..5e2a742c2d64c --- /dev/null +++ b/mlir/lib/Dialect/ControlFlow/Transforms/StructuralTypeConversions.cpp @@ -0,0 +1,169 @@ +//===- TypeConversion.cpp - Type Conversion of Unstructured Control Flow --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass to convert MLIR standard and builtin dialects +// into the LLVM IR dialect. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/ControlFlow/Transforms/StructuralTypeConversions.h" + +#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" + +using namespace mlir; + +namespace { + +/// Helper function for converting branch ops. This function converts the +/// signature of the given block. If the new block signature is different from +/// `expectedTypes`, returns "failure". +static FailureOr getConvertedBlock(ConversionPatternRewriter &rewriter, + const TypeConverter *converter, + Operation *branchOp, Block *block, + TypeRange expectedTypes) { + assert(converter && "expected non-null type converter"); + assert(!block->isEntryBlock() && "entry blocks have no predecessors"); + + // There is nothing to do if the types already match. + if (block->getArgumentTypes() == expectedTypes) + return block; + + // Compute the new block argument types and convert the block. + std::optional conversion = + converter->convertBlockSignature(block); + if (!conversion) + return rewriter.notifyMatchFailure(branchOp, + "could not compute block signature"); + if (expectedTypes != conversion->getConvertedTypes()) + return rewriter.notifyMatchFailure( + branchOp, + "mismatch between adaptor operand types and computed block signature"); + return rewriter.applySignatureConversion(block, *conversion, converter); +} + +/// Flatten the given value ranges into a single vector of values. 
+static SmallVector flattenValues(ArrayRef values) { + SmallVector result; + for (const ValueRange &vals : values) + llvm::append_range(result, vals); + return result; +} + +/// Convert the destination block signature (if necessary) and change the +/// operands of the branch op. +struct BranchOpConversion : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(cf::BranchOp op, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + SmallVector flattenedAdaptor = flattenValues(adaptor.getOperands()); + FailureOr convertedBlock = + getConvertedBlock(rewriter, getTypeConverter(), op, op.getSuccessor(), + TypeRange(ValueRange(flattenedAdaptor))); + if (failed(convertedBlock)) + return failure(); + rewriter.replaceOpWithNewOp(op, flattenedAdaptor, + *convertedBlock); + return success(); + } +}; + +/// Convert the destination block signatures (if necessary) and change the +/// operands of the branch op. +struct CondBranchOpConversion : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(cf::CondBranchOp op, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + SmallVector flattenedAdaptorTrue = + flattenValues(adaptor.getTrueDestOperands()); + SmallVector flattenedAdaptorFalse = + flattenValues(adaptor.getFalseDestOperands()); + if (!llvm::hasSingleElement(adaptor.getCondition())) + return rewriter.notifyMatchFailure(op, + "expected single element condition"); + FailureOr convertedTrueBlock = + getConvertedBlock(rewriter, getTypeConverter(), op, op.getTrueDest(), + TypeRange(ValueRange(flattenedAdaptorTrue))); + if (failed(convertedTrueBlock)) + return failure(); + FailureOr convertedFalseBlock = + getConvertedBlock(rewriter, getTypeConverter(), op, op.getFalseDest(), + TypeRange(ValueRange(flattenedAdaptorFalse))); + if (failed(convertedFalseBlock)) + return failure(); + 
rewriter.replaceOpWithNewOp( + op, llvm::getSingleElement(adaptor.getCondition()), + flattenedAdaptorTrue, flattenedAdaptorFalse, op.getBranchWeightsAttr(), + *convertedTrueBlock, *convertedFalseBlock); + return success(); + } +}; + +/// Convert the destination block signatures (if necessary) and change the +/// operands of the switch op. +struct SwitchOpConversion : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(cf::SwitchOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // Get or convert default block. + FailureOr convertedDefaultBlock = getConvertedBlock( + rewriter, getTypeConverter(), op, op.getDefaultDestination(), + TypeRange(adaptor.getDefaultOperands())); + if (failed(convertedDefaultBlock)) + return failure(); + + // Get or convert all case blocks. + SmallVector caseDestinations; + SmallVector caseOperands = adaptor.getCaseOperands(); + for (auto it : llvm::enumerate(op.getCaseDestinations())) { + Block *b = it.value(); + FailureOr convertedBlock = + getConvertedBlock(rewriter, getTypeConverter(), op, b, + TypeRange(caseOperands[it.index()])); + if (failed(convertedBlock)) + return failure(); + caseDestinations.push_back(*convertedBlock); + } + + rewriter.replaceOpWithNewOp( + op, adaptor.getFlag(), *convertedDefaultBlock, + adaptor.getDefaultOperands(), adaptor.getCaseValuesAttr(), + caseDestinations, caseOperands); + return success(); + } +}; + +} // namespace + +void mlir::cf::populateCFStructuralTypeConversions( + const TypeConverter &typeConverter, RewritePatternSet &patterns, + PatternBenefit benefit) { + patterns.add( + typeConverter, patterns.getContext(), benefit); +} + +void mlir::cf::populateCFStructuralTypeConversionTarget( + const TypeConverter &typeConverter, ConversionTarget &target) { + target.addDynamicallyLegalOp( + [&](Operation *op) { return typeConverter.isLegal(op->getOperands()); }); +} + +void 
mlir::cf::populateCFStructuralTypeConversionsAndLegality( + const TypeConverter &typeConverter, RewritePatternSet &patterns, + ConversionTarget &target, PatternBenefit benefit) { + populateCFStructuralTypeConversions(typeConverter, patterns, benefit); + populateCFStructuralTypeConversionTarget(typeConverter, target); +} diff --git a/mlir/test/Transforms/test-legalize-type-conversion.mlir b/mlir/test/Transforms/test-legalize-type-conversion.mlir index c003f8b2cb1cd..91f83a0afaeef 100644 --- a/mlir/test/Transforms/test-legalize-type-conversion.mlir +++ b/mlir/test/Transforms/test-legalize-type-conversion.mlir @@ -143,3 +143,25 @@ func.func @test_signature_conversion_no_converter() { return } +// ----- + +// CHECK-LABEL: func @test_unstructured_cf_conversion( +// CHECK-SAME: %[[arg0:.*]]: f64, %[[c:.*]]: i1) +// CHECK: %[[cast1:.*]] = "test.cast"(%[[arg0]]) : (f64) -> f32 +// CHECK: "test.foo"(%[[cast1]]) +// CHECK: cf.br ^[[bb1:.*]](%[[arg0]] : f64) +// CHECK: ^[[bb1]](%[[arg1:.*]]: f64): +// CHECK: cf.cond_br %[[c]], ^[[bb1]](%[[arg1]] : f64), ^[[bb2:.*]](%[[arg1]] : f64) +// CHECK: ^[[bb2]](%[[arg2:.*]]: f64): +// CHECK: %[[cast2:.*]] = "test.cast"(%[[arg2]]) : (f64) -> f32 +// CHECK: "test.bar"(%[[cast2]]) +// CHECK: return +func.func @test_unstructured_cf_conversion(%arg0: f32, %c: i1) { + "test.foo"(%arg0) : (f32) -> () + cf.br ^bb1(%arg0: f32) +^bb1(%arg1: f32): + cf.cond_br %c, ^bb1(%arg1 : f32), ^bb2(%arg1 : f32) +^bb2(%arg2: f32): + "test.bar"(%arg2) : (f32) -> () + return +} diff --git a/mlir/test/lib/Dialect/Test/CMakeLists.txt b/mlir/test/lib/Dialect/Test/CMakeLists.txt index f099d01abd31a..9354a85d984c9 100644 --- a/mlir/test/lib/Dialect/Test/CMakeLists.txt +++ b/mlir/test/lib/Dialect/Test/CMakeLists.txt @@ -71,6 +71,7 @@ add_mlir_library(MLIRTestDialect ) mlir_target_link_libraries(MLIRTestDialect PUBLIC MLIRControlFlowInterfaces + MLIRControlFlowTransforms MLIRDataLayoutInterfaces MLIRDerivedAttributeOpInterface MLIRDestinationStyleOpInterface diff 
--git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index efbdbfb65d65b..fd2b943ff1296 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -11,6 +11,7 @@ #include "TestTypes.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/CommonFolders.h" +#include "mlir/Dialect/ControlFlow/Transforms/StructuralTypeConversions.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Func/Transforms/FuncConversions.h" #include "mlir/Dialect/SCF/Transforms/Patterns.h" @@ -2042,6 +2043,10 @@ struct TestTypeConversionDriver }); converter.addConversion([](IndexType type) { return type; }); converter.addConversion([](IntegerType type, SmallVectorImpl &types) { + if (type.isInteger(1)) { + // i1 is legal. + types.push_back(type); + } if (type.isInteger(38)) { // i38 is legal. types.push_back(type); @@ -2175,6 +2180,8 @@ struct TestTypeConversionDriver converter); mlir::scf::populateSCFStructuralTypeConversionsAndLegality( converter, patterns, target); + mlir::cf::populateCFStructuralTypeConversionsAndLegality(converter, + patterns, target); ConversionConfig config; config.allowPatternRollback = allowPatternRollback; From 8be0fcfe490592edf68fe09ef85daa0d92ec4378 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Wed, 29 Oct 2025 23:44:30 -0500 Subject: [PATCH 157/539] [bazel][mlir] Port #165629: ControlFlowTransforms deps (#165646) --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 ++ utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel | 1 + 2 files changed, 3 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index e8561cc39e007..7156bea81d6b5 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -4681,6 +4681,8 @@ cc_library( ":ControlFlowDialect", ":IR", ":MemRefDialect", + ":Pass", + 
":TransformUtils", ], ) diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index 778f0be86025a..aa61da4667720 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -386,6 +386,7 @@ cc_library( "//mlir:CallOpInterfaces", "//mlir:CommonFolders", "//mlir:ControlFlowInterfaces", + "//mlir:ControlFlowTransforms", "//mlir:DLTIDialect", "//mlir:DataLayoutInterfaces", "//mlir:DerivedAttributeOpInterface", From 2015d58c91fe8db406073dcb46578e0d5212a938 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 30 Oct 2025 05:00:17 +0000 Subject: [PATCH 158/539] [LV] Only skip scalarization overhead for members used as address. Refine logic to scalarize interleave group member: only skip scalarization overhead for member being used as addresses. For others, use the regular scalar memory op cost. This currently doesn't trigger in practice as far as I could find, but fixes a potential divergence between VPlan- and legacy cost models. It fixes a concrete divergence with a follow-up patch, https://github.com/llvm/llvm-project/pull/161276. --- .../Transforms/Vectorize/LoopVectorize.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f7968abbe5b6b..8ebc108080271 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5750,13 +5750,18 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { getMemoryInstructionCost(I, ElementCount::getFixed(1)))); UpdateMemOpUserCost(cast(I)); } else if (const auto *Group = getInterleavedAccessGroup(I)) { - // Scalarize an interleave group of address loads. 
- for (unsigned I = 0; I < Group->getFactor(); ++I) { - if (Instruction *Member = Group->getMember(I)) { - setWideningDecision( - Member, VF, CM_Scalarize, - (VF.getKnownMinValue() * - getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); + // Scalarize all members of this interleaved group when any member + // is used as an address. The address-used load skips scalarization + // overhead, other members include it. + for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) { + if (Instruction *Member = Group->getMember(Idx)) { + InstructionCost Cost = + AddrDefs.contains(Member) + ? (VF.getKnownMinValue() * + getMemoryInstructionCost(Member, + ElementCount::getFixed(1))) + : getMemInstScalarizationCost(Member, VF); + setWideningDecision(Member, VF, CM_Scalarize, Cost); UpdateMemOpUserCost(cast(Member)); } } From d421669c3366692bbd73872d7a4c551582cb6237 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 07:32:24 -0700 Subject: [PATCH 159/539] [MLIR] Apply clang-tidy fixes for llvm-qualified-auto in Vectorization.cpp (NFC) --- mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 9d62491214018..0f317eac8fa41 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -3911,21 +3911,21 @@ struct Conv1DGenerator Value lhs = vector::TransferReadOp::create( rewriter, loc, lhsType, lhsShaped, ValueRange{zero, zero, zero}, /*padding=*/arith::getZeroConstant(rewriter, loc, lhsEltType)); - auto maybeMaskedLhs = maybeMaskXferOp( + auto *maybeMaskedLhs = maybeMaskXferOp( lhsType.getShape(), lhsType.getScalableDims(), lhs.getDefiningOp()); // Read rhs slice of size {kw, c} @ [0, 0]. 
Value rhs = vector::TransferReadOp::create( rewriter, loc, rhsType, rhsShaped, ValueRange{zero, zero}, /*padding=*/arith::getZeroConstant(rewriter, loc, rhsEltType)); - auto maybeMaskedRhs = maybeMaskXferOp( + auto *maybeMaskedRhs = maybeMaskXferOp( rhsType.getShape(), rhsType.getScalableDims(), rhs.getDefiningOp()); // Read res slice of size {n, w, c} @ [0, 0, 0]. Value res = vector::TransferReadOp::create( rewriter, loc, resType, resShaped, ValueRange{zero, zero, zero}, /*padding=*/arith::getZeroConstant(rewriter, loc, resEltType)); - auto maybeMaskedRes = maybeMaskXferOp( + auto *maybeMaskedRes = maybeMaskXferOp( resType.getShape(), resType.getScalableDims(), res.getDefiningOp()); //===------------------------------------------------------------------===// From ed31a277e82bd8502fc2b55934d710ee17464437 Mon Sep 17 00:00:00 2001 From: Pankaj Dwivedi Date: Thu, 30 Oct 2025 12:32:32 +0530 Subject: [PATCH 160/539] [AMDGPU] Enable "amdgpu-uniform-intrinsic-combine" pass in pipeline. (#162819) This PR enables AMDGPUUniformIntrinsicCombine pass in the llc pipeline. Also introduces the "amdgpu-uniform-intrinsic-combine" command-line flag to enable/disable the pass. 
see the PR:https://github.com/llvm/llvm-project/pull/116953 --- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 + .../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 2 +- .../GlobalISel/llvm.amdgcn.ballot.i32.ll | 63 +++--- .../GlobalISel/llvm.amdgcn.ballot.i64.ll | 67 +++--- .../amdgpu-miscellaneous-uniform-intrinsic.ll | 173 +++++++++++++++ .../amdgpu-simplify-uniform-waterfall.ll | 1 + .../amdgpu-uniform-intrinsic-combine.ll | 1 + .../amdgpu-uniform-temporal-divergence.ll | 1 + .../CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll | 18 +- llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 6 +- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 23 ++ .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 58 ++--- .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 58 ++--- .../CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll | 75 ++----- .../AMDGPU/llvm.amdgcn.permlane64.ptr.ll | 77 ++----- .../AMDGPU/llvm.amdgcn.readfirstlane.ll | 48 ++--- .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 49 ++--- .../spill-vgpr-to-agpr-update-regscavenger.ll | 23 +- .../AMDGPU/splitkit-getsubrangeformask.ll | 198 +++++++++--------- llvm/test/CodeGen/AMDGPU/wqm.ll | 18 +- 20 files changed, 524 insertions(+), 440 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 75a94ac891819..b28c50e3f5b6d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1315,6 +1315,9 @@ void AMDGPUPassConfig::addIRPasses() { isPassEnabled(EnableImageIntrinsicOptimizer)) addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM)); + if (EnableUniformIntrinsicCombine) + addPass(createAMDGPUUniformIntrinsicCombineLegacyPass()); + // This can be disabled by passing ::Disable here or on the command line // with --expand-variadics-override=disable. 
addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering)); @@ -2066,6 +2069,8 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { if (isPassEnabled(EnableImageIntrinsicOptimizer)) addPass(AMDGPUImageIntrinsicOptimizerPass(TM)); + if (EnableUniformIntrinsicCombine) + addPass(AMDGPUUniformIntrinsicCombinePass()); // This can be disabled by passing ::Disable here or on the command line // with --expand-variadics-override=disable. addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp index 65e6ed9d1d428..b5e2d76db662e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -188,4 +188,4 @@ INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() { return new AMDGPUUniformIntrinsicCombineLegacy(); -} +} \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll index 7714c032d1737..d3e211855d7ed 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll @@ -113,9 +113,9 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_xor_b32 s0, s0, 1 +; CHECK-NEXT: s_and_b32 s0, s0, 1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB8_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -161,16 +161,17 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) { ; CHECK-LABEL: 
branch_uniform_ballot_eq_zero_non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; CHECK-NEXT: s_xor_b32 s0, s0, 1 +; CHECK-NEXT: s_xor_b32 s0, s0, 1 +; CHECK-NEXT: s_and_b32 s0, s0, 1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB10_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB10_3 -; CHECK-NEXT: .LBB10_2: ; %true +; CHECK-NEXT: s_cbranch_scc1 .LBB10_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB10_3 ; CHECK-NEXT: .LBB10_3: %c = trunc i32 %v to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) @@ -208,11 +209,7 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_cmp_lt_u32 s0, 12 -; CHECK-NEXT: s_cselect_b32 s0, 1, 0 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cmp_ge_u32 s0, 12 ; CHECK-NEXT: s_cbranch_scc1 .LBB12_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -258,17 +255,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_cmp_lt_u32 s0, 12 -; CHECK-NEXT: s_cselect_b32 s0, 1, 0 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB14_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB14_3 -; CHECK-NEXT: .LBB14_2: ; %true +; CHECK-NEXT: s_cbranch_scc1 .LBB14_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; 
CHECK-NEXT: s_branch .LBB14_3 ; CHECK-NEXT: .LBB14_3: %c = icmp ult i32 %v, 12 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) @@ -310,14 +303,12 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cmp_ge_u32 s0, 12 ; CHECK-NEXT: s_cselect_b32 s0, 1, 0 -; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cmp_le_u32 s1, 34 ; CHECK-NEXT: s_cselect_b32 s1, 1, 0 -; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_or_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB16_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -372,16 +363,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg ; CHECK-NEXT: s_cmp_gt_u32 s1, 34 ; CHECK-NEXT: s_cselect_b32 s1, 1, 0 ; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB18_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB18_3 -; CHECK-NEXT: .LBB18_2: ; %true +; CHECK-NEXT: s_cbranch_scc1 .LBB18_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB18_3 ; CHECK-NEXT: .LBB18_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll index 7b8166948610b..250fbc7c0f147 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll @@ -116,9 +116,9 @@ false: define 
amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 +; CHECK-NEXT: s_xor_b32 s0, s0, 1 +; CHECK-NEXT: s_and_b32 s0, s0, 1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB8_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -164,16 +164,17 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB10_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB10_3 -; CHECK-NEXT: .LBB10_2: ; %true +; CHECK-NEXT: s_xor_b32 s0, s0, 1 +; CHECK-NEXT: s_xor_b32 s0, s0, 1 +; CHECK-NEXT: s_and_b32 s0, s0, 1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB10_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB10_3 ; CHECK-NEXT: .LBB10_3: %c = trunc i32 %v to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) @@ -211,11 +212,7 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_cmp_lt_u32 s0, 12 -; CHECK-NEXT: s_cselect_b32 s0, 1, 0 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 +; CHECK-NEXT: s_cmp_ge_u32 s0, 12 ; CHECK-NEXT: s_cbranch_scc1 .LBB12_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -261,17 +258,13 @@ define amdgpu_cs i32 
@branch_uniform_ballot_eq_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_cmp_lt_u32 s0, 12 -; CHECK-NEXT: s_cselect_b32 s0, 1, 0 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB14_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB14_3 -; CHECK-NEXT: .LBB14_2: ; %true +; CHECK-NEXT: s_cbranch_scc1 .LBB14_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB14_3 ; CHECK-NEXT: .LBB14_3: %c = icmp ult i32 %v, 12 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) @@ -313,14 +306,12 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cmp_ge_u32 s0, 12 ; CHECK-NEXT: s_cselect_b32 s0, 1, 0 -; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cmp_le_u32 s1, 34 ; CHECK-NEXT: s_cselect_b32 s1, 1, 0 -; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 +; CHECK-NEXT: s_or_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB16_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -375,16 +366,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg ; CHECK-NEXT: s_cmp_gt_u32 s1, 34 ; CHECK-NEXT: s_cselect_b32 s1, 1, 0 ; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB18_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch 
.LBB18_3 -; CHECK-NEXT: .LBB18_2: ; %true +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB18_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB18_3 ; CHECK-NEXT: .LBB18_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll new file mode 100644 index 0000000000000..34d4c519851d4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll @@ -0,0 +1,173 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -o - %s | FileCheck %s +define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) { +; CHECK-LABEL: readfirstlane_with_readfirstlane: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5) + %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1) + store i32 %v2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) { +; CHECK-LABEL: readfirstlane_with_readlane: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_bfe_u32 v1, v0, 10, 10 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_readfirstlane_b32 s2, v1 +; CHECK-NEXT: v_readlane_b32 s2, v0, s2 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) +; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; 
CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy) + %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1) + store i32 %v2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) { +; CHECK-LABEL: readlane_with_firstlane: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx) + %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3) + store i32 %v2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) { +; CHECK-LABEL: readlane_readlane: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_bfe_u32 v1, v0, 10, 10 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_readfirstlane_b32 s2, v1 +; CHECK-NEXT: v_readlane_b32 s2, v0, s2 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) +; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy) + %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2) + store i32 %v2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void 
@permlane64_uniform(ptr addrspace(1) %out, i32 %src) { +; CHECK-LABEL: permlane64_uniform: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s2, s[4:5], 0x8 +; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane64(i32 %src) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) { +; CHECK-LABEL: permlane64_nonuniform: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) +; CHECK-NEXT: v_permlane64_b32 v1, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane64(i32 %tid) + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + store i32 %v, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) { +; CHECK-LABEL: permlane64_nonuniform_expression: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: v_permlane64_b32 v1, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid2 = add i32 %tid, 1 + %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2) + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + store i32 %v, i32 addrspace(1)* %out_ptr + ret void +} + +define 
protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1) %out) { +; CHECK-LABEL: trivial_waterfall_eq_zero: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5 +; CHECK-NEXT: s_mov_b32 s2, 0 +; CHECK-NEXT: s_branch .LBB7_2 +; CHECK-NEXT: .LBB7_1: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB7_2 Depth=1 +; CHECK-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_cbranch_vccz .LBB7_4 +; CHECK-NEXT: .LBB7_2: ; %while +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_cbranch_vccnz .LBB7_1 +; CHECK-NEXT: ; %bb.3: ; %if +; CHECK-NEXT: ; in Loop: Header=BB7_2 Depth=1 +; CHECK-NEXT: s_mov_b32 s2, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-NEXT: s_branch .LBB7_1 +; CHECK-NEXT: .LBB7_4: ; %exit +; CHECK-NEXT: s_endpgm +entry: + br label %while + +while: + %done = phi i1 [ 0, %entry ], [ 1, %if ] + %not_done = xor i1 %done, true + %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done) + %is_done = icmp eq i64 %ballot, 0 ; in this case is_done = !not_done + br i1 %is_done, label %exit, label %if + +if: + store i32 5, ptr addrspace(1) %out + br label %while + +exit: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll index 33ce278028bba..c962c05d24ad0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-enable-uniform-intrinsic-combine=0 -O3 -S < %s | FileCheck %s -check-prefix=CURRENT-CHECK ; RUN: opt -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -O3 -S < %s | FileCheck %s -check-prefix=O3-CHECK define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll index a3e42e564376c..a7e828c95d69f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-enable-uniform-intrinsic-combine=0 -O3 -S < %s | FileCheck %s -check-prefix=CURRENT-CHECK +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s -check-prefix=DCE-CHECK diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll index 2fde3e3759f47..792926154f7a8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK +; 
RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,instcombine,early-cse,simplifycfg -S < %s | FileCheck %s -check-prefix=COMB-CHECK ; This should not be optimized diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll index db32135939a5d..b8f084d5f82ad 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll @@ -4,24 +4,14 @@ define amdgpu_gs i32 @main() { ; CHECK-LABEL: main: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_bitcmp1_b32 0, 0 ; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_cselect_b32 s1, -1, 0 -; CHECK-NEXT: s_or_saveexec_b32 s2, -1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) -; CHECK-NEXT: v_readfirstlane_b32 s1, v0 -; CHECK-NEXT: s_mov_b32 exec_lo, s2 -; CHECK-NEXT: s_or_b32 s0, s0, s1 -; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; CHECK-NEXT: s_bitcmp1_b32 s0, 0 ; CHECK-NEXT: s_cselect_b32 s0, -1, 0 -; CHECK-NEXT: s_wait_alu 0xfffe ; CHECK-NEXT: s_xor_b32 s0, s0, -1 -; CHECK-NEXT: s_wait_alu 0xfffe -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) -; CHECK-NEXT: v_readfirstlane_b32 s0, v1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: s_wait_alu 0xf1ff ; CHECK-NEXT: ; return to shader part epilog bb: diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index 3aa36635a0ab6..704ea37117f32 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -9,11 +9,11 @@ 
; RUN: | FileCheck -check-prefix=GCN-O3 %s -; GCN-O0: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O0: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O2: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,s
i-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O2: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liveran
ge,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O3: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-cont
rol-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O3: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,p
hi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) define void @empty() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 6e5212580ba2e..ee6caab6f25cd 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -31,6 +31,11 @@ ; GCN-O0-NEXT: AMDGPU Remove Incompatible Functions ; GCN-O0-NEXT: AMDGPU Printf lowering ; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU +; GCN-O0-NEXT: FunctionPass Manager +; GCN-O0-NEXT: Dominator Tree Construction +; GCN-O0-NEXT: Cycle Info Analysis +; GCN-O0-NEXT: Uniformity Analysis +; GCN-O0-NEXT: AMDGPU Uniform Intrinsic Combine ; GCN-O0-NEXT: Expand variadic functions ; GCN-O0-NEXT: AMDGPU Inline All Functions ; GCN-O0-NEXT: Inliner for always_inline functions @@ 
-179,6 +184,11 @@ ; GCN-O1-NEXT: AMDGPU Remove Incompatible Functions ; GCN-O1-NEXT: AMDGPU Printf lowering ; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU +; GCN-O1-NEXT: FunctionPass Manager +; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: Cycle Info Analysis +; GCN-O1-NEXT: Uniformity Analysis +; GCN-O1-NEXT: AMDGPU Uniform Intrinsic Combine ; GCN-O1-NEXT: Expand variadic functions ; GCN-O1-NEXT: AMDGPU Inline All Functions ; GCN-O1-NEXT: Inliner for always_inline functions @@ -466,6 +476,11 @@ ; GCN-O1-OPTS-NEXT: AMDGPU Remove Incompatible Functions ; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering ; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU +; GCN-O1-OPTS-NEXT: FunctionPass Manager +; GCN-O1-OPTS-NEXT: Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Cycle Info Analysis +; GCN-O1-OPTS-NEXT: Uniformity Analysis +; GCN-O1-OPTS-NEXT: AMDGPU Uniform Intrinsic Combine ; GCN-O1-OPTS-NEXT: Expand variadic functions ; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions ; GCN-O1-OPTS-NEXT: Inliner for always_inline functions @@ -783,6 +798,10 @@ ; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: AMDGPU Image Intrinsic Optimizer +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Cycle Info Analysis +; GCN-O2-NEXT: Uniformity Analysis +; GCN-O2-NEXT: AMDGPU Uniform Intrinsic Combine ; GCN-O2-NEXT: Expand variadic functions ; GCN-O2-NEXT: AMDGPU Inline All Functions ; GCN-O2-NEXT: Inliner for always_inline functions @@ -1104,6 +1123,10 @@ ; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: AMDGPU Image Intrinsic Optimizer +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Cycle Info Analysis +; GCN-O3-NEXT: Uniformity Analysis +; GCN-O3-NEXT: AMDGPU Uniform Intrinsic Combine ; GCN-O3-NEXT: Expand variadic functions ; GCN-O3-NEXT: AMDGPU Inline All Functions ; GCN-O3-NEXT: Inliner for always_inline functions diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll index e00e1f13b2b77..aa591d28eb346 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll @@ -110,9 +110,8 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_and_b32 s0, s0, 1 -; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0 -; CHECK-NEXT: s_cbranch_vccz .LBB8_2 +; CHECK-NEXT: s_bitcmp0_b32 s0, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB8_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB8_3 @@ -156,15 +155,16 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_and_b32 s0, s0, 1 -; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0 -; CHECK-NEXT: s_cbranch_vccz .LBB10_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB10_3 -; CHECK-NEXT: .LBB10_2: ; %true +; CHECK-NEXT: s_bitcmp1_b32 s0, 0 +; CHECK-NEXT: s_cselect_b32 s0, -1, 0 +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; CHECK-NEXT: s_cbranch_vccnz .LBB10_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB10_3 ; CHECK-NEXT: .LBB10_3: %c = trunc i32 %v to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) @@ -201,8 +201,8 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12 -; CHECK-NEXT: s_cbranch_vccz .LBB12_2 +; CHECK-NEXT: s_cmp_gt_u32 s0, 11 +; CHECK-NEXT: s_cbranch_scc1 .LBB12_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: 
s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB12_3 @@ -245,14 +245,14 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12 -; CHECK-NEXT: s_cbranch_vccz .LBB14_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB14_3 -; CHECK-NEXT: .LBB14_2: ; %true +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cbranch_scc1 .LBB14_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB14_3 ; CHECK-NEXT: .LBB14_3: %c = icmp ult i32 %v, 12 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) @@ -293,13 +293,13 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cmp_gt_u32 s0, 11 ; CHECK-NEXT: s_cselect_b32 s0, -1, 0 -; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cmp_lt_u32 s1, 35 ; CHECK-NEXT: s_cselect_b32 s1, -1, 0 -; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: s_and_b32 s0, s0, exec_lo -; CHECK-NEXT: s_cbranch_scc0 .LBB16_2 +; CHECK-NEXT: s_or_b32 s0, s0, s1 +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; CHECK-NEXT: s_cbranch_vccnz .LBB16_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB16_3 @@ -353,14 +353,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg ; CHECK-NEXT: s_cmp_gt_u32 s1, 34 ; CHECK-NEXT: s_cselect_b32 s1, -1, 0 ; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: s_and_b32 s0, s0, exec_lo -; CHECK-NEXT: s_cbranch_scc0 .LBB18_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB18_3 -; CHECK-NEXT: .LBB18_2: ; %true +; CHECK-NEXT: s_and_b32 vcc_lo, 
exec_lo, s0 +; CHECK-NEXT: s_cbranch_vccnz .LBB18_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB18_3 ; CHECK-NEXT: .LBB18_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll index b4adf7f641550..30c2c260a3274 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll @@ -113,9 +113,8 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_and_b32 s0, s0, 1 -; CHECK-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0 -; CHECK-NEXT: s_cbranch_vccz .LBB8_2 +; CHECK-NEXT: s_bitcmp0_b32 s0, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB8_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB8_3 @@ -159,15 +158,16 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_and_b32 s0, s0, 1 -; CHECK-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0 -; CHECK-NEXT: s_cbranch_vccz .LBB10_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB10_3 -; CHECK-NEXT: .LBB10_2: ; %true +; CHECK-NEXT: s_bitcmp1_b32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_cbranch_vccnz .LBB10_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB10_3 ; CHECK-NEXT: .LBB10_3: %c = trunc i32 %v to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) @@ -204,8 +204,8 @@ false: define amdgpu_cs i32 
@branch_uniform_ballot_ne_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_lt_u32_e64 vcc, s0, 12 -; CHECK-NEXT: s_cbranch_vccz .LBB12_2 +; CHECK-NEXT: s_cmp_gt_u32 s0, 11 +; CHECK-NEXT: s_cbranch_scc1 .LBB12_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB12_3 @@ -248,14 +248,14 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_lt_u32_e64 vcc, s0, 12 -; CHECK-NEXT: s_cbranch_vccz .LBB14_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB14_3 -; CHECK-NEXT: .LBB14_2: ; %true +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cbranch_scc1 .LBB14_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB14_3 ; CHECK-NEXT: .LBB14_3: %c = icmp ult i32 %v, 12 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) @@ -296,13 +296,13 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cmp_gt_u32 s0, 11 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cmp_lt_u32 s1, 35 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CHECK-NEXT: s_cbranch_scc0 .LBB16_2 +; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_cbranch_vccnz .LBB16_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB16_3 @@ -356,14 +356,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 
inreg ; CHECK-NEXT: s_cmp_gt_u32 s1, 34 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CHECK-NEXT: s_cbranch_scc0 .LBB18_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB18_3 -; CHECK-NEXT: .LBB18_2: ; %true +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_cbranch_vccnz .LBB18_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB18_3 ; CHECK-NEXT: .LBB18_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll index 6dd2258420998..39191d242574f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll @@ -23,10 +23,8 @@ define amdgpu_kernel void @test_s_i32(ptr addrspace(1) %out, i32 %src0) { ; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 -; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: test_s_i32: @@ -36,8 +34,6 @@ define amdgpu_kernel void @test_s_i32(ptr addrspace(1) %out, i32 %src0) { ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, 
s[0:1] ; GFX11-GISEL-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %src0) @@ -50,12 +46,9 @@ define amdgpu_kernel void @test_s_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 -; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: test_s_i64: @@ -64,9 +57,6 @@ define amdgpu_kernel void @test_s_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 -; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %src0) @@ -79,12 +69,9 @@ define amdgpu_kernel void @test_s_f64(ptr addrspace(1) %out, double %src0) { ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 -; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: test_s_f64: @@ -93,9 +80,6 @@ define amdgpu_kernel void @test_s_f64(ptr addrspace(1) %out, double %src0) { ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 -; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane64.f64(double %src0) @@ -116,19 +100,15 @@ define amdgpu_kernel void @test_i_i32(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_i_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x63 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: test_i_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm @@ -141,19 +121,15 @@ define amdgpu_kernel void @test_i_f32(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_i_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 
0x449a5000 :: v_dual_mov_b32 v1, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x449a5000 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: test_i_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm @@ -166,23 +142,16 @@ define amdgpu_kernel void @test_i_i64(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_i_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v2 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0x63 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: global_store_b64 v1, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: test_i_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x63 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 -; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: 
global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm @@ -195,22 +164,16 @@ define amdgpu_kernel void @test_i_f64(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_i_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x40934a00 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40934a00 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: global_store_b64 v0, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: test_i_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x40934a00 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v2 -; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll index b0149f7de5e85..672b658659824 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll @@ -6,12 +6,9 @@ define amdgpu_kernel void @test_p0(ptr addrspace(1) %out, ptr %src0) { ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; 
GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 -; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm %v = call ptr @llvm.amdgcn.permlane64.p0(ptr %src0) store ptr %v, ptr addrspace(1) %out @@ -22,21 +19,14 @@ define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) { ; GFX11-SDAG-LABEL: test_v3p0: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x2 -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; GFX11-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x54 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, s6 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v7, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v1 -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v4 -; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 -; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v8 -; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v7 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s7 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: global_store_b64 v6, v[4:5], s[4:5] offset:16 ; GFX11-SDAG-NEXT: global_store_b128 v6, v[0:3], s[4:5] @@ -53,10 +43,8 @@ define amdgpu_kernel void @test_p3(ptr 
addrspace(1) %out, ptr addrspace(3) %src0 ; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 -; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm %v = call ptr addrspace(3) @llvm.amdgcn.permlane64.v3p0(ptr addrspace(3) %src0) store ptr addrspace(3) %v, ptr addrspace(1) %out @@ -70,14 +58,9 @@ define amdgpu_kernel void @test_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 -; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-SDAG-NEXT: global_store_b96 v3, v[0:2], s[4:5] ; GFX11-SDAG-NEXT: s_endpgm %v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlane64.v3p3(<3 x ptr addrspace(3)> %src0) store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out @@ -91,10 +74,8 @@ define amdgpu_kernel void @test_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0 ; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 -; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm %v = call ptr addrspace(5) @llvm.amdgcn.permlane64.p5(ptr addrspace(5) %src0) store ptr addrspace(5) %v, ptr addrspace(1) %out @@ -108,14 +89,9 @@ define amdgpu_kernel void @test_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 -; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-SDAG-NEXT: global_store_b96 v3, v[0:2], s[4:5] ; GFX11-SDAG-NEXT: s_endpgm %v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlane64.v3p5(<3 x ptr addrspace(5)> %src0) store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out @@ -129,10 +105,8 @@ define amdgpu_kernel void @test_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0 ; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 -; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; 
GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm %v = call ptr addrspace(6) @llvm.amdgcn.permlane64.p6(ptr addrspace(6) %src0) store ptr addrspace(6) %v, ptr addrspace(1) %out @@ -146,14 +120,9 @@ define amdgpu_kernel void @test_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 -; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-SDAG-NEXT: global_store_b96 v3, v[0:2], s[4:5] ; GFX11-SDAG-NEXT: s_endpgm %v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlane64.v3p6(<3 x ptr addrspace(6)> %src0) store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index d1ba892d7f7e1..02d29909c661c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -396,8 +396,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) { ; ; CHECK-GISEL-LABEL: test_readfirstlane_imm_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_mov_b32 s0, 0 -; CHECK-GISEL-NEXT: s_mov_b32 s1, 0x40400000 +; CHECK-GISEL-NEXT: s_mov_b64 s[0:1], 0x4040000000000000 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; 
use s[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -456,14 +455,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 32 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -490,15 +488,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 -; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -588,17 +584,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 
s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64: @@ -628,17 +624,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CHECK-SDAG-NEXT: s_endpgm ; ; 
CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 7ff5eb46def38..0795f4050b622 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -9,7 +9,7 @@ declare double @llvm.amdgcn.readlane.f64(double, i32) #0 define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_load_dword s0, s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s0 @@ -18,7 +18,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_load_dword s0, s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s0 @@ -224,14 +224,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 32 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -258,15 
+257,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 -; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -660,17 +657,17 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: 
test_readlane_copy_from_sgpr_i64: @@ -700,17 +697,17 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64: diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll index 586579fcaeb93..ef96944abef0e 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll @@ -20,38 +20,33 @@ define void @test() { ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: .LBB0_3: ; %bb.3 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: ; implicit-def: $sgpr4 -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_readfirstlane_b32 s6, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], -1 -; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: s_cmp_eq_u32 s6, s7 ; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v1, s4, 0 ; CHECK-NEXT: v_writelane_b32 v1, s5, 1 -; 
CHECK-NEXT: s_mov_b64 s[10:11], exec -; CHECK-NEXT: s_mov_b64 exec, -1 +; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: s_cbranch_scc1 .LBB0_5 ; CHECK-NEXT: ; %bb.4: ; %bb.4 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_writelane_b32 v1, s4, 0 ; CHECK-NEXT: v_writelane_b32 v1, s5, 1 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: .LBB0_5: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: v_readlane_b32 s4, v1, 0 ; CHECK-NEXT: v_readlane_b32 s5, v1, 1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index 5aafb0f576fb4..364598f7cf6c0 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -31,8 +31,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr10 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr8 ; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = 
S_LOAD_DWORDX2_IMM [[COPY]], 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %125:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: KILL undef %125:sgpr_128 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %117:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: KILL undef %117:sgpr_128 ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc @@ -44,87 +44,85 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.71, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 64, 0 :: (invariant load (s128) from %ir.88, addrspace 4) ; CHECK-NEXT: KILL undef %74:sreg_64 ; CHECK-NEXT: KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1 ; CHECK-NEXT: 
[[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %112:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %87:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL undef %89:sgpr_128 - ; CHECK-NEXT: KILL undef %118:sgpr_128 + ; CHECK-NEXT: KILL undef %112:sgpr_128 + ; CHECK-NEXT: KILL undef %87:sgpr_128 ; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: 
[[S_ADD_U32_2:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.87, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4) - ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1 + ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %148:sreg_32, 31, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %148:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.77, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.83, addrspace 4) ; CHECK-NEXT: KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1 - ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; 
CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %169:sreg_32, implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1 + ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], 
implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %148:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; 
CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %302:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %279:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, 
implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %334:sgpr_128, undef %335:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %345:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.95, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 0, 0 :: (invariant load (s128) from %ir.100, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.105, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %329:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %340:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: 
(dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit 
$scc + ; CHECK-NEXT: undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %384:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef 
%361:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 224, 0 :: (invariant load (s128) from %ir.117, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.133, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 576, 0 :: (invariant load (s128) from %ir.138, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.134, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.162, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 
224, 0 :: (invariant load (s128) from %ir.122, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 224, 0 :: (invariant load (s128) from %ir.128, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc @@ -135,49 +133,49 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, 
implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], 
implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_17]], 168, 0 :: (invariant load (s32) from %ir.260, align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.145, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1 ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 0, 0 :: (invariant load (s128) from %ir.158, addrspace 4) ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1 ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: 
[[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.166, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.171, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, 
[[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s64) from %ir.269, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.211, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.193, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.199, addrspace 4) ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.216, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load 
(s128) from %ir.221, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.204, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.209, addrspace 4) ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]] @@ -189,30 +187,30 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.280, 
addrspace 4) ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0 ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) - ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1 - ; CHECK-NEXT: KILL undef %470:sreg_64 + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 160, 0 :: (invariant load (s128) from %ir.244, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %443:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: KILL [[S_ADD_U32_15]].sub0, [[S_ADD_U32_15]].sub1 ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3 + ; CHECK-NEXT: KILL undef %443:sreg_64 ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.252, addrspace 4) ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc ; CHECK-NEXT: 
[[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s32) from %ir.291, align 8, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]] @@ -224,22 +222,22 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], 
-507, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4) - ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4) - ; CHECK-NEXT: undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_21]], 96, 0 :: (invariant load (s128) from %ir.309, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = 
S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.315, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.321, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] - ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] + ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, 
[[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec @@ -351,13 +349,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %543:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %516:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec ; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec ; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %557:vgpr_32, undef %559:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) + ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %530:vgpr_32, undef %532:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 .expVert: %0 = extractelement <31 x i32> 
%userData, i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index ad8dcd3888e9f..21f0c008366a9 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -3477,13 +3477,10 @@ define amdgpu_gs void @wqm_init_exec_wwm() { ; GFX9-W64-NEXT: s_mov_b64 exec, 0 ; GFX9-W64-NEXT: s_mov_b32 s1, 0 ; GFX9-W64-NEXT: s_mov_b32 s0, s1 -; GFX9-W64-NEXT: s_cmp_lg_u64 exec, 0 -; GFX9-W64-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-W64-NEXT: s_cmp_eq_u64 s[0:1], 0 ; GFX9-W64-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-W64-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] -; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] ; GFX9-W64-NEXT: exp mrt0 off, off, off, off ; GFX9-W64-NEXT: s_endpgm ; @@ -3491,14 +3488,11 @@ define amdgpu_gs void @wqm_init_exec_wwm() { ; GFX10-W32: ; %bb.0: ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-W32-NEXT: s_mov_b32 s1, 0 -; GFX10-W32-NEXT: s_cmp_lg_u64 exec, 0 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-W32-NEXT: s_mov_b32 s0, s1 -; GFX10-W32-NEXT: s_cselect_b32 s2, -1, 0 -; GFX10-W32-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-W32-NEXT: s_cmp_eq_u64 s[0:1], 0 ; GFX10-W32-NEXT: s_cselect_b32 s0, -1, 0 -; GFX10-W32-NEXT: s_xor_b32 s0, s2, s0 -; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX10-W32-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0 ; GFX10-W32-NEXT: exp mrt0 off, off, off, off ; GFX10-W32-NEXT: s_endpgm call void @llvm.amdgcn.init.exec(i64 0) From 3fb1b1e6ca30ca2d10a6c3b8a4c1645b5a08adf7 Mon Sep 17 00:00:00 2001 From: Slava Gurevich Date: Thu, 30 Oct 2025 00:25:10 -0700 Subject: [PATCH 161/539] [mlir] Fix use-after-move issues (#165660) This patch addresses two use-after-move issues: 1. `Timing.cpp` A variable was std::moved and then immediately passed to an `assert()` check. 
Since the moved-from state made the assertion condition trivially true, the check was effectively useless. The `assert()` is removed. 2. `Query.cpp` The `matcher` object was moved-from and then subsequently used as if it still retained valid state. The fix ensures no subsequent use for the moved-from variable. Testing: `ninja check-mlir` --- mlir/lib/Query/Query.cpp | 5 +++-- mlir/lib/Support/Timing.cpp | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Query/Query.cpp b/mlir/lib/Query/Query.cpp index 375e82050a481..cf8a4d293299c 100644 --- a/mlir/lib/Query/Query.cpp +++ b/mlir/lib/Query/Query.cpp @@ -121,12 +121,13 @@ LogicalResult MatchQuery::run(llvm::raw_ostream &os, QuerySession &qs) const { Operation *rootOp = qs.getRootOp(); int matchCount = 0; matcher::MatchFinder finder; + + StringRef functionName = matcher.getFunctionName(); auto matches = finder.collectMatches(rootOp, std::move(matcher)); // An extract call is recognized by considering if the matcher has a name. // TODO: Consider making the extract more explicit. 
- if (matcher.hasFunctionName()) { - auto functionName = matcher.getFunctionName(); + if (!functionName.empty()) { std::vector flattenedMatches = finder.flattenMatchedOps(matches); Operation *function = diff --git a/mlir/lib/Support/Timing.cpp b/mlir/lib/Support/Timing.cpp index fb6f82c283df5..16306d72815f7 100644 --- a/mlir/lib/Support/Timing.cpp +++ b/mlir/lib/Support/Timing.cpp @@ -319,7 +319,6 @@ class TimerImpl { void mergeChildren(AsyncChildrenMap &&other) { for (auto &thread : other) { mergeChildren(std::move(thread.second)); - assert(thread.second.empty()); } other.clear(); } From b93442ceb0c6cf682bafbc11283784ba861b5fbc Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Thu, 30 Oct 2025 09:36:25 +0100 Subject: [PATCH 162/539] [clang] Add Bytes/Columns types to TextDiagnostic (#165541) In `TextDiagnostic.cpp`, we're using column- and byte indices everywhere, but we were using integers for them which made it hard to know what to pass where, and what was produced. To make matters worse, that `SourceManager` considers a "column" is a byte in `TextDiagnostic`. Add `Bytes` and `Columns` structs, which are not related so API using them can differentiate between values interpreted columns or bytes. 
--- clang/lib/Frontend/TextDiagnostic.cpp | 389 ++++++++++++++------------ 1 file changed, 217 insertions(+), 172 deletions(-) diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp index f5add2a941f72..c33d8f8ca9ebd 100644 --- a/clang/lib/Frontend/TextDiagnostic.cpp +++ b/clang/lib/Frontend/TextDiagnostic.cpp @@ -47,6 +47,43 @@ static constexpr raw_ostream::Colors CommentColor = raw_ostream::YELLOW; static constexpr raw_ostream::Colors LiteralColor = raw_ostream::GREEN; static constexpr raw_ostream::Colors KeywordColor = raw_ostream::BLUE; +namespace { +template class ColumnsOrBytes { +public: + int V = 0; + ColumnsOrBytes(int V) : V(V) {} + bool isValid() const { return V != -1; } + Sub next() const { return Sub(V + 1); } + Sub prev() const { return Sub(V - 1); } + + bool operator>(Sub O) const { return V > O.V; } + bool operator<(Sub O) const { return V < O.V; } + bool operator<=(Sub B) const { return V <= B.V; } + bool operator!=(Sub C) const { return C.V != V; } + + Sub operator+(Sub B) const { return Sub(V + B.V); } + Sub &operator+=(Sub B) { + V += B.V; + return *static_cast(this); + } + Sub operator-(Sub B) const { return Sub(V - B.V); } + Sub &operator-=(Sub B) { + V -= B.V; + return *static_cast(this); + } +}; + +class Bytes final : public ColumnsOrBytes { +public: + Bytes(int V) : ColumnsOrBytes(V) {} +}; + +class Columns final : public ColumnsOrBytes { +public: + Columns(int V) : ColumnsOrBytes(V) {} +}; +} // namespace + /// Add highlights to differences in template strings. 
static void applyTemplateHighlighting(raw_ostream &OS, StringRef Str, bool &Normal, bool Bold) { @@ -109,8 +146,8 @@ printableTextForNextCharacter(StringRef SourceLine, size_t *I, if (SourceLine[*I] == '\t') { assert(0 < TabStop && TabStop <= DiagnosticOptions::MaxTabStop && "Invalid -ftabstop value"); - unsigned Col = bytesSincePreviousTabOrLineBegin(SourceLine, *I); - unsigned NumSpaces = TabStop - (Col % TabStop); + unsigned LineBytes = bytesSincePreviousTabOrLineBegin(SourceLine, *I); + unsigned NumSpaces = TabStop - (LineBytes % TabStop); assert(0 < NumSpaces && NumSpaces <= TabStop && "Invalid computation of space amt"); ++(*I); @@ -220,33 +257,33 @@ static void expandTabs(std::string &SourceLine, unsigned TabStop) { /// (\\u3042 is represented in UTF-8 by three bytes and takes two columns to /// display) static void genColumnByteMapping(StringRef SourceLine, unsigned TabStop, - SmallVectorImpl &BytesOut, - SmallVectorImpl &ColumnsOut) { + SmallVectorImpl &BytesOut, + SmallVectorImpl &ColumnsOut) { assert(BytesOut.empty()); assert(ColumnsOut.empty()); if (SourceLine.empty()) { - BytesOut.resize(1u, 0); - ColumnsOut.resize(1u, 0); + BytesOut.resize(1u, Bytes(0)); + ColumnsOut.resize(1u, Columns(0)); return; } ColumnsOut.resize(SourceLine.size() + 1, -1); - int Columns = 0; + Columns NumColumns = 0; size_t I = 0; while (I < SourceLine.size()) { - ColumnsOut[I] = Columns; - BytesOut.resize(Columns + 1, -1); - BytesOut.back() = I; + ColumnsOut[I] = NumColumns; + BytesOut.resize(NumColumns.V + 1, -1); + BytesOut.back() = Bytes(I); auto [Str, Printable] = printableTextForNextCharacter(SourceLine, &I, TabStop); - Columns += llvm::sys::locale::columnWidth(Str); + NumColumns += Columns(llvm::sys::locale::columnWidth(Str)); } - ColumnsOut.back() = Columns; - BytesOut.resize(Columns + 1, -1); - BytesOut.back() = I; + ColumnsOut.back() = NumColumns; + BytesOut.resize(NumColumns.V + 1, -1); + BytesOut.back() = Bytes(I); } namespace { @@ -258,48 +295,52 @@ struct 
SourceColumnMap { assert(m_byteToColumn.size()==SourceLine.size()+1); assert(0 < m_byteToColumn.size() && 0 < m_columnToByte.size()); - assert(m_byteToColumn.size() - == static_cast(m_columnToByte.back()+1)); - assert(static_cast(m_byteToColumn.back()+1) - == m_columnToByte.size()); + assert(m_byteToColumn.size() == + static_cast(m_columnToByte.back().V + 1)); + assert(static_cast(m_byteToColumn.back().V + 1) == + m_columnToByte.size()); } - int columns() const { return m_byteToColumn.back(); } - int bytes() const { return m_columnToByte.back(); } + Columns columns() const { return m_byteToColumn.back(); } + Bytes bytes() const { return m_columnToByte.back(); } /// Map a byte to the column which it is at the start of, or return -1 /// if it is not at the start of a column (for a UTF-8 trailing byte). - int byteToColumn(int n) const { - assert(0<=n && n(m_byteToColumn.size())); - return m_byteToColumn[n]; + Columns byteToColumn(Bytes N) const { + assert(0 <= N.V && N.V < static_cast(m_byteToColumn.size())); + return m_byteToColumn[N.V]; } /// Map a byte to the first column which contains it. - int byteToContainingColumn(int N) const { - assert(0 <= N && N < static_cast(m_byteToColumn.size())); - while (m_byteToColumn[N] == -1) - --N; - return m_byteToColumn[N]; + Columns byteToContainingColumn(Bytes N) const { + assert(0 <= N.V && N.V < static_cast(m_byteToColumn.size())); + while (!m_byteToColumn[N.V].isValid()) + --N.V; + return m_byteToColumn[N.V]; } /// Map a column to the byte which starts the column, or return -1 if /// the column the second or subsequent column of an expanded tab or similar /// multi-column entity. - int columnToByte(int n) const { - assert(0<=n && n(m_columnToByte.size())); - return m_columnToByte[n]; + Bytes columnToByte(Columns N) const { + assert(0 <= N.V && N.V < static_cast(m_columnToByte.size())); + return m_columnToByte[N.V]; } /// Map from a byte index to the next byte which starts a column. 
- int startOfNextColumn(int N) const { - assert(0 <= N && N < static_cast(m_byteToColumn.size() - 1)); - while (byteToColumn(++N) == -1) {} + Bytes startOfNextColumn(Bytes N) const { + assert(0 <= N.V && N.V < static_cast(m_byteToColumn.size() - 1)); + N = N.next(); + while (!byteToColumn(N).isValid()) + N = N.next(); return N; } /// Map from a byte index to the previous byte which starts a column. - int startOfPreviousColumn(int N) const { - assert(0 < N && N < static_cast(m_byteToColumn.size())); - while (byteToColumn(--N) == -1) {} + Bytes startOfPreviousColumn(Bytes N) const { + assert(0 < N.V && N.V < static_cast(m_byteToColumn.size())); + N = N.prev(); + while (!byteToColumn(N).isValid()) + N = N.prev(); return N; } @@ -308,9 +349,9 @@ struct SourceColumnMap { } private: - const std::string m_SourceLine; - SmallVector m_byteToColumn; - SmallVector m_columnToByte; + StringRef m_SourceLine; + SmallVector m_byteToColumn; + SmallVector m_columnToByte; }; } // end anonymous namespace @@ -319,14 +360,15 @@ struct SourceColumnMap { static void selectInterestingSourceRegion(std::string &SourceLine, std::string &CaretLine, std::string &FixItInsertionLine, - unsigned Columns, + Columns NonGutterColumns, const SourceColumnMap &map) { - unsigned CaretColumns = CaretLine.size(); - unsigned FixItColumns = llvm::sys::locale::columnWidth(FixItInsertionLine); - unsigned MaxColumns = std::max(static_cast(map.columns()), - std::max(CaretColumns, FixItColumns)); + Columns CaretColumns = Columns(CaretLine.size()); + Columns FixItColumns = + Columns(llvm::sys::locale::columnWidth(FixItInsertionLine)); + Columns MaxColumns = + std::max({map.columns().V, CaretColumns.V, FixItColumns.V}); // if the number of columns is less than the desired number we're done - if (MaxColumns <= Columns) + if (MaxColumns <= NonGutterColumns) return; // No special characters are allowed in CaretLine. 
@@ -334,13 +376,13 @@ static void selectInterestingSourceRegion(std::string &SourceLine, // Find the slice that we need to display the full caret line // correctly. - unsigned CaretStart = 0, CaretEnd = CaretLine.size(); - for (; CaretStart != CaretEnd; ++CaretStart) - if (!isWhitespace(CaretLine[CaretStart])) + Columns CaretStart = 0, CaretEnd = CaretLine.size(); + for (; CaretStart != CaretEnd; CaretStart = CaretStart.next()) + if (!isWhitespace(CaretLine[CaretStart.V])) break; - for (; CaretEnd != CaretStart; --CaretEnd) - if (!isWhitespace(CaretLine[CaretEnd - 1])) + for (; CaretEnd != CaretStart; CaretEnd = CaretEnd.prev()) + if (!isWhitespace(CaretLine[CaretEnd.V - 1])) break; // caret has already been inserted into CaretLine so the above whitespace @@ -349,39 +391,38 @@ static void selectInterestingSourceRegion(std::string &SourceLine, // If we have a fix-it line, make sure the slice includes all of the // fix-it information. if (!FixItInsertionLine.empty()) { - unsigned FixItStart = 0, FixItEnd = FixItInsertionLine.size(); - for (; FixItStart != FixItEnd; ++FixItStart) - if (!isWhitespace(FixItInsertionLine[FixItStart])) - break; - - for (; FixItEnd != FixItStart; --FixItEnd) - if (!isWhitespace(FixItInsertionLine[FixItEnd - 1])) - break; - // We can safely use the byte offset FixItStart as the column offset // because the characters up until FixItStart are all ASCII whitespace // characters. 
- unsigned FixItStartCol = FixItStart; - unsigned FixItEndCol - = llvm::sys::locale::columnWidth(FixItInsertionLine.substr(0, FixItEnd)); - - CaretStart = std::min(FixItStartCol, CaretStart); - CaretEnd = std::max(FixItEndCol, CaretEnd); + Bytes FixItStart = 0; + Bytes FixItEnd = Bytes(FixItInsertionLine.size()); + while (FixItStart != FixItEnd && + isWhitespace(FixItInsertionLine[FixItStart.V])) + FixItStart = FixItStart.next(); + + while (FixItEnd != FixItStart && + isWhitespace(FixItInsertionLine[FixItEnd.V - 1])) + FixItEnd = FixItEnd.prev(); + + Columns FixItStartCol = Columns(FixItStart.V); + Columns FixItEndCol = Columns(llvm::sys::locale::columnWidth( + FixItInsertionLine.substr(0, FixItEnd.V))); + + CaretStart = std::min(FixItStartCol.V, CaretStart.V); + CaretEnd = std::max(FixItEndCol.V, CaretEnd.V); } // CaretEnd may have been set at the middle of a character // If it's not at a character's first column then advance it past the current // character. - while (static_cast(CaretEnd) < map.columns() && - -1 == map.columnToByte(CaretEnd)) - ++CaretEnd; - - assert((static_cast(CaretStart) > map.columns() || - -1!=map.columnToByte(CaretStart)) && - "CaretStart must not point to a column in the middle of a source" - " line character"); - assert((static_cast(CaretEnd) > map.columns() || - -1!=map.columnToByte(CaretEnd)) && + while (CaretEnd < map.columns() && !map.columnToByte(CaretEnd).isValid()) + CaretEnd = CaretEnd.next(); + + assert( + (CaretStart > map.columns() || map.columnToByte(CaretStart).isValid()) && + "CaretStart must not point to a column in the middle of a source" + " line character"); + assert((CaretEnd > map.columns() || map.columnToByte(CaretEnd).isValid()) && "CaretEnd must not point to a column in the middle of a source line" " character"); @@ -390,70 +431,70 @@ static void selectInterestingSourceRegion(std::string &SourceLine, // number of columns we have, try to grow the slice to encompass // more context. 
- unsigned SourceStart = map.columnToByte(std::min(CaretStart, - map.columns())); - unsigned SourceEnd = map.columnToByte(std::min(CaretEnd, - map.columns())); + Bytes SourceStart = map.columnToByte(std::min(CaretStart.V, map.columns().V)); + Bytes SourceEnd = map.columnToByte(std::min(CaretEnd.V, map.columns().V)); - unsigned CaretColumnsOutsideSource = CaretEnd-CaretStart - - (map.byteToColumn(SourceEnd)-map.byteToColumn(SourceStart)); + Columns CaretColumnsOutsideSource = + CaretEnd - CaretStart - + (map.byteToColumn(SourceEnd) - map.byteToColumn(SourceStart)); char const *front_ellipse = " ..."; char const *front_space = " "; char const *back_ellipse = "..."; - unsigned ellipses_space = strlen(front_ellipse) + strlen(back_ellipse); + Columns EllipsesColumns = + Columns(strlen(front_ellipse) + strlen(back_ellipse)); - unsigned TargetColumns = Columns; + Columns TargetColumns = Columns(NonGutterColumns); // Give us extra room for the ellipses // and any of the caret line that extends past the source - if (TargetColumns > ellipses_space+CaretColumnsOutsideSource) - TargetColumns -= ellipses_space+CaretColumnsOutsideSource; + if (TargetColumns > EllipsesColumns + CaretColumnsOutsideSource) + TargetColumns -= EllipsesColumns + CaretColumnsOutsideSource; - while (SourceStart>0 || SourceEnd 0 || SourceEnd < SourceLine.size()) { bool ExpandedRegion = false; - if (SourceStart>0) { - unsigned NewStart = map.startOfPreviousColumn(SourceStart); + if (SourceStart > 0) { + Bytes NewStart = map.startOfPreviousColumn(SourceStart); // Skip over any whitespace we see here; we're looking for // another bit of interesting text. // FIXME: Detect non-ASCII whitespace characters too. - while (NewStart && isWhitespace(SourceLine[NewStart])) + while (NewStart > 0 && isWhitespace(SourceLine[NewStart.V])) NewStart = map.startOfPreviousColumn(NewStart); // Skip over this bit of "interesting" text. 
- while (NewStart) { - unsigned Prev = map.startOfPreviousColumn(NewStart); - if (isWhitespace(SourceLine[Prev])) + while (NewStart > 0) { + Bytes Prev = map.startOfPreviousColumn(NewStart); + if (isWhitespace(SourceLine[Prev.V])) break; NewStart = Prev; } - assert(map.byteToColumn(NewStart) != -1); - unsigned NewColumns = map.byteToColumn(SourceEnd) - - map.byteToColumn(NewStart); + assert(map.byteToColumn(NewStart).isValid()); + Columns NewColumns = + map.byteToColumn(SourceEnd) - map.byteToColumn(NewStart); if (NewColumns <= TargetColumns) { SourceStart = NewStart; ExpandedRegion = true; } } - if (SourceEnd(SourceLine.size())}) - + map.byteToColumn(SourceEnd); + Columns FrontColumnsRemoved = CaretStart; + Columns ColumnsKept = CaretEnd - CaretStart; // We checked up front that the line needed truncation - assert(FrontColumnsRemoved+ColumnsKept+BackColumnsRemoved > Columns); + assert(FrontColumnsRemoved + ColumnsKept + BackColumnsRemoved > + NonGutterColumns); // The line needs some truncation, and we'd prefer to keep the front // if possible, so remove the back - if (BackColumnsRemoved > strlen(back_ellipse)) - SourceLine.replace(SourceEnd, std::string::npos, back_ellipse); + if (BackColumnsRemoved > Columns(strlen(back_ellipse))) + SourceLine.replace(SourceEnd.V, std::string::npos, back_ellipse); // If that's enough then we're done - if (FrontColumnsRemoved+ColumnsKept <= Columns) + if (FrontColumnsRemoved + ColumnsKept <= Columns(NonGutterColumns)) return; // Otherwise remove the front as well - if (FrontColumnsRemoved > strlen(front_ellipse)) { - SourceLine.replace(0, SourceStart, front_ellipse); - CaretLine.replace(0, CaretStart, front_space); + if (FrontColumnsRemoved > Columns(strlen(front_ellipse))) { + SourceLine.replace(0, SourceStart.V, front_ellipse); + CaretLine.replace(0, CaretStart.V, front_space); if (!FixItInsertionLine.empty()) - FixItInsertionLine.replace(0, CaretStart, front_space); + FixItInsertionLine.replace(0, CaretStart.V, front_space); } 
} @@ -961,41 +1004,40 @@ maybeAddRange(std::pair A, std::pair B, struct LineRange { unsigned LineNo; - unsigned StartCol; - unsigned EndCol; + Bytes StartByte; + Bytes EndByte; }; /// Highlight \p R (with ~'s) on the current source line. static void highlightRange(const LineRange &R, const SourceColumnMap &Map, std::string &CaretLine) { // Pick the first non-whitespace column. - unsigned StartColNo = R.StartCol; - while (StartColNo < Map.getSourceLine().size() && - (Map.getSourceLine()[StartColNo] == ' ' || - Map.getSourceLine()[StartColNo] == '\t')) - StartColNo = Map.startOfNextColumn(StartColNo); + Bytes StartByte = R.StartByte; + while (StartByte < Map.bytes() && (Map.getSourceLine()[StartByte.V] == ' ' || + Map.getSourceLine()[StartByte.V] == '\t')) + StartByte = Map.startOfNextColumn(StartByte); // Pick the last non-whitespace column. - unsigned EndColNo = - std::min(static_cast(R.EndCol), Map.getSourceLine().size()); - while (EndColNo && (Map.getSourceLine()[EndColNo - 1] == ' ' || - Map.getSourceLine()[EndColNo - 1] == '\t')) - EndColNo = Map.startOfPreviousColumn(EndColNo); + Bytes EndByte = std::min(R.EndByte.V, Map.bytes().V); + while (EndByte.V != 0 && (Map.getSourceLine()[EndByte.V - 1] == ' ' || + Map.getSourceLine()[EndByte.V - 1] == '\t')) + EndByte = Map.startOfPreviousColumn(EndByte); // If the start/end passed each other, then we are trying to highlight a // range that just exists in whitespace. That most likely means we have // a multi-line highlighting range that covers a blank line. - if (StartColNo > EndColNo) + if (StartByte > EndByte) return; + assert(StartByte <= EndByte && "Invalid range!"); // Fill the range with ~'s. 
- StartColNo = Map.byteToContainingColumn(StartColNo); - EndColNo = Map.byteToContainingColumn(EndColNo); + Columns StartCol = Map.byteToContainingColumn(StartByte); + Columns EndCol = Map.byteToContainingColumn(EndByte); + + if (CaretLine.size() < static_cast(EndCol.V)) + CaretLine.resize(EndCol.V, ' '); - assert(StartColNo <= EndColNo && "Invalid range!"); - if (CaretLine.size() < EndColNo) - CaretLine.resize(EndColNo, ' '); - std::fill(CaretLine.begin() + StartColNo, CaretLine.begin() + EndColNo, '~'); + std::fill(CaretLine.begin() + StartCol.V, CaretLine.begin() + EndCol.V, '~'); } static std::string buildFixItInsertionLine(FileID FID, unsigned LineNo, @@ -1006,7 +1048,7 @@ static std::string buildFixItInsertionLine(FileID FID, unsigned LineNo, std::string FixItInsertionLine; if (Hints.empty() || !DiagOpts.ShowFixits) return FixItInsertionLine; - unsigned PrevHintEndCol = 0; + Columns PrevHintEndCol = 0; for (const auto &H : Hints) { if (H.CodeToInsert.empty()) @@ -1024,12 +1066,13 @@ static std::string buildFixItInsertionLine(FileID FID, unsigned LineNo, // Note: When modifying this function, be very careful about what is a // "column" (printed width, platform-dependent) and what is a // "byte offset" (SourceManager "column"). 
- unsigned HintByteOffset = - SM.getColumnNumber(HintLocInfo.first, HintLocInfo.second) - 1; + Bytes HintByteOffset = + Bytes(SM.getColumnNumber(HintLocInfo.first, HintLocInfo.second)) + .prev(); // The hint must start inside the source or right at the end - assert(HintByteOffset < static_cast(map.bytes()) + 1); - unsigned HintCol = map.byteToContainingColumn(HintByteOffset); + assert(HintByteOffset < map.bytes().next()); + Columns HintCol = map.byteToContainingColumn(HintByteOffset); // If we inserted a long previous hint, push this one forwards, and add // an extra space to show that this is not part of the previous @@ -1043,11 +1086,11 @@ static std::string buildFixItInsertionLine(FileID FID, unsigned LineNo, // This should NOT use HintByteOffset, because the source might have // Unicode characters in earlier columns. - unsigned NewFixItLineSize = FixItInsertionLine.size() + - (HintCol - PrevHintEndCol) + - H.CodeToInsert.size(); + Columns NewFixItLineSize = Columns(FixItInsertionLine.size()) + + (HintCol - PrevHintEndCol) + + Columns(H.CodeToInsert.size()); if (NewFixItLineSize > FixItInsertionLine.size()) - FixItInsertionLine.resize(NewFixItLineSize, ' '); + FixItInsertionLine.resize(NewFixItLineSize.V, ' '); std::copy(H.CodeToInsert.begin(), H.CodeToInsert.end(), FixItInsertionLine.end() - H.CodeToInsert.size()); @@ -1095,28 +1138,29 @@ prepareAndFilterRanges(const SmallVectorImpl &Ranges, if (EndLineNo < Lines.first || SM.getFileID(End) != FID) continue; - unsigned StartColumn = SM.getExpansionColumnNumber(Begin); - unsigned EndColumn = SM.getExpansionColumnNumber(End); - assert(StartColumn && "StartColumn must be valid, 0 is invalid"); - assert(EndColumn && "EndColumn must be valid, 0 is invalid"); + Bytes StartByte = SM.getExpansionColumnNumber(Begin); + Bytes EndByte = SM.getExpansionColumnNumber(End); + assert(StartByte.V != 0 && "StartByte must be valid, 0 is invalid"); + assert(EndByte.V != 0 && "EndByte must be valid, 0 is invalid"); if 
(R.isTokenRange()) - EndColumn += Lexer::MeasureTokenLength(End, SM, LangOpts); + EndByte += Bytes(Lexer::MeasureTokenLength(End, SM, LangOpts)); // Only a single line. if (StartLineNo == EndLineNo) { - LineRanges.push_back({StartLineNo, StartColumn - 1, EndColumn - 1}); + LineRanges.push_back({StartLineNo, StartByte.prev(), EndByte.prev()}); continue; } // Start line. - LineRanges.push_back({StartLineNo, StartColumn - 1, ~0u}); + LineRanges.push_back( + {StartLineNo, StartByte.prev(), std::numeric_limits::max()}); // Middle lines. for (unsigned S = StartLineNo + 1; S != EndLineNo; ++S) - LineRanges.push_back({S, 0, ~0u}); + LineRanges.push_back({S, 0, std::numeric_limits::max()}); // End line. - LineRanges.push_back({EndLineNo, 0, EndColumn - 1}); + LineRanges.push_back({EndLineNo, 0, EndByte.prev()}); } return LineRanges; @@ -1226,8 +1270,7 @@ highlightLines(StringRef FileData, unsigned StartLineNumber, if (TokenStartLine > EndLineNumber) break; - unsigned StartCol = - SM.getSpellingColumnNumber(T.getLocation(), &Invalid) - 1; + Bytes StartCol = SM.getSpellingColumnNumber(T.getLocation(), &Invalid) - 1; if (Invalid) continue; @@ -1235,14 +1278,14 @@ highlightLines(StringRef FileData, unsigned StartLineNumber, if (TokenStartLine == TokenEndLine) { SmallVector &LineRanges = SnippetRanges[TokenStartLine - StartLineNumber]; - appendStyle(LineRanges, T, StartCol, T.getLength()); + appendStyle(LineRanges, T, StartCol.V, T.getLength()); continue; } assert((TokenEndLine - TokenStartLine) >= 1); // For tokens that span multiple lines (think multiline comments), we // divide them into multiple StyleRanges. 
- unsigned EndCol = SM.getSpellingColumnNumber(T.getEndLoc(), &Invalid) - 1; + Bytes EndCol = SM.getSpellingColumnNumber(T.getEndLoc(), &Invalid) - 1; if (Invalid) continue; @@ -1258,9 +1301,9 @@ highlightLines(StringRef FileData, unsigned StartLineNumber, SnippetRanges[L - StartLineNumber]; if (L == TokenStartLine) // First line - appendStyle(LineRanges, T, StartCol, LineLength); + appendStyle(LineRanges, T, StartCol.V, LineLength); else if (L == TokenEndLine) // Last line - appendStyle(LineRanges, T, 0, EndCol); + appendStyle(LineRanges, T, 0, EndCol.V); else appendStyle(LineRanges, T, 0, LineLength); } @@ -1315,11 +1358,11 @@ void TextDiagnostic::emitSnippetAndCaret( const char *BufEnd = BufStart + BufData.size(); unsigned CaretLineNo = Loc.getLineNumber(); - unsigned CaretColNo = Loc.getColumnNumber(); + Bytes CaretByte = Loc.getColumnNumber(); // Arbitrarily stop showing snippets when the line is too long. static const size_t MaxLineLengthToPrint = 4096; - if (CaretColNo > MaxLineLengthToPrint) + if (CaretByte > MaxLineLengthToPrint) return; // Find the set of lines to include. @@ -1379,35 +1422,37 @@ void TextDiagnostic::emitSnippetAndCaret( std::string SourceLine(LineStart, LineEnd); // Remove trailing null bytes. while (!SourceLine.empty() && SourceLine.back() == '\0' && - (LineNo != CaretLineNo || SourceLine.size() > CaretColNo)) + (LineNo != CaretLineNo || + SourceLine.size() > static_cast(CaretByte.V))) SourceLine.pop_back(); // Build the byte to column map. - const SourceColumnMap sourceColMap(SourceLine, DiagOpts.TabStop); + const SourceColumnMap SourceColMap(SourceLine, DiagOpts.TabStop); std::string CaretLine; // Highlight all of the characters covered by Ranges with ~ characters. for (const auto &LR : LineRanges) { if (LR.LineNo == LineNo) - highlightRange(LR, sourceColMap, CaretLine); + highlightRange(LR, SourceColMap, CaretLine); } // Next, insert the caret itself. 
if (CaretLineNo == LineNo) { - size_t Col = sourceColMap.byteToContainingColumn(CaretColNo - 1); - CaretLine.resize(std::max(Col + 1, CaretLine.size()), ' '); - CaretLine[Col] = '^'; + Columns Col = SourceColMap.byteToContainingColumn(CaretByte.prev()); + CaretLine.resize( + std::max(static_cast(Col.V) + 1, CaretLine.size()), ' '); + CaretLine[Col.V] = '^'; } std::string FixItInsertionLine = - buildFixItInsertionLine(FID, LineNo, sourceColMap, Hints, SM, DiagOpts); + buildFixItInsertionLine(FID, LineNo, SourceColMap, Hints, SM, DiagOpts); // If the source line is too long for our terminal, select only the // "interesting" source region within that line. - unsigned Columns = DiagOpts.MessageLength; - if (Columns) + Columns MessageLength = DiagOpts.MessageLength; + if (MessageLength.V != 0) selectInterestingSourceRegion(SourceLine, CaretLine, FixItInsertionLine, - Columns, sourceColMap); + MessageLength, SourceColMap); // If we are in -fdiagnostics-print-source-range-info mode, we are trying // to produce easily machine parsable output. Add a space before the From ea1fdeb5488cb728bd83c578a950a1afd78fd34d Mon Sep 17 00:00:00 2001 From: Valery Pykhtin Date: Thu, 30 Oct 2025 09:41:33 +0100 Subject: [PATCH 163/539] [utils][UpdateTestChecks] Extract MIR functionality into separate mir.py module (#165535) This commit extracts some MIR-related code from `common.py` and `update_mir_test_checks.py` into a dedicated `mir.py` module to improve code organization. This is a preparation step for https://github.com/llvm/llvm-project/pull/164965 and also moves some pieces already moved by https://github.com/llvm/llvm-project/pull/140296 All code intentionally moved verbatim with minimal necessary adaptations: * `log()` calls converted to `print(..., file=sys.stderr)` at `mir.py` lines 62, 64 due to a `log` locality. 
--- llvm/utils/UpdateTestChecks/common.py | 238 ------------ llvm/utils/UpdateTestChecks/mir.py | 362 ++++++++++++++++++ .../update_givaluetracking_test_checks.py | 3 +- llvm/utils/update_mir_test_checks.py | 121 +----- 4 files changed, 367 insertions(+), 357 deletions(-) create mode 100644 llvm/utils/UpdateTestChecks/mir.py diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index 8cd200c93a482..b6b80ea117672 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -2396,244 +2396,6 @@ def add_analyze_checks( ) -IR_FUNC_NAME_RE = re.compile( - r"^\s*define\s+(?:internal\s+)?[^@]*@(?P[A-Za-z0-9_.]+)\s*\(" -) -IR_PREFIX_DATA_RE = re.compile(r"^ *(;|$)") -MIR_FUNC_NAME_RE = re.compile(r" *name: *(?P[A-Za-z0-9_.-]+)") -MIR_BODY_BEGIN_RE = re.compile(r" *body: *\|") -MIR_BASIC_BLOCK_RE = re.compile(r" *bb\.[0-9]+.*:$") -MIR_PREFIX_DATA_RE = re.compile(r"^ *(;|bb.[0-9].*: *$|[a-z]+:( |$)|$)") - - -def find_mir_functions_with_one_bb(lines, verbose=False): - result = [] - cur_func = None - bbs = 0 - for line in lines: - m = MIR_FUNC_NAME_RE.match(line) - if m: - if bbs == 1: - result.append(cur_func) - cur_func = m.group("func") - bbs = 0 - m = MIR_BASIC_BLOCK_RE.match(line) - if m: - bbs += 1 - if bbs == 1: - result.append(cur_func) - return result - - -def add_mir_checks_for_function( - test, - output_lines, - run_list, - func_dict, - func_name, - single_bb, - print_fixed_stack, - first_check_is_next, - at_the_function_name, -): - printed_prefixes = set() - for run in run_list: - for prefix in run[0]: - if prefix in printed_prefixes: - break - if not func_dict[prefix][func_name]: - continue - if printed_prefixes: - # Add some space between different check prefixes. 
- indent = len(output_lines[-1]) - len(output_lines[-1].lstrip(" ")) - output_lines.append(" " * indent + ";") - printed_prefixes.add(prefix) - add_mir_check_lines( - test, - output_lines, - prefix, - ("@" if at_the_function_name else "") + func_name, - single_bb, - func_dict[prefix][func_name], - print_fixed_stack, - first_check_is_next, - ) - break - else: - warn( - "Found conflicting asm for function: {}".format(func_name), - test_file=test, - ) - return output_lines - - -def add_mir_check_lines( - test, - output_lines, - prefix, - func_name, - single_bb, - func_info, - print_fixed_stack, - first_check_is_next, -): - func_body = str(func_info).splitlines() - if single_bb: - # Don't bother checking the basic block label for a single BB - func_body.pop(0) - - if not func_body: - warn( - "Function has no instructions to check: {}".format(func_name), - test_file=test, - ) - return - - first_line = func_body[0] - indent = len(first_line) - len(first_line.lstrip(" ")) - # A check comment, indented the appropriate amount - check = "{:>{}}; {}".format("", indent, prefix) - - output_lines.append("{}-LABEL: name: {}".format(check, func_name)) - - if print_fixed_stack: - output_lines.append("{}: fixedStack:".format(check)) - for stack_line in func_info.extrascrub.splitlines(): - filecheck_directive = check + "-NEXT" - output_lines.append("{}: {}".format(filecheck_directive, stack_line)) - - first_check = not first_check_is_next - for func_line in func_body: - if not func_line.strip(): - # The mir printer prints leading whitespace so we can't use CHECK-EMPTY: - output_lines.append(check + "-NEXT: {{" + func_line + "$}}") - continue - filecheck_directive = check if first_check else check + "-NEXT" - first_check = False - check_line = "{}: {}".format(filecheck_directive, func_line[indent:]).rstrip() - output_lines.append(check_line) - - -def should_add_mir_line_to_output(input_line, prefix_set): - # Skip any check lines that we're handling as well as comments - m = 
CHECK_RE.match(input_line) - if (m and m.group(1) in prefix_set) or input_line.strip() == ";": - return False - return True - - -def add_mir_checks( - input_lines, - prefix_set, - autogenerated_note, - test, - run_list, - func_dict, - print_fixed_stack, - first_check_is_next, - at_the_function_name, -): - simple_functions = find_mir_functions_with_one_bb(input_lines) - - output_lines = [] - output_lines.append(autogenerated_note) - - func_name = None - state = "toplevel" - for input_line in input_lines: - if input_line == autogenerated_note: - continue - - if state == "toplevel": - m = IR_FUNC_NAME_RE.match(input_line) - if m: - state = "ir function prefix" - func_name = m.group("func") - if input_line.rstrip("| \r\n") == "---": - state = "document" - output_lines.append(input_line) - elif state == "document": - m = MIR_FUNC_NAME_RE.match(input_line) - if m: - state = "mir function metadata" - func_name = m.group("func") - if input_line.strip() == "...": - state = "toplevel" - func_name = None - if should_add_mir_line_to_output(input_line, prefix_set): - output_lines.append(input_line) - elif state == "mir function metadata": - if should_add_mir_line_to_output(input_line, prefix_set): - output_lines.append(input_line) - m = MIR_BODY_BEGIN_RE.match(input_line) - if m: - if func_name in simple_functions: - # If there's only one block, put the checks inside it - state = "mir function prefix" - continue - state = "mir function body" - add_mir_checks_for_function( - test, - output_lines, - run_list, - func_dict, - func_name, - single_bb=False, - print_fixed_stack=print_fixed_stack, - first_check_is_next=first_check_is_next, - at_the_function_name=at_the_function_name, - ) - elif state == "mir function prefix": - m = MIR_PREFIX_DATA_RE.match(input_line) - if not m: - state = "mir function body" - add_mir_checks_for_function( - test, - output_lines, - run_list, - func_dict, - func_name, - single_bb=True, - print_fixed_stack=print_fixed_stack, - 
first_check_is_next=first_check_is_next, - at_the_function_name=at_the_function_name, - ) - - if should_add_mir_line_to_output(input_line, prefix_set): - output_lines.append(input_line) - elif state == "mir function body": - if input_line.strip() == "...": - state = "toplevel" - func_name = None - if should_add_mir_line_to_output(input_line, prefix_set): - output_lines.append(input_line) - elif state == "ir function prefix": - m = IR_PREFIX_DATA_RE.match(input_line) - if not m: - state = "ir function body" - add_mir_checks_for_function( - test, - output_lines, - run_list, - func_dict, - func_name, - single_bb=False, - print_fixed_stack=print_fixed_stack, - first_check_is_next=first_check_is_next, - at_the_function_name=at_the_function_name, - ) - - if should_add_mir_line_to_output(input_line, prefix_set): - output_lines.append(input_line) - elif state == "ir function body": - if input_line.strip() == "}": - state = "toplevel" - func_name = None - if should_add_mir_line_to_output(input_line, prefix_set): - output_lines.append(input_line) - return output_lines - - def build_global_values_dictionary(glob_val_dict, raw_tool_output, prefixes, ginfo): for nameless_value in ginfo.get_nameless_values(): if nameless_value.global_ir_rhs_regexp is None: diff --git a/llvm/utils/UpdateTestChecks/mir.py b/llvm/utils/UpdateTestChecks/mir.py new file mode 100644 index 0000000000000..24bb8b341d335 --- /dev/null +++ b/llvm/utils/UpdateTestChecks/mir.py @@ -0,0 +1,362 @@ +"""MIR test utility functions for UpdateTestChecks scripts.""" + +import re +import sys +from UpdateTestChecks import common +from UpdateTestChecks.common import ( + CHECK_RE, + warn, +) + +IR_FUNC_NAME_RE = re.compile( + r"^\s*define\s+(?:internal\s+)?[^@]*@(?P[A-Za-z0-9_.]+)\s*\(" +) +IR_PREFIX_DATA_RE = re.compile(r"^ *(;|$)") +MIR_FUNC_NAME_RE = re.compile(r" *name: *(?P[A-Za-z0-9_.-]+)") +MIR_BODY_BEGIN_RE = re.compile(r" *body: *\|") +MIR_BASIC_BLOCK_RE = re.compile(r" *bb\.[0-9]+.*:$") +MIR_PREFIX_DATA_RE = 
re.compile(r"^ *(;|bb.[0-9].*: *$|[a-z]+:( |$)|$)") + +VREG_RE = re.compile(r"(%[0-9]+)(?:\.[a-z0-9_]+)?(?::[a-z0-9_]+)?(?:\([<>a-z0-9 ]+\))?") +MI_FLAGS_STR = ( + r"(frame-setup |frame-destroy |nnan |ninf |nsz |arcp |contract |afn " + r"|reassoc |nuw |nsw |exact |nofpexcept |nomerge |unpredictable " + r"|noconvergent |nneg |disjoint |nusw |samesign |inbounds )*" +) +VREG_DEF_FLAGS_STR = r"(?:dead |undef )*" + +# Pattern to match the defined vregs and the opcode of an instruction that +# defines vregs. Opcodes starting with a lower-case 't' are allowed to match +# ARM's thumb instructions, like tADDi8 and t2ADDri. +VREG_DEF_RE = re.compile( + r"^ *(?P{2}{0}(?:, {2}{0})*) = " + r"{1}(?P[A-Zt][A-Za-z0-9_]+)".format( + VREG_RE.pattern, MI_FLAGS_STR, VREG_DEF_FLAGS_STR + ) +) + +MIR_FUNC_RE = re.compile( + r"^---$" + r"\n" + r"^ *name: *(?P[A-Za-z0-9_.-]+)$" + r".*?" + r"(?:^ *fixedStack: *(\[\])? *\n" + r"(?P.*?)\n?" + r"^ *stack:" + r".*?)?" + r"^ *body: *\|\n" + r"(?P.*?)\n" + r"^\.\.\.$", + flags=(re.M | re.S), +) + + +def build_function_info_dictionary( + test, raw_tool_output, triple, prefixes, func_dict, verbose +): + for m in MIR_FUNC_RE.finditer(raw_tool_output): + func = m.group("func") + fixedStack = m.group("fixedStack") + body = m.group("body") + if verbose: + print("Processing function: {}".format(func), file=sys.stderr) + for l in body.splitlines(): + print(" {}".format(l), file=sys.stderr) + + # Vreg mangling + mangled = [] + vreg_map = {} + for func_line in body.splitlines(keepends=True): + m = VREG_DEF_RE.match(func_line) + if m: + for vreg in VREG_RE.finditer(m.group("vregs")): + if vreg.group(1) in vreg_map: + name = vreg_map[vreg.group(1)] + else: + name = mangle_vreg(m.group("opcode"), vreg_map.values()) + vreg_map[vreg.group(1)] = name + func_line = func_line.replace( + vreg.group(1), "[[{}:%[0-9]+]]".format(name), 1 + ) + for number, name in vreg_map.items(): + func_line = re.sub( + r"{}\b".format(number), "[[{}]]".format(name), func_line + ) + 
mangled.append(func_line) + body = "".join(mangled) + + for prefix in prefixes: + info = common.function_body( + body, fixedStack, None, None, None, None, ginfo=None + ) + if func in func_dict[prefix]: + if ( + not func_dict[prefix][func] + or func_dict[prefix][func].scrub != info.scrub + or func_dict[prefix][func].extrascrub != info.extrascrub + ): + func_dict[prefix][func] = None + else: + func_dict[prefix][func] = info + + +def mangle_vreg(opcode, current_names): + base = opcode + # Simplify some common prefixes and suffixes + if opcode.startswith("G_"): + base = base[len("G_") :] + if opcode.endswith("_PSEUDO"): + base = base[: len("_PSEUDO")] + # Shorten some common opcodes with long-ish names + base = dict( + IMPLICIT_DEF="DEF", + GLOBAL_VALUE="GV", + CONSTANT="C", + FCONSTANT="C", + MERGE_VALUES="MV", + UNMERGE_VALUES="UV", + INTRINSIC="INT", + INTRINSIC_W_SIDE_EFFECTS="INT", + INSERT_VECTOR_ELT="IVEC", + EXTRACT_VECTOR_ELT="EVEC", + SHUFFLE_VECTOR="SHUF", + ).get(base, base) + # Avoid ambiguity when opcodes end in numbers + if len(base.rstrip("0123456789")) < len(base): + base += "_" + + i = 0 + for name in current_names: + if name.rstrip("0123456789") == base: + i += 1 + if i: + return "{}{}".format(base, i) + return base + + +def find_mir_functions_with_one_bb(lines, verbose=False): + result = [] + cur_func = None + bbs = 0 + for line in lines: + m = MIR_FUNC_NAME_RE.match(line) + if m: + if bbs == 1: + result.append(cur_func) + cur_func = m.group("func") + bbs = 0 + m = MIR_BASIC_BLOCK_RE.match(line) + if m: + bbs += 1 + if bbs == 1: + result.append(cur_func) + return result + + +def add_mir_checks_for_function( + test, + output_lines, + run_list, + func_dict, + func_name, + single_bb, + print_fixed_stack, + first_check_is_next, + at_the_function_name, +): + printed_prefixes = set() + for run in run_list: + for prefix in run[0]: + if prefix in printed_prefixes: + break + if not func_dict[prefix][func_name]: + continue + if printed_prefixes: + # Add some 
space between different check prefixes. + indent = len(output_lines[-1]) - len(output_lines[-1].lstrip(" ")) + output_lines.append(" " * indent + ";") + printed_prefixes.add(prefix) + add_mir_check_lines( + test, + output_lines, + prefix, + ("@" if at_the_function_name else "") + func_name, + single_bb, + func_dict[prefix][func_name], + print_fixed_stack, + first_check_is_next, + ) + break + else: + warn( + "Found conflicting asm for function: {}".format(func_name), + test_file=test, + ) + return output_lines + + +def add_mir_check_lines( + test, + output_lines, + prefix, + func_name, + single_bb, + func_info, + print_fixed_stack, + first_check_is_next, +): + func_body = str(func_info).splitlines() + if single_bb: + # Don't bother checking the basic block label for a single BB + func_body.pop(0) + + if not func_body: + warn( + "Function has no instructions to check: {}".format(func_name), + test_file=test, + ) + return + + first_line = func_body[0] + indent = len(first_line) - len(first_line.lstrip(" ")) + # A check comment, indented the appropriate amount + check = "{:>{}}; {}".format("", indent, prefix) + + output_lines.append("{}-LABEL: name: {}".format(check, func_name)) + + if print_fixed_stack: + output_lines.append("{}: fixedStack:".format(check)) + for stack_line in func_info.extrascrub.splitlines(): + filecheck_directive = check + "-NEXT" + output_lines.append("{}: {}".format(filecheck_directive, stack_line)) + + first_check = not first_check_is_next + for func_line in func_body: + if not func_line.strip(): + # The mir printer prints leading whitespace so we can't use CHECK-EMPTY: + output_lines.append(check + "-NEXT: {{" + func_line + "$}}") + continue + filecheck_directive = check if first_check else check + "-NEXT" + first_check = False + check_line = "{}: {}".format(filecheck_directive, func_line[indent:]).rstrip() + output_lines.append(check_line) + + +def should_add_mir_line_to_output(input_line, prefix_set): + # Skip any check lines that we're 
handling as well as comments + m = CHECK_RE.match(input_line) + if (m and m.group(1) in prefix_set) or input_line.strip() == ";": + return False + return True + + +def add_mir_checks( + input_lines, + prefix_set, + autogenerated_note, + test, + run_list, + func_dict, + print_fixed_stack, + first_check_is_next, + at_the_function_name, +): + simple_functions = find_mir_functions_with_one_bb(input_lines) + + output_lines = [] + output_lines.append(autogenerated_note) + + func_name = None + state = "toplevel" + for input_line in input_lines: + if input_line == autogenerated_note: + continue + + if state == "toplevel": + m = IR_FUNC_NAME_RE.match(input_line) + if m: + state = "ir function prefix" + func_name = m.group("func") + if input_line.rstrip("| \r\n") == "---": + state = "document" + output_lines.append(input_line) + elif state == "document": + m = MIR_FUNC_NAME_RE.match(input_line) + if m: + state = "mir function metadata" + func_name = m.group("func") + if input_line.strip() == "...": + state = "toplevel" + func_name = None + if should_add_mir_line_to_output(input_line, prefix_set): + output_lines.append(input_line) + elif state == "mir function metadata": + if should_add_mir_line_to_output(input_line, prefix_set): + output_lines.append(input_line) + m = MIR_BODY_BEGIN_RE.match(input_line) + if m: + if func_name in simple_functions: + # If there's only one block, put the checks inside it + state = "mir function prefix" + continue + state = "mir function body" + add_mir_checks_for_function( + test, + output_lines, + run_list, + func_dict, + func_name, + single_bb=False, + print_fixed_stack=print_fixed_stack, + first_check_is_next=first_check_is_next, + at_the_function_name=at_the_function_name, + ) + elif state == "mir function prefix": + m = MIR_PREFIX_DATA_RE.match(input_line) + if not m: + state = "mir function body" + add_mir_checks_for_function( + test, + output_lines, + run_list, + func_dict, + func_name, + single_bb=True, + 
print_fixed_stack=print_fixed_stack, + first_check_is_next=first_check_is_next, + at_the_function_name=at_the_function_name, + ) + + if should_add_mir_line_to_output(input_line, prefix_set): + output_lines.append(input_line) + elif state == "mir function body": + if input_line.strip() == "...": + state = "toplevel" + func_name = None + if should_add_mir_line_to_output(input_line, prefix_set): + output_lines.append(input_line) + elif state == "ir function prefix": + m = IR_PREFIX_DATA_RE.match(input_line) + if not m: + state = "ir function body" + add_mir_checks_for_function( + test, + output_lines, + run_list, + func_dict, + func_name, + single_bb=False, + print_fixed_stack=print_fixed_stack, + first_check_is_next=first_check_is_next, + at_the_function_name=at_the_function_name, + ) + + if should_add_mir_line_to_output(input_line, prefix_set): + output_lines.append(input_line) + elif state == "ir function body": + if input_line.strip() == "}": + state = "toplevel" + func_name = None + if should_add_mir_line_to_output(input_line, prefix_set): + output_lines.append(input_line) + return output_lines diff --git a/llvm/utils/update_givaluetracking_test_checks.py b/llvm/utils/update_givaluetracking_test_checks.py index 49b068ac7bef0..9ad0f3ec9ad1c 100755 --- a/llvm/utils/update_givaluetracking_test_checks.py +++ b/llvm/utils/update_givaluetracking_test_checks.py @@ -19,6 +19,7 @@ import sys from UpdateTestChecks import common +from UpdateTestChecks import mir VT_FUNCTION_RE = re.compile( r"\s*name:\s*@(?P[A-Za-z0-9_-]+)" @@ -92,7 +93,7 @@ def update_test(ti: common.TestInfo): func_dict = builder.finish_and_get_func_dict() prefix_set = set([prefix for p in run_list for prefix in p[0]]) common.debug("Rewriting FileCheck prefixes:", str(prefix_set)) - output_lines = common.add_mir_checks( + output_lines = mir.add_mir_checks( ti.input_lines, prefix_set, ti.test_autogenerated_note, diff --git a/llvm/utils/update_mir_test_checks.py b/llvm/utils/update_mir_test_checks.py index 
c4ee0523a6469..ba70249db28e6 100755 --- a/llvm/utils/update_mir_test_checks.py +++ b/llvm/utils/update_mir_test_checks.py @@ -31,39 +31,7 @@ import sys from UpdateTestChecks import common - -VREG_RE = re.compile(r"(%[0-9]+)(?:\.[a-z0-9_]+)?(?::[a-z0-9_]+)?(?:\([<>a-z0-9 ]+\))?") -MI_FLAGS_STR = ( - r"(frame-setup |frame-destroy |nnan |ninf |nsz |arcp |contract |afn " - r"|reassoc |nuw |nsw |exact |nofpexcept |nomerge |unpredictable " - r"|noconvergent |nneg |disjoint |nusw |samesign |inbounds )*" -) -VREG_DEF_FLAGS_STR = r"(?:dead |undef )*" - -# Pattern to match the defined vregs and the opcode of an instruction that -# defines vregs. Opcodes starting with a lower-case 't' are allowed to match -# ARM's thumb instructions, like tADDi8 and t2ADDri. -VREG_DEF_RE = re.compile( - r"^ *(?P{2}{0}(?:, {2}{0})*) = " - r"{1}(?P[A-Zt][A-Za-z0-9_]+)".format( - VREG_RE.pattern, MI_FLAGS_STR, VREG_DEF_FLAGS_STR - ) -) - -MIR_FUNC_RE = re.compile( - r"^---$" - r"\n" - r"^ *name: *(?P[A-Za-z0-9_.-]+)$" - r".*?" - r"(?:^ *fixedStack: *(\[\])? *\n" - r"(?P.*?)\n?" - r"^ *stack:" - r".*?)?" 
- r"^ *body: *\|\n" - r"(?P.*?)\n" - r"^\.\.\.$", - flags=(re.M | re.S), -) +from UpdateTestChecks import mir class LLC: @@ -143,89 +111,6 @@ def build_run_list(test, run_lines, verbose=False): return run_list -def build_function_info_dictionary( - test, raw_tool_output, triple, prefixes, func_dict, verbose -): - for m in MIR_FUNC_RE.finditer(raw_tool_output): - func = m.group("func") - fixedStack = m.group("fixedStack") - body = m.group("body") - if verbose: - log("Processing function: {}".format(func)) - for l in body.splitlines(): - log(" {}".format(l)) - - # Vreg mangling - mangled = [] - vreg_map = {} - for func_line in body.splitlines(keepends=True): - m = VREG_DEF_RE.match(func_line) - if m: - for vreg in VREG_RE.finditer(m.group("vregs")): - if vreg.group(1) in vreg_map: - name = vreg_map[vreg.group(1)] - else: - name = mangle_vreg(m.group("opcode"), vreg_map.values()) - vreg_map[vreg.group(1)] = name - func_line = func_line.replace( - vreg.group(1), "[[{}:%[0-9]+]]".format(name), 1 - ) - for number, name in vreg_map.items(): - func_line = re.sub( - r"{}\b".format(number), "[[{}]]".format(name), func_line - ) - mangled.append(func_line) - body = "".join(mangled) - - for prefix in prefixes: - info = common.function_body( - body, fixedStack, None, None, None, None, ginfo=None - ) - if func in func_dict[prefix]: - if ( - not func_dict[prefix][func] - or func_dict[prefix][func].scrub != info.scrub - or func_dict[prefix][func].extrascrub != info.extrascrub - ): - func_dict[prefix][func] = None - else: - func_dict[prefix][func] = info - - -def mangle_vreg(opcode, current_names): - base = opcode - # Simplify some common prefixes and suffixes - if opcode.startswith("G_"): - base = base[len("G_") :] - if opcode.endswith("_PSEUDO"): - base = base[: len("_PSEUDO")] - # Shorten some common opcodes with long-ish names - base = dict( - IMPLICIT_DEF="DEF", - GLOBAL_VALUE="GV", - CONSTANT="C", - FCONSTANT="C", - MERGE_VALUES="MV", - UNMERGE_VALUES="UV", - INTRINSIC="INT", 
- INTRINSIC_W_SIDE_EFFECTS="INT", - INSERT_VECTOR_ELT="IVEC", - EXTRACT_VECTOR_ELT="EVEC", - SHUFFLE_VECTOR="SHUF", - ).get(base, base) - # Avoid ambiguity when opcodes end in numbers - if len(base.rstrip("0123456789")) < len(base): - base += "_" - - i = 0 - for name in current_names: - if name.rstrip("0123456789") == base: - i += 1 - if i: - return "{}{}".format(base, i) - return base - - def update_test_file(args, test, autogenerated_note): with open(test) as fd: input_lines = [l.rstrip() for l in fd] @@ -247,7 +132,7 @@ def update_test_file(args, test, autogenerated_note): common.warn("No triple found: skipping file", test_file=test) return - build_function_info_dictionary( + mir.build_function_info_dictionary( test, raw_tool_output, triple_in_cmd or triple_in_ir, @@ -259,7 +144,7 @@ def update_test_file(args, test, autogenerated_note): prefix_set = set([prefix for run in run_list for prefix in run[0]]) log("Rewriting FileCheck prefixes: {}".format(prefix_set), args.verbose) - output_lines = common.add_mir_checks( + output_lines = mir.add_mir_checks( input_lines, prefix_set, autogenerated_note, From 5e126e4d84d7b2aa572b19dd20d64c9c896e1d19 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Thu, 30 Oct 2025 11:51:25 +0300 Subject: [PATCH 164/539] [clang] Update C++ DR status page --- clang/www/cxx_dr_status.html | 260 +++++++++++++++++++++++++++++++---- 1 file changed, 232 insertions(+), 28 deletions(-) diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index b7da22cf9fb22..ae9b28ee625cd 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -3113,11 +3113,11 @@

C++ defect report implementation status

Default initialization of POD classes? N/A - +
511 - open + NAD POD-structs with template assignment operators - Not resolved + Unknown 512 @@ -10895,7 +10895,7 @@

C++ defect report implementation status

1845 - drafting + review Point of instantiation of a variable template specialization Not resolved @@ -12081,7 +12081,7 @@

C++ defect report implementation status

2042 - drafting + review Exceptions and deallocation functions Not resolved @@ -12335,7 +12335,7 @@

C++ defect report implementation status

2084 CD4 NSDMIs and deleted union default constructors - Unknown + Clang 3.1 2085 @@ -12837,7 +12837,7 @@

C++ defect report implementation status

2168 - open + review Narrowing conversions and +/- infinity Not resolved @@ -14237,11 +14237,11 @@

C++ defect report implementation status

Constexpr virtual functions and temporary objects Unknown - + 2401 - drafting + C++20 Array decay vs prohibition of subobject non-type arguments - Not resolved + Unknown 2402 @@ -15171,7 +15171,7 @@

C++ defect report implementation status

2555 - drafting + tentatively ready Ineffective redeclaration prevention for using-declarators Not resolved @@ -15311,23 +15311,23 @@

C++ defect report implementation status

Undefined behavior for preprocessing directives in macro arguments Not resolved - + 2578 - open + CD7 Undefined behavior when creating an invalid string literal via stringizing - Not resolved + Unknown - + 2579 - open + CD7 Undefined behavior when token pasting does not create a preprocessing token - Not resolved + Unknown - + 2580 - open + CD7 Undefined behavior with #line - Not resolved + Unknown 2581 @@ -17104,7 +17104,7 @@

C++ defect report implementation status

2875 - review + tentatively ready Missing support for round-tripping null pointer values through indirection/address operators Not resolved @@ -17400,7 +17400,7 @@

C++ defect report implementation status

2923 - review + tentatively ready Note about infinite loops and execution steps Not resolved @@ -17760,7 +17760,7 @@

C++ defect report implementation status

2983 - open + review Non-type template parameters are not variables Not resolved @@ -17868,7 +17868,7 @@

C++ defect report implementation status

3001 - review + tentatively ready Inconsistent restrictions for static_cast on pointers to out-of-lifetime objects Not resolved @@ -17932,7 +17932,7 @@

C++ defect report implementation status

3011 - open + tentatively ready Parenthesized aggregate initialization for new-expressions Not resolved @@ -17992,7 +17992,7 @@

C++ defect report implementation status

3021 - open + drafting Subsumption rules for fold expanded constraints Not resolved @@ -18058,7 +18058,7 @@

C++ defect report implementation status

3032 - open + tentatively ready Template argument disambiguation Not resolved @@ -18184,7 +18184,7 @@

C++ defect report implementation status

3053 - open + tentatively ready Allowing #undef likely Not resolved @@ -18265,6 +18265,210 @@

C++ defect report implementation status

tentatively ready Declarative nested-name-specifier in explicit instantiation Not resolved + + + 3067 + open + Array-to-pointer conversion with object type mismatch + Not resolved + + + 3068 + open + Access checking in friends involving qualified-ids + Not resolved + + + 3069 + open + Reference to wrong placeholder + Not resolved + + + 3070 + open + Trivial assignment can skip member subobjects + Not resolved + + + 3071 + open + Negative tuple_size in structured bindings + Not resolved + + + 3072 + open + Incorrect examples for lambda SFINAE + Not resolved + + + 3073 + open + Dependence of R on T2 is unclear + Not resolved + + + 3074 + tentatively ready + Redundant ill-formedness for module macros + Not resolved + + + 3075 + tentatively ready + Unclear matching of import directive + Not resolved + + + 3076 + tentatively ready + Remove unnecessary IFNDR for malformed header-name-tokens + Not resolved + + + 3077 + tentatively ready + Undesirable formation of import directive with string-literal + Not resolved + + + 3078 + review + Different treatment of #include pp-tokens and header-name-tokens + Not resolved + + + 3079 + open + Allow empty-declarations in anonymous unions + Not resolved + + + 3080 + tentatively ready + Clarify kinds of permitted template template arguments + Not resolved + + + 3081 + review + Require glvalue when splicing direct base class relationship + Not resolved + + + 3082 + tentatively ready + Allow for call-compatible function types in reinterpret_cast + Not resolved + + + 3083 + tentatively ready + Remove redundant restrictions on class and enum definitions + Not resolved + + + 3084 + tentatively ready + compound-statements inside iteration-statements + Not resolved + + + 3085 + tentatively ready + Apply restriction inside for-range-declaration + Not resolved + + + 3086 + tentatively ready + Destringizing should consider all sorts of encoding-prefixes + Not resolved + + + 3087 + open + Destringizing for raw string literals + Not resolved + + 
+ 3088 + open + Clarify macro treatment of identifiers with special meaning + Not resolved + + + 3089 + tentatively ready + const-default-constructible improperly handles std::meta::info + Not resolved + + + 3090 + tentatively ready + Internal linkage from header units + Not resolved + + + 3091 + review + Linking of translation units as sequences of tokens + Not resolved + + + 3092 + tentatively ready + base-specifiers are not "declared" + Not resolved + + + 3093 + open + Missing integration of direct base class relationships + Not resolved + + + 3094 + review + Rework phases for string literal concatenation and token formation + Not resolved + + + 3095 + open + Type-dependent packs that are not structured binding packs + Not resolved + + + 3096 + open + Value-dependence of size of structured binding pack with non-dependent initializer + Not resolved + + + 3097 + tentatively ready + Lambda expression introduces a scope + Not resolved + + + 3098 + tentatively ready + Remove redundancy "names or designates" + Not resolved + + + 3099 + open + Instantiation of type aliases from alias templates is unspecified + Not resolved + + + 3100 + open + Destruction order for objects with static storage duration + Not resolved From 5732d1bf496705971dd5e0ad4676a51b43718299 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 30 Oct 2025 09:00:36 +0000 Subject: [PATCH 165/539] [AArch64][GlobalISel] Add some GISel test coverage for icmp-and tests. 
NFC --- llvm/test/CodeGen/AArch64/arm64-srl-and.ll | 42 +- ...st-and-by-const-from-lshr-in-eqcmp-zero.ll | 414 ++++++++++++----- ...ist-and-by-const-from-shl-in-eqcmp-zero.ll | 424 ++++++++++++----- llvm/test/CodeGen/AArch64/signbit-test.ll | 22 +- .../AArch64/signed-truncation-check.ll | 434 ++++++++++++------ 5 files changed, 942 insertions(+), 394 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/arm64-srl-and.ll b/llvm/test/CodeGen/AArch64/arm64-srl-and.ll index b58f6ba96a5b8..330f27bd6c0cd 100644 --- a/llvm/test/CodeGen/AArch64/arm64-srl-and.ll +++ b/llvm/test/CodeGen/AArch64/arm64-srl-and.ll @@ -1,22 +1,38 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -O3 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -O3 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-linux-gnu -O3 -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; This used to miscompile: ; The 16-bit -1 should not become 32-bit -1 (sub w8, w8, #1). 
@g = global i16 0, align 4 define i32 @srl_and() { -; CHECK-LABEL: srl_and: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: adrp x8, :got:g -; CHECK-NEXT: mov w9, #50 -; CHECK-NEXT: ldr x8, [x8, :got_lo12:g] -; CHECK-NEXT: ldrh w8, [x8] -; CHECK-NEXT: eor w8, w8, w9 -; CHECK-NEXT: mov w9, #65535 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: and w0, w8, w8, lsr #16 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: srl_and: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: adrp x8, :got:g +; CHECK-SD-NEXT: mov w9, #50 // =0x32 +; CHECK-SD-NEXT: ldr x8, [x8, :got_lo12:g] +; CHECK-SD-NEXT: ldrh w8, [x8] +; CHECK-SD-NEXT: eor w8, w8, w9 +; CHECK-SD-NEXT: mov w9, #65535 // =0xffff +; CHECK-SD-NEXT: add w8, w8, w9 +; CHECK-SD-NEXT: and w0, w8, w8, lsr #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: srl_and: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adrp x8, :got:g +; CHECK-GI-NEXT: mov w9, #50 // =0x32 +; CHECK-GI-NEXT: ldr x8, [x8, :got_lo12:g] +; CHECK-GI-NEXT: ldrh w8, [x8] +; CHECK-GI-NEXT: eor w8, w8, w9 +; CHECK-GI-NEXT: mov w9, #65535 // =0xffff +; CHECK-GI-NEXT: add w8, w9, w8, uxth +; CHECK-GI-NEXT: and w9, w8, #0xffff +; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: cset w8, ne +; CHECK-GI-NEXT: and w0, w9, w8 +; CHECK-GI-NEXT: ret entry: %0 = load i16, ptr @g, align 4 %1 = xor i16 %0, 50 @@ -29,3 +45,5 @@ entry: ret i32 %and } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll index c3fdc7db2abbe..8438f0b03179c 100644 --- a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-unknown-unknown -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; We are looking for the following pattern here: ; (X & (C l>> Y)) ==/!= 0 @@ -13,12 +14,21 @@ ; i8 scalar define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind { -; CHECK-LABEL: scalar_i8_signbit_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, w1 -; CHECK-NEXT: tst w8, #0x80 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i8_signbit_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl w8, w0, w1 +; CHECK-SD-NEXT: tst w8, #0x80 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i8_signbit_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #128 // =0x80 +; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: lsr w8, w8, w9 +; CHECK-GI-NEXT: tst w8, w0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = lshr i8 128, %y %t1 = and i8 %t0, %x %res = icmp eq i8 %t1, 0 @@ -26,12 +36,21 @@ define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind { } define i1 @scalar_i8_lowestbit_eq(i8 %x, i8 %y) nounwind { -; CHECK-LABEL: scalar_i8_lowestbit_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, w1 -; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i8_lowestbit_eq: +; CHECK-SD: // %bb.0: +; 
CHECK-SD-NEXT: lsl w8, w0, w1 +; CHECK-SD-NEXT: tst w8, #0x1 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i8_lowestbit_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: lsr w8, w8, w9 +; CHECK-GI-NEXT: tst w8, w0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = lshr i8 1, %y %t1 = and i8 %t0, %x %res = icmp eq i8 %t1, 0 @@ -39,12 +58,21 @@ define i1 @scalar_i8_lowestbit_eq(i8 %x, i8 %y) nounwind { } define i1 @scalar_i8_bitsinmiddle_eq(i8 %x, i8 %y) nounwind { -; CHECK-LABEL: scalar_i8_bitsinmiddle_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, w1 -; CHECK-NEXT: tst w8, #0x18 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i8_bitsinmiddle_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl w8, w0, w1 +; CHECK-SD-NEXT: tst w8, #0x18 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i8_bitsinmiddle_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #24 // =0x18 +; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: lsr w8, w8, w9 +; CHECK-GI-NEXT: tst w8, w0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = lshr i8 24, %y %t1 = and i8 %t0, %x %res = icmp eq i8 %t1, 0 @@ -54,12 +82,21 @@ define i1 @scalar_i8_bitsinmiddle_eq(i8 %x, i8 %y) nounwind { ; i16 scalar define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind { -; CHECK-LABEL: scalar_i16_signbit_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, w1 -; CHECK-NEXT: tst w8, #0x8000 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i16_signbit_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl w8, w0, w1 +; CHECK-SD-NEXT: tst w8, #0x8000 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i16_signbit_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #32768 // =0x8000 +; CHECK-GI-NEXT: and w9, w1, #0xffff +; CHECK-GI-NEXT: lsr w8, w8, w9 +; CHECK-GI-NEXT: tst w8, w0 +; CHECK-GI-NEXT: cset 
w0, eq +; CHECK-GI-NEXT: ret %t0 = lshr i16 32768, %y %t1 = and i16 %t0, %x %res = icmp eq i16 %t1, 0 @@ -67,12 +104,21 @@ define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind { } define i1 @scalar_i16_lowestbit_eq(i16 %x, i16 %y) nounwind { -; CHECK-LABEL: scalar_i16_lowestbit_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, w1 -; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i16_lowestbit_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl w8, w0, w1 +; CHECK-SD-NEXT: tst w8, #0x1 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i16_lowestbit_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: and w9, w1, #0xffff +; CHECK-GI-NEXT: lsr w8, w8, w9 +; CHECK-GI-NEXT: tst w8, w0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = lshr i16 1, %y %t1 = and i16 %t0, %x %res = icmp eq i16 %t1, 0 @@ -80,12 +126,21 @@ define i1 @scalar_i16_lowestbit_eq(i16 %x, i16 %y) nounwind { } define i1 @scalar_i16_bitsinmiddle_eq(i16 %x, i16 %y) nounwind { -; CHECK-LABEL: scalar_i16_bitsinmiddle_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, w1 -; CHECK-NEXT: tst w8, #0xff0 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i16_bitsinmiddle_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl w8, w0, w1 +; CHECK-SD-NEXT: tst w8, #0xff0 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i16_bitsinmiddle_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #4080 // =0xff0 +; CHECK-GI-NEXT: and w9, w1, #0xffff +; CHECK-GI-NEXT: lsr w8, w8, w9 +; CHECK-GI-NEXT: tst w8, w0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = lshr i16 4080, %y %t1 = and i16 %t0, %x %res = icmp eq i16 %t1, 0 @@ -95,12 +150,20 @@ define i1 @scalar_i16_bitsinmiddle_eq(i16 %x, i16 %y) nounwind { ; i32 scalar define i1 @scalar_i32_signbit_eq(i32 %x, i32 %y) nounwind { -; CHECK-LABEL: scalar_i32_signbit_eq: -; CHECK: // %bb.0: -; 
CHECK-NEXT: lsl w8, w0, w1 -; CHECK-NEXT: tst w8, #0x80000000 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i32_signbit_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl w8, w0, w1 +; CHECK-SD-NEXT: tst w8, #0x80000000 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i32_signbit_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #-2147483648 // =0x80000000 +; CHECK-GI-NEXT: lsr w8, w8, w1 +; CHECK-GI-NEXT: tst w8, w0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = lshr i32 2147483648, %y %t1 = and i32 %t0, %x %res = icmp eq i32 %t1, 0 @@ -108,12 +171,20 @@ define i1 @scalar_i32_signbit_eq(i32 %x, i32 %y) nounwind { } define i1 @scalar_i32_lowestbit_eq(i32 %x, i32 %y) nounwind { -; CHECK-LABEL: scalar_i32_lowestbit_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, w1 -; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i32_lowestbit_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl w8, w0, w1 +; CHECK-SD-NEXT: tst w8, #0x1 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i32_lowestbit_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: lsr w8, w8, w1 +; CHECK-GI-NEXT: tst w8, w0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = lshr i32 1, %y %t1 = and i32 %t0, %x %res = icmp eq i32 %t1, 0 @@ -121,12 +192,20 @@ define i1 @scalar_i32_lowestbit_eq(i32 %x, i32 %y) nounwind { } define i1 @scalar_i32_bitsinmiddle_eq(i32 %x, i32 %y) nounwind { -; CHECK-LABEL: scalar_i32_bitsinmiddle_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, w1 -; CHECK-NEXT: tst w8, #0xffff00 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i32_bitsinmiddle_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl w8, w0, w1 +; CHECK-SD-NEXT: tst w8, #0xffff00 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i32_bitsinmiddle_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: 
mov w8, #16776960 // =0xffff00 +; CHECK-GI-NEXT: lsr w8, w8, w1 +; CHECK-GI-NEXT: tst w8, w0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = lshr i32 16776960, %y %t1 = and i32 %t0, %x %res = icmp eq i32 %t1, 0 @@ -136,12 +215,20 @@ define i1 @scalar_i32_bitsinmiddle_eq(i32 %x, i32 %y) nounwind { ; i64 scalar define i1 @scalar_i64_signbit_eq(i64 %x, i64 %y) nounwind { -; CHECK-LABEL: scalar_i64_signbit_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsl x8, x0, x1 -; CHECK-NEXT: tst x8, #0x8000000000000000 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i64_signbit_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl x8, x0, x1 +; CHECK-SD-NEXT: tst x8, #0x8000000000000000 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i64_signbit_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 +; CHECK-GI-NEXT: lsr x8, x8, x1 +; CHECK-GI-NEXT: tst x8, x0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = lshr i64 9223372036854775808, %y %t1 = and i64 %t0, %x %res = icmp eq i64 %t1, 0 @@ -149,12 +236,20 @@ define i1 @scalar_i64_signbit_eq(i64 %x, i64 %y) nounwind { } define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind { -; CHECK-LABEL: scalar_i64_lowestbit_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsl x8, x0, x1 -; CHECK-NEXT: tst x8, #0x1 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i64_lowestbit_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl x8, x0, x1 +; CHECK-SD-NEXT: tst x8, #0x1 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i64_lowestbit_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: lsr x8, x8, x1 +; CHECK-GI-NEXT: tst x8, x0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = lshr i64 1, %y %t1 = and i64 %t0, %x %res = icmp eq i64 %t1, 0 @@ -162,12 +257,20 @@ define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind { } define i1 
@scalar_i64_bitsinmiddle_eq(i64 %x, i64 %y) nounwind { -; CHECK-LABEL: scalar_i64_bitsinmiddle_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsl x8, x0, x1 -; CHECK-NEXT: tst x8, #0xffffffff0000 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i64_bitsinmiddle_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl x8, x0, x1 +; CHECK-SD-NEXT: tst x8, #0xffffffff0000 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i64_bitsinmiddle_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov x8, #281474976645120 // =0xffffffff0000 +; CHECK-GI-NEXT: lsr x8, x8, x1 +; CHECK-GI-NEXT: tst x8, x0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = lshr i64 281474976645120, %y %t1 = and i64 %t0, %x %res = icmp eq i64 %t1, 0 @@ -179,14 +282,24 @@ define i1 @scalar_i64_bitsinmiddle_eq(i64 %x, i64 %y) nounwind { ;------------------------------------------------------------------------------; define <4 x i1> @vec_4xi32_splat_eq(<4 x i32> %x, <4 x i32> %y) nounwind { -; CHECK-LABEL: vec_4xi32_splat_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: vec_4xi32_splat_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.4s, #1 +; CHECK-SD-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vec_4xi32_splat_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.4s, #1 +; CHECK-GI-NEXT: neg v1.4s, v1.4s +; CHECK-GI-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret %t0 = lshr <4 x i32> , %y %t1 = and <4 x i32> %t0, %x %res = icmp eq <4 x i32> %t1, @@ -211,44 +324,86 @@ define <4 x i1> 
@vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind { } define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwind { -; CHECK-LABEL: vec_4xi32_nonsplat_undef0_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: vec_4xi32_nonsplat_undef0_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.4s, #1 +; CHECK-SD-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vec_4xi32_nonsplat_undef0_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: neg v1.4s, v1.4s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[1], w8 +; CHECK-GI-NEXT: mov v2.s[3], w8 +; CHECK-GI-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret %t0 = lshr <4 x i32> , %y %t1 = and <4 x i32> %t0, %x %res = icmp eq <4 x i32> %t1, ret <4 x i1> %res } define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwind { -; CHECK-LABEL: vec_4xi32_nonsplat_undef1_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: neg v1.4s, v1.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: vec_4xi32_nonsplat_undef1_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.4s, #1 +; CHECK-SD-NEXT: neg v1.4s, v1.4s +; CHECK-SD-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-SD-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-SD-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vec_4xi32_nonsplat_undef1_eq: 
+; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi d2, #0000000000000000 +; CHECK-GI-NEXT: movi v3.4s, #1 +; CHECK-GI-NEXT: neg v1.4s, v1.4s +; CHECK-GI-NEXT: mov v2.s[1], wzr +; CHECK-GI-NEXT: ushl v1.4s, v3.4s, v1.4s +; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: mov v2.s[3], wzr +; CHECK-GI-NEXT: cmeq v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret %t0 = lshr <4 x i32> , %y %t1 = and <4 x i32> %t0, %x %res = icmp eq <4 x i32> %t1, ret <4 x i1> %res } define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind { -; CHECK-LABEL: vec_4xi32_nonsplat_undef2_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: neg v1.4s, v1.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: vec_4xi32_nonsplat_undef2_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.4s, #1 +; CHECK-SD-NEXT: neg v1.4s, v1.4s +; CHECK-SD-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-SD-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-SD-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vec_4xi32_nonsplat_undef2_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: movi d2, #0000000000000000 +; CHECK-GI-NEXT: neg v1.4s, v1.4s +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v3.s[1], w8 +; CHECK-GI-NEXT: mov v2.s[1], wzr +; CHECK-GI-NEXT: mov v3.s[3], w8 +; CHECK-GI-NEXT: mov v2.s[3], wzr +; CHECK-GI-NEXT: ushl v1.4s, v3.4s, v1.4s +; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: cmeq v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret %t0 = lshr <4 x i32> , %y %t1 = and <4 x i32> %t0, %x %res = icmp eq <4 x i32> %t1, @@ -260,11 +415,20 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi 
;------------------------------------------------------------------------------; define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind { -; CHECK-LABEL: scalar_i8_signbit_ne: -; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, w1 -; CHECK-NEXT: ubfx w0, w8, #7, #1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i8_signbit_ne: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl w8, w0, w1 +; CHECK-SD-NEXT: ubfx w0, w8, #7, #1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i8_signbit_ne: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #128 // =0x80 +; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: lsr w8, w8, w9 +; CHECK-GI-NEXT: tst w8, w0 +; CHECK-GI-NEXT: cset w0, ne +; CHECK-GI-NEXT: ret %t0 = lshr i8 128, %y %t1 = and i8 %t0, %x %res = icmp ne i8 %t1, 0 ; we are perfectly happy with 'ne' predicate @@ -315,14 +479,24 @@ define i1 @scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind { } define i1 @scalar_i8_signbit_eq_with_nonzero(i8 %x, i8 %y) nounwind { -; CHECK-LABEL: scalar_i8_signbit_eq_with_nonzero: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #128 // =0x80 -; CHECK-NEXT: lsr w8, w8, w1 -; CHECK-NEXT: and w8, w8, w0 -; CHECK-NEXT: cmp w8, #1 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i8_signbit_eq_with_nonzero: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, #128 // =0x80 +; CHECK-SD-NEXT: lsr w8, w8, w1 +; CHECK-SD-NEXT: and w8, w8, w0 +; CHECK-SD-NEXT: cmp w8, #1 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i8_signbit_eq_with_nonzero: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #128 // =0x80 +; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: lsr w8, w8, w9 +; CHECK-GI-NEXT: and w8, w8, w0 +; CHECK-GI-NEXT: cmp w8, #1 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = lshr i8 128, %y %t1 = and i8 %t0, %x %res = icmp eq i8 %t1, 1 ; should be comparing with 0 diff --git a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll 
b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll index 4a73b10811d29..cc1bf27b8d4b7 100644 --- a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-unknown-unknown -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; We are looking for the following pattern here: ; (X & (C << Y)) ==/!= 0 @@ -13,13 +14,23 @@ ; i8 scalar define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind { -; CHECK-LABEL: scalar_i8_signbit_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: lsr w8, w8, w1 -; CHECK-NEXT: tst w8, #0x80 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i8_signbit_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xff +; CHECK-SD-NEXT: lsr w8, w8, w1 +; CHECK-SD-NEXT: tst w8, #0x80 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i8_signbit_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #-128 // =0xffffff80 +; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: lsl w8, w8, w9 +; CHECK-GI-NEXT: and w8, w8, w0 +; CHECK-GI-NEXT: tst w8, #0xff +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = shl i8 128, %y %t1 = and i8 %t0, %x %res = icmp eq i8 %t1, 0 @@ -27,13 +38,23 @@ define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind { } define i1 @scalar_i8_lowestbit_eq(i8 %x, i8 %y) nounwind { -; CHECK-LABEL: scalar_i8_lowestbit_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: lsr w8, w8, w1 -; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i8_lowestbit_eq: +; CHECK-SD: // %bb.0: 
+; CHECK-SD-NEXT: and w8, w0, #0xff +; CHECK-SD-NEXT: lsr w8, w8, w1 +; CHECK-SD-NEXT: tst w8, #0x1 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i8_lowestbit_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: lsl w8, w8, w9 +; CHECK-GI-NEXT: and w8, w8, w0 +; CHECK-GI-NEXT: tst w8, #0xff +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = shl i8 1, %y %t1 = and i8 %t0, %x %res = icmp eq i8 %t1, 0 @@ -41,13 +62,23 @@ define i1 @scalar_i8_lowestbit_eq(i8 %x, i8 %y) nounwind { } define i1 @scalar_i8_bitsinmiddle_eq(i8 %x, i8 %y) nounwind { -; CHECK-LABEL: scalar_i8_bitsinmiddle_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: lsr w8, w8, w1 -; CHECK-NEXT: tst w8, #0x18 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i8_bitsinmiddle_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xff +; CHECK-SD-NEXT: lsr w8, w8, w1 +; CHECK-SD-NEXT: tst w8, #0x18 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i8_bitsinmiddle_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #24 // =0x18 +; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: lsl w8, w8, w9 +; CHECK-GI-NEXT: and w8, w8, w0 +; CHECK-GI-NEXT: tst w8, #0xff +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = shl i8 24, %y %t1 = and i8 %t0, %x %res = icmp eq i8 %t1, 0 @@ -57,13 +88,23 @@ define i1 @scalar_i8_bitsinmiddle_eq(i8 %x, i8 %y) nounwind { ; i16 scalar define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind { -; CHECK-LABEL: scalar_i16_signbit_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: lsr w8, w8, w1 -; CHECK-NEXT: tst w8, #0x8000 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i16_signbit_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xffff +; CHECK-SD-NEXT: lsr w8, w8, w1 +; CHECK-SD-NEXT: tst w8, #0x8000 +; CHECK-SD-NEXT: cset w0, eq +; 
CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i16_signbit_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #-32768 // =0xffff8000 +; CHECK-GI-NEXT: and w9, w1, #0xffff +; CHECK-GI-NEXT: lsl w8, w8, w9 +; CHECK-GI-NEXT: and w8, w8, w0 +; CHECK-GI-NEXT: tst w8, #0xffff +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = shl i16 32768, %y %t1 = and i16 %t0, %x %res = icmp eq i16 %t1, 0 @@ -71,13 +112,23 @@ define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind { } define i1 @scalar_i16_lowestbit_eq(i16 %x, i16 %y) nounwind { -; CHECK-LABEL: scalar_i16_lowestbit_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: lsr w8, w8, w1 -; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i16_lowestbit_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xffff +; CHECK-SD-NEXT: lsr w8, w8, w1 +; CHECK-SD-NEXT: tst w8, #0x1 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i16_lowestbit_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: and w9, w1, #0xffff +; CHECK-GI-NEXT: lsl w8, w8, w9 +; CHECK-GI-NEXT: and w8, w8, w0 +; CHECK-GI-NEXT: tst w8, #0xffff +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = shl i16 1, %y %t1 = and i16 %t0, %x %res = icmp eq i16 %t1, 0 @@ -85,13 +136,23 @@ define i1 @scalar_i16_lowestbit_eq(i16 %x, i16 %y) nounwind { } define i1 @scalar_i16_bitsinmiddle_eq(i16 %x, i16 %y) nounwind { -; CHECK-LABEL: scalar_i16_bitsinmiddle_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: lsr w8, w8, w1 -; CHECK-NEXT: tst w8, #0xff0 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i16_bitsinmiddle_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xffff +; CHECK-SD-NEXT: lsr w8, w8, w1 +; CHECK-SD-NEXT: tst w8, #0xff0 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i16_bitsinmiddle_eq: +; CHECK-GI: // %bb.0: +; 
CHECK-GI-NEXT: mov w8, #4080 // =0xff0 +; CHECK-GI-NEXT: and w9, w1, #0xffff +; CHECK-GI-NEXT: lsl w8, w8, w9 +; CHECK-GI-NEXT: and w8, w8, w0 +; CHECK-GI-NEXT: tst w8, #0xffff +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = shl i16 4080, %y %t1 = and i16 %t0, %x %res = icmp eq i16 %t1, 0 @@ -101,12 +162,20 @@ define i1 @scalar_i16_bitsinmiddle_eq(i16 %x, i16 %y) nounwind { ; i32 scalar define i1 @scalar_i32_signbit_eq(i32 %x, i32 %y) nounwind { -; CHECK-LABEL: scalar_i32_signbit_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, w1 -; CHECK-NEXT: tst w8, #0x80000000 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i32_signbit_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsr w8, w0, w1 +; CHECK-SD-NEXT: tst w8, #0x80000000 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i32_signbit_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #-2147483648 // =0x80000000 +; CHECK-GI-NEXT: lsl w8, w8, w1 +; CHECK-GI-NEXT: tst w8, w0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = shl i32 2147483648, %y %t1 = and i32 %t0, %x %res = icmp eq i32 %t1, 0 @@ -114,12 +183,20 @@ define i1 @scalar_i32_signbit_eq(i32 %x, i32 %y) nounwind { } define i1 @scalar_i32_lowestbit_eq(i32 %x, i32 %y) nounwind { -; CHECK-LABEL: scalar_i32_lowestbit_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, w1 -; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i32_lowestbit_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsr w8, w0, w1 +; CHECK-SD-NEXT: tst w8, #0x1 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i32_lowestbit_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: lsl w8, w8, w1 +; CHECK-GI-NEXT: tst w8, w0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = shl i32 1, %y %t1 = and i32 %t0, %x %res = icmp eq i32 %t1, 0 @@ -127,12 +204,20 @@ define i1 @scalar_i32_lowestbit_eq(i32 %x, i32 %y) nounwind { } 
define i1 @scalar_i32_bitsinmiddle_eq(i32 %x, i32 %y) nounwind { -; CHECK-LABEL: scalar_i32_bitsinmiddle_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, w1 -; CHECK-NEXT: tst w8, #0xffff00 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i32_bitsinmiddle_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsr w8, w0, w1 +; CHECK-SD-NEXT: tst w8, #0xffff00 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i32_bitsinmiddle_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #16776960 // =0xffff00 +; CHECK-GI-NEXT: lsl w8, w8, w1 +; CHECK-GI-NEXT: tst w8, w0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = shl i32 16776960, %y %t1 = and i32 %t0, %x %res = icmp eq i32 %t1, 0 @@ -142,12 +227,20 @@ define i1 @scalar_i32_bitsinmiddle_eq(i32 %x, i32 %y) nounwind { ; i64 scalar define i1 @scalar_i64_signbit_eq(i64 %x, i64 %y) nounwind { -; CHECK-LABEL: scalar_i64_signbit_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, x1 -; CHECK-NEXT: tst x8, #0x8000000000000000 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i64_signbit_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsr x8, x0, x1 +; CHECK-SD-NEXT: tst x8, #0x8000000000000000 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i64_signbit_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 +; CHECK-GI-NEXT: lsl x8, x8, x1 +; CHECK-GI-NEXT: tst x8, x0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = shl i64 9223372036854775808, %y %t1 = and i64 %t0, %x %res = icmp eq i64 %t1, 0 @@ -155,12 +248,20 @@ define i1 @scalar_i64_signbit_eq(i64 %x, i64 %y) nounwind { } define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind { -; CHECK-LABEL: scalar_i64_lowestbit_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, x1 -; CHECK-NEXT: tst x8, #0x1 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i64_lowestbit_eq: +; CHECK-SD: // %bb.0: +; 
CHECK-SD-NEXT: lsr x8, x0, x1 +; CHECK-SD-NEXT: tst x8, #0x1 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i64_lowestbit_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: lsl x8, x8, x1 +; CHECK-GI-NEXT: tst x8, x0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = shl i64 1, %y %t1 = and i64 %t0, %x %res = icmp eq i64 %t1, 0 @@ -168,12 +269,20 @@ define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind { } define i1 @scalar_i64_bitsinmiddle_eq(i64 %x, i64 %y) nounwind { -; CHECK-LABEL: scalar_i64_bitsinmiddle_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, x1 -; CHECK-NEXT: tst x8, #0xffffffff0000 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i64_bitsinmiddle_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsr x8, x0, x1 +; CHECK-SD-NEXT: tst x8, #0xffffffff0000 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i64_bitsinmiddle_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov x8, #281474976645120 // =0xffffffff0000 +; CHECK-GI-NEXT: lsl x8, x8, x1 +; CHECK-GI-NEXT: tst x8, x0 +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %t0 = shl i64 281474976645120, %y %t1 = and i64 %t0, %x %res = icmp eq i64 %t1, 0 @@ -216,42 +325,81 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind { } define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwind { -; CHECK-LABEL: vec_4xi32_nonsplat_undef0_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: vec_4xi32_nonsplat_undef0_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.4s, #1 +; CHECK-SD-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-SD-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-SD-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-SD-NEXT: 
ret +; +; CHECK-GI-LABEL: vec_4xi32_nonsplat_undef0_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[1], w8 +; CHECK-GI-NEXT: mov v2.s[3], w8 +; CHECK-GI-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret %t0 = shl <4 x i32> , %y %t1 = and <4 x i32> %t0, %x %res = icmp eq <4 x i32> %t1, ret <4 x i1> %res } define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwind { -; CHECK-LABEL: vec_4xi32_nonsplat_undef1_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: vec_4xi32_nonsplat_undef1_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.4s, #1 +; CHECK-SD-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-SD-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-SD-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vec_4xi32_nonsplat_undef1_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi d3, #0000000000000000 +; CHECK-GI-NEXT: movi v2.4s, #1 +; CHECK-GI-NEXT: mov v3.s[1], wzr +; CHECK-GI-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: mov v3.s[3], wzr +; CHECK-GI-NEXT: cmeq v0.4s, v0.4s, v3.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret %t0 = shl <4 x i32> , %y %t1 = and <4 x i32> %t0, %x %res = icmp eq <4 x i32> %t1, ret <4 x i1> %res } define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind { -; CHECK-LABEL: vec_4xi32_nonsplat_undef2_eq: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: xtn v0.4h, v0.4s -; 
CHECK-NEXT: ret +; CHECK-SD-LABEL: vec_4xi32_nonsplat_undef2_eq: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.4s, #1 +; CHECK-SD-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-SD-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-SD-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vec_4xi32_nonsplat_undef2_eq: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: movi d3, #0000000000000000 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[1], w8 +; CHECK-GI-NEXT: mov v3.s[1], wzr +; CHECK-GI-NEXT: mov v2.s[3], w8 +; CHECK-GI-NEXT: mov v3.s[3], wzr +; CHECK-GI-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: cmeq v0.4s, v0.4s, v3.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret %t0 = shl <4 x i32> , %y %t1 = and <4 x i32> %t0, %x %res = icmp eq <4 x i32> %t1, @@ -263,12 +411,22 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi ;------------------------------------------------------------------------------; define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind { -; CHECK-LABEL: scalar_i8_signbit_ne: -; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: lsr w8, w8, w1 -; CHECK-NEXT: lsr w0, w8, #7 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i8_signbit_ne: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xff +; CHECK-SD-NEXT: lsr w8, w8, w1 +; CHECK-SD-NEXT: lsr w0, w8, #7 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i8_signbit_ne: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #-128 // =0xffffff80 +; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: lsl w8, w8, w9 +; CHECK-GI-NEXT: and w8, w8, w0 +; CHECK-GI-NEXT: tst w8, #0xff +; CHECK-GI-NEXT: cset w0, ne +; CHECK-GI-NEXT: ret %t0 = shl i8 128, %y %t1 = and i8 %t0, %x %res = icmp ne i8 %t1, 0 ; we are perfectly happy with 'ne' predicate @@ -310,13 +468,24 @@ define i1 @scalar_i32_x_is_const2_eq(i32 %y) nounwind { } define 
i1 @scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind { -; CHECK-LABEL: scalar_i8_bitsinmiddle_slt: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #24 // =0x18 -; CHECK-NEXT: lsl w8, w8, w1 -; CHECK-NEXT: and w8, w8, w0 -; CHECK-NEXT: ubfx w0, w8, #7, #1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i8_bitsinmiddle_slt: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, #24 // =0x18 +; CHECK-SD-NEXT: lsl w8, w8, w1 +; CHECK-SD-NEXT: and w8, w8, w0 +; CHECK-SD-NEXT: ubfx w0, w8, #7, #1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i8_bitsinmiddle_slt: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #24 // =0x18 +; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: lsl w8, w8, w9 +; CHECK-GI-NEXT: and w8, w8, w0 +; CHECK-GI-NEXT: sxtb w8, w8 +; CHECK-GI-NEXT: cmp w8, #0 +; CHECK-GI-NEXT: cset w0, mi +; CHECK-GI-NEXT: ret %t0 = shl i8 24, %y %t1 = and i8 %t0, %x %res = icmp slt i8 %t1, 0 @@ -324,15 +493,20 @@ define i1 @scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind { } define i1 @scalar_i8_signbit_eq_with_nonzero(i8 %x, i8 %y) nounwind { -; CHECK-LABEL: scalar_i8_signbit_eq_with_nonzero: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-128 // =0xffffff80 -; CHECK-NEXT: lsl w8, w8, w1 -; CHECK-NEXT: and w8, w8, w0 -; CHECK-NEXT: and w8, w8, #0x80 -; CHECK-NEXT: cmp w8, #1 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: scalar_i8_signbit_eq_with_nonzero: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, #-128 // =0xffffff80 +; CHECK-SD-NEXT: lsl w8, w8, w1 +; CHECK-SD-NEXT: and w8, w8, w0 +; CHECK-SD-NEXT: and w8, w8, #0x80 +; CHECK-SD-NEXT: cmp w8, #1 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: scalar_i8_signbit_eq_with_nonzero: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w0, wzr +; CHECK-GI-NEXT: ret %t0 = shl i8 128, %y %t1 = and i8 %t0, %x %res = icmp eq i8 %t1, 1 ; should be comparing with 0 diff --git a/llvm/test/CodeGen/AArch64/signbit-test.ll b/llvm/test/CodeGen/AArch64/signbit-test.ll index 
c74a934ee09d8..298495bcf5a01 100644 --- a/llvm/test/CodeGen/AArch64/signbit-test.ll +++ b/llvm/test/CodeGen/AArch64/signbit-test.ll @@ -1,13 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s +; RUN: llc -mtriple=aarch64-- < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-- -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI define i64 @test_clear_mask_i64_i32(i64 %x) nounwind { -; CHECK-LABEL: test_clear_mask_i64_i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 // =0x2a -; CHECK-NEXT: cmn w0, #1 -; CHECK-NEXT: csel x0, x8, x0, gt -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_clear_mask_i64_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #42 // =0x2a +; CHECK-SD-NEXT: cmn w0, #1 +; CHECK-SD-NEXT: csel x0, x8, x0, gt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_clear_mask_i64_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #42 // =0x2a +; CHECK-GI-NEXT: tst x0, #0x80000000 +; CHECK-GI-NEXT: csel x0, x8, x0, eq +; CHECK-GI-NEXT: ret entry: %a = and i64 %x, 2147483648 %r = icmp eq i64 %a, 0 diff --git a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll index 7c80f9320faec..fc01c6b2c5471 100644 --- a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll +++ b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; https://bugs.llvm.org/show_bug.cgi?id=38149 @@ -19,13 +20,22 @@ ; ---------------------------------------------------------------------------- ; define i1 
@shifts_eqcmp_i16_i8(i16 %x) nounwind { -; CHECK-LABEL: shifts_eqcmp_i16_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: and w8, w8, #0xffff -; CHECK-NEXT: cmp w8, w0, uxth -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shifts_eqcmp_i16_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sxtb w8, w0 +; CHECK-SD-NEXT: and w8, w8, #0xffff +; CHECK-SD-NEXT: cmp w8, w0, uxth +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shifts_eqcmp_i16_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: lsl w8, w0, #8 +; CHECK-GI-NEXT: sbfx w8, w8, #8, #8 +; CHECK-GI-NEXT: and w8, w8, #0xffff +; CHECK-GI-NEXT: cmp w8, w0, uxth +; CHECK-GI-NEXT: cset w0, eq +; CHECK-GI-NEXT: ret %tmp0 = shl i16 %x, 8 ; 16-8 %tmp1 = ashr exact i16 %tmp0, 8 ; 16-8 %tmp2 = icmp eq i16 %tmp1, %x @@ -97,26 +107,43 @@ define i1 @shifts_eqcmp_i64_i8(i64 %x) nounwind { ; ---------------------------------------------------------------------------- ; define i1 @add_ugecmp_i16_i8(i16 %x) nounwind { -; CHECK-LABEL: add_ugecmp_i16_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: sub w8, w8, #128 -; CHECK-NEXT: lsr w8, w8, #8 -; CHECK-NEXT: cmp w8, #254 -; CHECK-NEXT: cset w0, hi -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ugecmp_i16_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xffff +; CHECK-SD-NEXT: sub w8, w8, #128 +; CHECK-SD-NEXT: lsr w8, w8, #8 +; CHECK-SD-NEXT: cmp w8, #254 +; CHECK-SD-NEXT: cset w0, hi +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ugecmp_i16_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #-128 // =0xffffff80 +; CHECK-GI-NEXT: mov w9, #65280 // =0xff00 +; CHECK-GI-NEXT: add w8, w8, w0, uxth +; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: cset w0, hs +; CHECK-GI-NEXT: ret %tmp0 = add i16 %x, -128 ; ~0U << (8-1) %tmp1 = icmp uge i16 %tmp0, -256 ; ~0U << 8 ret i1 %tmp1 } define i1 @add_ugecmp_i32_i16_i8(i16 %xx) nounwind { -; CHECK-LABEL: add_ugecmp_i32_i16_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: and w8, 
w0, #0xffff -; CHECK-NEXT: cmp w8, w8, sxtb -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ugecmp_i32_i16_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xffff +; CHECK-SD-NEXT: cmp w8, w8, sxtb +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ugecmp_i32_i16_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #-128 // =0xffffff80 +; CHECK-GI-NEXT: add w8, w8, w0, uxth +; CHECK-GI-NEXT: cmn w8, #256 +; CHECK-GI-NEXT: cset w0, hs +; CHECK-GI-NEXT: ret %x = zext i16 %xx to i32 %tmp0 = add i32 %x, -128 ; ~0U << (8-1) %tmp1 = icmp uge i32 %tmp0, -256 ; ~0U << 8 @@ -124,55 +151,92 @@ define i1 @add_ugecmp_i32_i16_i8(i16 %xx) nounwind { } define i1 @add_ugecmp_i32_i16(i32 %x) nounwind { -; CHECK-LABEL: add_ugecmp_i32_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: cmp w0, w0, sxth -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ugecmp_i32_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w0, sxth +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ugecmp_i32_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub w8, w0, #8, lsl #12 // =32768 +; CHECK-GI-NEXT: cmn w8, #16, lsl #12 // =65536 +; CHECK-GI-NEXT: cset w0, hs +; CHECK-GI-NEXT: ret %tmp0 = add i32 %x, -32768 ; ~0U << (16-1) %tmp1 = icmp uge i32 %tmp0, -65536 ; ~0U << 16 ret i1 %tmp1 } define i1 @add_ugecmp_i32_i8(i32 %x) nounwind { -; CHECK-LABEL: add_ugecmp_i32_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: cmp w0, w0, sxtb -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ugecmp_i32_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w0, sxtb +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ugecmp_i32_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub w8, w0, #128 +; CHECK-GI-NEXT: cmn w8, #256 +; CHECK-GI-NEXT: cset w0, hs +; CHECK-GI-NEXT: ret %tmp0 = add i32 %x, -128 ; ~0U << (8-1) %tmp1 = icmp uge i32 %tmp0, -256 ; ~0U << 8 ret i1 %tmp1 } define i1 
@add_ugecmp_i64_i32(i64 %x) nounwind { -; CHECK-LABEL: add_ugecmp_i64_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, w0, sxtw -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ugecmp_i64_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp x0, w0, sxtw +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ugecmp_i64_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov x8, #-2147483648 // =0xffffffff80000000 +; CHECK-GI-NEXT: mov x9, #-4294967296 // =0xffffffff00000000 +; CHECK-GI-NEXT: add x8, x0, x8 +; CHECK-GI-NEXT: cmp x8, x9 +; CHECK-GI-NEXT: cset w0, hs +; CHECK-GI-NEXT: ret %tmp0 = add i64 %x, -2147483648 ; ~0U << (32-1) %tmp1 = icmp uge i64 %tmp0, -4294967296 ; ~0U << 32 ret i1 %tmp1 } define i1 @add_ugecmp_i64_i16(i64 %x) nounwind { -; CHECK-LABEL: add_ugecmp_i64_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, w0, sxth -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ugecmp_i64_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp x0, w0, sxth +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ugecmp_i64_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub x8, x0, #8, lsl #12 // =32768 +; CHECK-GI-NEXT: cmn x8, #16, lsl #12 // =65536 +; CHECK-GI-NEXT: cset w0, hs +; CHECK-GI-NEXT: ret %tmp0 = add i64 %x, -32768 ; ~0U << (16-1) %tmp1 = icmp uge i64 %tmp0, -65536 ; ~0U << 16 ret i1 %tmp1 } define i1 @add_ugecmp_i64_i8(i64 %x) nounwind { -; CHECK-LABEL: add_ugecmp_i64_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, w0, sxtb -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ugecmp_i64_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp x0, w0, sxtb +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ugecmp_i64_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub x8, x0, #128 +; CHECK-GI-NEXT: cmn x8, #256 +; CHECK-GI-NEXT: cset w0, hs +; CHECK-GI-NEXT: ret %tmp0 = add i64 %x, -128 ; ~0U << (8-1) %tmp1 = icmp uge i64 %tmp0, -256 ; ~0U << 8 ret i1 
%tmp1 @@ -180,14 +244,23 @@ define i1 @add_ugecmp_i64_i8(i64 %x) nounwind { ; Slightly more canonical variant define i1 @add_ugtcmp_i16_i8(i16 %x) nounwind { -; CHECK-LABEL: add_ugtcmp_i16_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: sub w8, w8, #128 -; CHECK-NEXT: lsr w8, w8, #8 -; CHECK-NEXT: cmp w8, #254 -; CHECK-NEXT: cset w0, hi -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ugtcmp_i16_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xffff +; CHECK-SD-NEXT: sub w8, w8, #128 +; CHECK-SD-NEXT: lsr w8, w8, #8 +; CHECK-SD-NEXT: cmp w8, #254 +; CHECK-SD-NEXT: cset w0, hi +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ugtcmp_i16_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #-128 // =0xffffff80 +; CHECK-GI-NEXT: mov w9, #65279 // =0xfeff +; CHECK-GI-NEXT: add w8, w8, w0, uxth +; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: cset w0, hi +; CHECK-GI-NEXT: ret %tmp0 = add i16 %x, -128 ; ~0U << (8-1) %tmp1 = icmp ugt i16 %tmp0, -257 ; ~0U << 8 - 1 ret i1 %tmp1 @@ -198,68 +271,113 @@ define i1 @add_ugtcmp_i16_i8(i16 %x) nounwind { ; ---------------------------------------------------------------------------- ; define i1 @add_ultcmp_i16_i8(i16 %x) nounwind { -; CHECK-LABEL: add_ultcmp_i16_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: and w8, w8, #0xffff -; CHECK-NEXT: cmp w8, w0, uxth -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ultcmp_i16_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sxtb w8, w0 +; CHECK-SD-NEXT: and w8, w8, #0xffff +; CHECK-SD-NEXT: cmp w8, w0, uxth +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ultcmp_i16_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add w8, w0, #128 +; CHECK-GI-NEXT: and w8, w8, #0xffff +; CHECK-GI-NEXT: cmp w8, #256 +; CHECK-GI-NEXT: cset w0, lo +; CHECK-GI-NEXT: ret %tmp0 = add i16 %x, 128 ; 1U << (8-1) %tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8 ret i1 %tmp1 } define i1 @add_ultcmp_i32_i16(i32 %x) nounwind { -; 
CHECK-LABEL: add_ultcmp_i32_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: cmp w0, w0, sxth -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ultcmp_i32_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w0, sxth +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ultcmp_i32_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add w8, w0, #8, lsl #12 // =32768 +; CHECK-GI-NEXT: cmp w8, #16, lsl #12 // =65536 +; CHECK-GI-NEXT: cset w0, lo +; CHECK-GI-NEXT: ret %tmp0 = add i32 %x, 32768 ; 1U << (16-1) %tmp1 = icmp ult i32 %tmp0, 65536 ; 1U << 16 ret i1 %tmp1 } define i1 @add_ultcmp_i32_i8(i32 %x) nounwind { -; CHECK-LABEL: add_ultcmp_i32_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: cmp w0, w0, sxtb -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ultcmp_i32_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w0, sxtb +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ultcmp_i32_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add w8, w0, #128 +; CHECK-GI-NEXT: cmp w8, #256 +; CHECK-GI-NEXT: cset w0, lo +; CHECK-GI-NEXT: ret %tmp0 = add i32 %x, 128 ; 1U << (8-1) %tmp1 = icmp ult i32 %tmp0, 256 ; 1U << 8 ret i1 %tmp1 } define i1 @add_ultcmp_i64_i32(i64 %x) nounwind { -; CHECK-LABEL: add_ultcmp_i64_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, w0, sxtw -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ultcmp_i64_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp x0, w0, sxtw +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ultcmp_i64_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #-2147483648 // =0x80000000 +; CHECK-GI-NEXT: mov x9, #4294967296 // =0x100000000 +; CHECK-GI-NEXT: add x8, x0, x8 +; CHECK-GI-NEXT: cmp x8, x9 +; CHECK-GI-NEXT: cset w0, lo +; CHECK-GI-NEXT: ret %tmp0 = add i64 %x, 2147483648 ; 1U << (32-1) %tmp1 = icmp ult i64 %tmp0, 4294967296 ; 1U << 32 ret i1 %tmp1 } define i1 @add_ultcmp_i64_i16(i64 %x) nounwind { -; CHECK-LABEL: 
add_ultcmp_i64_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, w0, sxth -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ultcmp_i64_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp x0, w0, sxth +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ultcmp_i64_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add x8, x0, #8, lsl #12 // =32768 +; CHECK-GI-NEXT: cmp x8, #16, lsl #12 // =65536 +; CHECK-GI-NEXT: cset w0, lo +; CHECK-GI-NEXT: ret %tmp0 = add i64 %x, 32768 ; 1U << (16-1) %tmp1 = icmp ult i64 %tmp0, 65536 ; 1U << 16 ret i1 %tmp1 } define i1 @add_ultcmp_i64_i8(i64 %x) nounwind { -; CHECK-LABEL: add_ultcmp_i64_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, w0, sxtb -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ultcmp_i64_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp x0, w0, sxtb +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ultcmp_i64_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add x8, x0, #128 +; CHECK-GI-NEXT: cmp x8, #256 +; CHECK-GI-NEXT: cset w0, lo +; CHECK-GI-NEXT: ret %tmp0 = add i64 %x, 128 ; 1U << (8-1) %tmp1 = icmp ult i64 %tmp0, 256 ; 1U << 8 ret i1 %tmp1 @@ -267,13 +385,21 @@ define i1 @add_ultcmp_i64_i8(i64 %x) nounwind { ; Slightly more canonical variant define i1 @add_ulecmp_i16_i8(i16 %x) nounwind { -; CHECK-LABEL: add_ulecmp_i16_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: and w8, w8, #0xffff -; CHECK-NEXT: cmp w8, w0, uxth -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ulecmp_i16_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sxtb w8, w0 +; CHECK-SD-NEXT: and w8, w8, #0xffff +; CHECK-SD-NEXT: cmp w8, w0, uxth +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ulecmp_i16_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add w8, w0, #128 +; CHECK-GI-NEXT: and w8, w8, #0xffff +; CHECK-GI-NEXT: cmp w8, #255 +; CHECK-GI-NEXT: cset w0, ls +; CHECK-GI-NEXT: ret %tmp0 = add i16 %x, 128 ; 
1U << (8-1) %tmp1 = icmp ule i16 %tmp0, 255 ; (1U << 8) - 1 ret i1 %tmp1 @@ -284,12 +410,20 @@ define i1 @add_ulecmp_i16_i8(i16 %x) nounwind { ; Adding not a constant define i1 @add_ultcmp_bad_i16_i8_add(i16 %x, i16 %y) nounwind { -; CHECK-LABEL: add_ultcmp_bad_i16_i8_add: -; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, w1 -; CHECK-NEXT: tst w8, #0xff00 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ultcmp_bad_i16_i8_add: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w8, w0, w1 +; CHECK-SD-NEXT: tst w8, #0xff00 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ultcmp_bad_i16_i8_add: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add w8, w0, w1 +; CHECK-GI-NEXT: and w8, w8, #0xffff +; CHECK-GI-NEXT: cmp w8, #256 +; CHECK-GI-NEXT: cset w0, lo +; CHECK-GI-NEXT: ret %tmp0 = add i16 %x, %y %tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8 ret i1 %tmp1 @@ -311,12 +445,20 @@ define i1 @add_ultcmp_bad_i16_i8_cmp(i16 %x, i16 %y) nounwind { ; Second constant is not larger than the first one define i1 @add_ultcmp_bad_i8_i16(i16 %x) nounwind { -; CHECK-LABEL: add_ultcmp_bad_i8_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: add w8, w8, #128 -; CHECK-NEXT: lsr w0, w8, #16 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ultcmp_bad_i8_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xffff +; CHECK-SD-NEXT: add w8, w8, #128 +; CHECK-SD-NEXT: lsr w0, w8, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ultcmp_bad_i8_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w0, #0xffff +; CHECK-GI-NEXT: add w8, w8, #128 +; CHECK-GI-NEXT: cmp w8, w8, uxth +; CHECK-GI-NEXT: cset w0, ne +; CHECK-GI-NEXT: ret %tmp0 = add i16 %x, 128 ; 1U << (8-1) %tmp1 = icmp ult i16 %tmp0, 128 ; 1U << (8-1) ret i1 %tmp1 @@ -324,12 +466,20 @@ define i1 @add_ultcmp_bad_i8_i16(i16 %x) nounwind { ; First constant is not power of two define i1 @add_ultcmp_bad_i16_i8_c0notpoweroftwo(i16 %x) nounwind { -; CHECK-LABEL: 
add_ultcmp_bad_i16_i8_c0notpoweroftwo: -; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, #192 -; CHECK-NEXT: tst w8, #0xff00 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ultcmp_bad_i16_i8_c0notpoweroftwo: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w8, w0, #192 +; CHECK-SD-NEXT: tst w8, #0xff00 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ultcmp_bad_i16_i8_c0notpoweroftwo: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add w8, w0, #192 +; CHECK-GI-NEXT: and w8, w8, #0xffff +; CHECK-GI-NEXT: cmp w8, #256 +; CHECK-GI-NEXT: cset w0, lo +; CHECK-GI-NEXT: ret %tmp0 = add i16 %x, 192 ; (1U << (8-1)) + (1U << (8-1-1)) %tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8 ret i1 %tmp1 @@ -351,12 +501,20 @@ define i1 @add_ultcmp_bad_i16_i8_c1notpoweroftwo(i16 %x) nounwind { ; Magic check fails, 64 << 1 != 256 define i1 @add_ultcmp_bad_i16_i8_magic(i16 %x) nounwind { -; CHECK-LABEL: add_ultcmp_bad_i16_i8_magic: -; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, #64 -; CHECK-NEXT: tst w8, #0xff00 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ultcmp_bad_i16_i8_magic: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w8, w0, #64 +; CHECK-SD-NEXT: tst w8, #0xff00 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ultcmp_bad_i16_i8_magic: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add w8, w0, #64 +; CHECK-GI-NEXT: and w8, w8, #0xffff +; CHECK-GI-NEXT: cmp w8, #256 +; CHECK-GI-NEXT: cset w0, lo +; CHECK-GI-NEXT: ret %tmp0 = add i16 %x, 64 ; 1U << (8-1-1) %tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8 ret i1 %tmp1 @@ -364,12 +522,20 @@ define i1 @add_ultcmp_bad_i16_i8_magic(i16 %x) nounwind { ; Bad 'destination type' define i1 @add_ultcmp_bad_i16_i4(i16 %x) nounwind { -; CHECK-LABEL: add_ultcmp_bad_i16_i4: -; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, #8 -; CHECK-NEXT: tst w8, #0xfff0 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ultcmp_bad_i16_i4: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: 
add w8, w0, #8 +; CHECK-SD-NEXT: tst w8, #0xfff0 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ultcmp_bad_i16_i4: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add w8, w0, #8 +; CHECK-GI-NEXT: and w8, w8, #0xffff +; CHECK-GI-NEXT: cmp w8, #16 +; CHECK-GI-NEXT: cset w0, lo +; CHECK-GI-NEXT: ret %tmp0 = add i16 %x, 8 ; 1U << (4-1) %tmp1 = icmp ult i16 %tmp0, 16 ; 1U << 4 ret i1 %tmp1 @@ -377,12 +543,20 @@ define i1 @add_ultcmp_bad_i16_i4(i16 %x) nounwind { ; Bad storage type define i1 @add_ultcmp_bad_i24_i8(i24 %x) nounwind { -; CHECK-LABEL: add_ultcmp_bad_i24_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, #128 -; CHECK-NEXT: tst w8, #0xffff00 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ultcmp_bad_i24_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w8, w0, #128 +; CHECK-SD-NEXT: tst w8, #0xffff00 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ultcmp_bad_i24_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add w8, w0, #128 +; CHECK-GI-NEXT: and w8, w8, #0xffffff +; CHECK-GI-NEXT: cmp w8, #256 +; CHECK-GI-NEXT: cset w0, lo +; CHECK-GI-NEXT: ret %tmp0 = add i24 %x, 128 ; 1U << (8-1) %tmp1 = icmp ult i24 %tmp0, 256 ; 1U << 8 ret i1 %tmp1 From 5c0862439c01a5c714409e627f1c5945a3ab0da9 Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Thu, 30 Oct 2025 09:08:43 +0000 Subject: [PATCH 166/539] [DebugInfo] Add bit size to _BitInt name in debug info (#165583) Follow on from #164372 This changes the DW_AT_name for `_BitInt(N)` from `_BitInt` to `_BitInt(N)` --- clang/lib/CodeGen/CGDebugInfo.cpp | 5 ++++- clang/test/DebugInfo/Generic/bit-int.c | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 07a2cfb21bef2..fd2f6dcf182b5 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -1174,7 +1174,10 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) { } 
llvm::DIType *CGDebugInfo::CreateType(const BitIntType *Ty) { - StringRef Name = Ty->isUnsigned() ? "unsigned _BitInt" : "_BitInt"; + SmallString<32> Name; + llvm::raw_svector_ostream OS(Name); + OS << (Ty->isUnsigned() ? "unsigned _BitInt(" : "_BitInt(") + << Ty->getNumBits() << ")"; llvm::dwarf::TypeKind Encoding = Ty->isUnsigned() ? llvm::dwarf::DW_ATE_unsigned : llvm::dwarf::DW_ATE_signed; diff --git a/clang/test/DebugInfo/Generic/bit-int.c b/clang/test/DebugInfo/Generic/bit-int.c index 94b93013e3b46..88ecc139eee9f 100644 --- a/clang/test/DebugInfo/Generic/bit-int.c +++ b/clang/test/DebugInfo/Generic/bit-int.c @@ -4,5 +4,5 @@ unsigned _BitInt(17) a; _BitInt(2) b; -// CHECK: !DIBasicType(name: "_BitInt", size: 8, dataSize: 2, encoding: DW_ATE_signed) -// CHECK: !DIBasicType(name: "unsigned _BitInt", size: 32, dataSize: 17, encoding: DW_ATE_unsigned) +// CHECK: !DIBasicType(name: "_BitInt(2)", size: 8, dataSize: 2, encoding: DW_ATE_signed) +// CHECK: !DIBasicType(name: "unsigned _BitInt(17)", size: 32, dataSize: 17, encoding: DW_ATE_unsigned) From 688e6d7ca57c3d717f218ec6a08d0cb996dc62f5 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 30 Oct 2025 10:23:40 +0100 Subject: [PATCH 167/539] [MemCpyOpt] Allow stack move optimization if one address captured (#165527) Allow the stack move optimization (which merges two allocas) when the address of only one alloca is captured (and the provenance is not captured). Both addresses need to be captured to observe that the allocas were merged. Fixes https://github.com/llvm/llvm-project/issues/165484. 
--- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 20 +++++-- llvm/test/Transforms/MemCpyOpt/stack-move.ll | 58 +++++++++++++++++++ 2 files changed, 73 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index e043d072a7638..08be5df9872b7 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1534,8 +1534,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, bool SrcNotDom = false; auto CaptureTrackingWithModRef = - [&](Instruction *AI, - function_ref ModRefCallback) -> bool { + [&](Instruction *AI, function_ref ModRefCallback, + bool &AddressCaptured) -> bool { SmallVector Worklist; Worklist.push_back(AI); unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking(); @@ -1559,8 +1559,9 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, if (!Visited.insert(&U).second) continue; UseCaptureInfo CI = DetermineUseCaptureKind(U, AI); - if (capturesAnything(CI.UseCC)) + if (capturesAnyProvenance(CI.UseCC)) return false; + AddressCaptured |= capturesAddress(CI.UseCC); if (UI->mayReadOrWriteMemory()) { if (UI->isLifetimeStartOrEnd()) { @@ -1627,7 +1628,9 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, return true; }; - if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback)) + bool DestAddressCaptured = false; + if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback, + DestAddressCaptured)) return false; // Bailout if Dest may have any ModRef before Store. 
if (!ReachabilityWorklist.empty() && @@ -1653,7 +1656,14 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, return true; }; - if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback)) + bool SrcAddressCaptured = false; + if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback, + SrcAddressCaptured)) + return false; + + // If both the source and destination address are captured, the fact that they + // are no longer two separate allocations may be observed. + if (DestAddressCaptured && SrcAddressCaptured) return false; // We can do the transformation. First, move the SrcAlloca to the start of the diff --git a/llvm/test/Transforms/MemCpyOpt/stack-move.ll b/llvm/test/Transforms/MemCpyOpt/stack-move.ll index 940e30ec46881..0c2e05fa8fed6 100644 --- a/llvm/test/Transforms/MemCpyOpt/stack-move.ll +++ b/llvm/test/Transforms/MemCpyOpt/stack-move.ll @@ -1729,3 +1729,61 @@ define i32 @test_ret_only_capture() { %v = load i32, ptr %a ret i32 %v } + +declare ptr @captures_address_only(ptr captures(address)) + +; Can transform: Only one address captured. +define void @test_captures_address_captures_none() { +; CHECK-LABEL: define void @test_captures_address_captures_none() { +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4 +; CHECK-NEXT: call void @captures_address_only(ptr [[SRC]]) +; CHECK-NEXT: call void @use_nocapture(ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dst = alloca %struct.Foo, align 4 + store %struct.Foo { i32 10, i32 20, i32 30 }, ptr %src + call void @captures_address_only(ptr %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dst, ptr align 4 %src, i64 12, i1 false) + call void @use_nocapture(ptr %dst) + ret void +} + +; Can transform: Only one address captured. 
+define void @test_captures_none_and_captures_address() { +; CHECK-LABEL: define void @test_captures_none_and_captures_address() { +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4 +; CHECK-NEXT: call void @use_nocapture(ptr [[SRC]]) +; CHECK-NEXT: call void @captures_address_only(ptr [[SRC]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dst = alloca %struct.Foo, align 4 + store %struct.Foo { i32 10, i32 20, i32 30 }, ptr %src + call void @use_nocapture(ptr %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dst, ptr align 4 %src, i64 12, i1 false) + call void @captures_address_only(ptr %dst) + ret void +} + +; Cannot transform: Both addresses captured. +define void @test_captures_address_and_captures_address() { +; CHECK-LABEL: define void @test_captures_address_and_captures_address() { +; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 +; CHECK-NEXT: [[DST:%.*]] = alloca [[STRUCT_FOO]], align 4 +; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4 +; CHECK-NEXT: call void @captures_address_only(ptr [[SRC]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DST]], ptr align 4 [[SRC]], i64 12, i1 false) +; CHECK-NEXT: call void @captures_address_only(ptr [[DST]]) +; CHECK-NEXT: ret void +; + %src = alloca %struct.Foo, align 4 + %dst = alloca %struct.Foo, align 4 + store %struct.Foo { i32 10, i32 20, i32 30 }, ptr %src + call void @captures_address_only(ptr %src) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dst, ptr align 4 %src, i64 12, i1 false) + call void @captures_address_only(ptr %dst) + ret void +} From 46bffc0811321c78750b2f00c820f56d1bb0da31 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 30 Oct 2025 10:24:19 +0100 Subject: [PATCH 168/539] [DeveloperPolicy] Add guidelines for adding/enabling passes (#158591) This documents two things: * The recommended way to go about 
adding a new pass. * The criteria for enabling a pass. RFC: https://discourse.llvm.org/t/rfc-guidelines-for-adding-enabling-new-passes/88290 --- llvm/docs/DeveloperPolicy.rst | 49 +++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/llvm/docs/DeveloperPolicy.rst b/llvm/docs/DeveloperPolicy.rst index 45f2df20984e6..9135406c2e2a1 100644 --- a/llvm/docs/DeveloperPolicy.rst +++ b/llvm/docs/DeveloperPolicy.rst @@ -1189,6 +1189,55 @@ Suggested disclaimer for the project README and the main project web page: necessarily a reflection of the completeness or stability of the code, it does indicate that the project is not yet endorsed as a component of LLVM. +Adding or enabling a new LLVM pass +---------------------------------- + +The guidelines here are primarily targeted at the enablement of new major +passes in the target-independent optimization pipeline. Small additions, or +backend-specific passes, require a lesser degree of care. Before creating a new +pass, consider whether the functionality can be integrated into an existing +pass first. This is often both faster and more powerful. + +When adding a new pass, the goal should be to enable it as part of the default +optimization pipeline as early as possible and then continue development +incrementally. (This does not apply to passes that are only relevant for +specific uses of LLVM, such as GC support passes.) + +The recommended workflow is: + +1. Implement a basic version of the pass and add it to the pass pipeline behind + a flag that is disabled by default. The initial version should focus on + handling simple cases correctly and efficiently. +2. Enable the pass by default. Separating this step allows easily disabling the + pass if issues are encountered, without having to revert the entire + implementation. +3. Incrementally extend the pass with new functionality. 
As the pass is already + enabled, it becomes easier to identify the specific change that has caused a + regression in correctness, optimization quality or compile-time. + +When enabling a pass, certain requirements must be met (in no particular order): + + * **Maintenance:** The pass (and any analyses it depends on) must have at + least one maintainer. + * **Usefulness:** There should be evidence that the pass improves performance + (or whatever metric it optimizes for) on real-world workloads. Improvements + seen only on synthetic benchmarks may be insufficient. + * **Compile-Time:** The pass should not have a large impact on compile-time, + where the evaluation of what "large" means is up to reviewer discretion, and + may differ based on the value the pass provides. In any case, it is expected + that a concerted effort has been made to mitigate the compile-time impact, + both for the average case, and for pathological cases. + * **Correctness:** The pass should have no known correctness issues (except + global correctness issues that affect all of LLVM). If an old pass is being + enabled (rather than implementing a new one incrementally), additional due + diligence is required. The pass should be fully reviewed to ensure that it + still complies with current quality standards. Fuzzing with disabled + profitability checks may help gain additional confidence in the + implementation. + +If non-trivial issues are found in a newly enabled pass, it may be temporarily +disabled again, until the issues have been resolved. + .. _copyright-license-patents: Copyright, License, and Patents From 0a9195606bcd13e73e364a16a4a604a5a142fea6 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 30 Oct 2025 10:34:41 +0100 Subject: [PATCH 169/539] [libc++] Fix LLVM 22 TODOs (#153367) We've upgraded to LLVM 22 now, so we can remove a bunch of TODOs. 
--- libcxx/include/__config | 11 +------- libcxx/include/__configuration/abi.h | 8 ------ libcxx/include/__format/format_arg.h | 17 +++++------- libcxx/include/__format/format_context.h | 4 +-- libcxx/include/__hash_table | 12 --------- libcxx/include/__iterator/concepts.h | 15 +++++------ libcxx/include/__math/traits.h | 15 +++-------- libcxx/include/__ranges/transform_view.h | 3 +-- libcxx/include/__tree | 12 --------- .../reference_constructs_from_temporary.h | 6 ----- libcxx/include/forward_list | 11 -------- libcxx/include/list | 11 -------- libcxx/include/tuple | 4 +-- libcxx/include/variant | 2 +- .../meta/is_referenceable.compile.pass.cpp | 2 +- .../c.math/constexpr-cxx23-clang.pass.cpp | 7 ----- .../transform_error.mandates.verify.cpp | 27 ++++++------------- .../transform_error.mandates.verify.cpp | 27 ++++--------------- .../format.arg/visit.pass.cpp | 2 -- .../format.arg/visit.return_type.pass.cpp | 2 -- .../visit_format_arg.deprecated.verify.cpp | 1 - .../format.arg/visit_format_arg.pass.cpp | 6 ++--- .../format.arguments/format.args/get.pass.cpp | 6 ++--- ...855_tuple_ref_binding_diagnostics.pass.cpp | 22 --------------- .../robust_against_adl.pass.cpp | 1 - .../variant.visit.member/visit.pass.cpp | 2 -- .../visit_return_type.pass.cpp | 2 -- libcxx/test/support/test_basic_format_arg.h | 2 +- libcxx/test/support/test_macros.h | 7 ----- 29 files changed, 44 insertions(+), 203 deletions(-) diff --git a/libcxx/include/__config b/libcxx/include/__config index b4c081dcdff1b..357f77b7d27d6 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1050,8 +1050,7 @@ typedef __char32_t char32_t; # define _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(_ClassName) static_assert(true, "") # endif -// TODO(LLVM 22): Remove the workaround -# if defined(__OBJC__) && (!defined(_LIBCPP_CLANG_VER) || _LIBCPP_CLANG_VER < 2001) +# if defined(__OBJC__) && defined(_LIBCPP_APPLE_CLANG_VER) # define _LIBCPP_WORKAROUND_OBJCXX_COMPILER_INTRINSICS # endif @@ -1255,14 +1254,6 @@ 
typedef __char32_t char32_t; # define _LIBCPP_DIAGNOSE_NULLPTR # endif -// TODO(LLVM 22): Remove this macro once LLVM19 support ends. __cpp_explicit_this_parameter has been set in LLVM20. -// Clang-18 has support for deducing this, but it does not set the FTM. -# if defined(__cpp_explicit_this_parameter) || (defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 1800) -# define _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER 1 -# else -# define _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER 0 -# endif - #endif // __cplusplus #endif // _LIBCPP___CONFIG diff --git a/libcxx/include/__configuration/abi.h b/libcxx/include/__configuration/abi.h index c9936df30ff7f..38b85c6ac70de 100644 --- a/libcxx/include/__configuration/abi.h +++ b/libcxx/include/__configuration/abi.h @@ -61,14 +61,6 @@ // According to the Standard, `bitset::operator[] const` returns bool # define _LIBCPP_ABI_BITSET_VECTOR_BOOL_CONST_SUBSCRIPT_RETURN_BOOL -// In LLVM 20, we've changed to take these ABI breaks unconditionally. These flags only exist in case someone is running -// into the static_asserts we added to catch the ABI break and don't care that it is one. 
-// TODO(LLVM 22): Remove these flags -# define _LIBCPP_ABI_LIST_REMOVE_NODE_POINTER_UB -# define _LIBCPP_ABI_TREE_REMOVE_NODE_POINTER_UB -# define _LIBCPP_ABI_FIX_UNORDERED_NODE_POINTER_UB -# define _LIBCPP_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB - // These flags are documented in ABIGuarantees.rst # define _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT # define _LIBCPP_ABI_DO_NOT_EXPORT_BASIC_STRING_COMMON diff --git a/libcxx/include/__format/format_arg.h b/libcxx/include/__format/format_arg.h index ed5e76275ea87..19794f0f084ce 100644 --- a/libcxx/include/__format/format_arg.h +++ b/libcxx/include/__format/format_arg.h @@ -149,7 +149,7 @@ _LIBCPP_HIDE_FROM_ABI decltype(auto) __visit_format_arg(_Visitor&& __vis, basic_ __libcpp_unreachable(); } -# if _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER +# if _LIBCPP_STD_VER >= 26 template _LIBCPP_HIDE_FROM_ABI _Rp __visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) { @@ -200,7 +200,7 @@ _LIBCPP_HIDE_FROM_ABI _Rp __visit_format_arg(_Visitor&& __vis, basic_format_arg< __libcpp_unreachable(); } -# endif // _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER +# endif // _LIBCPP_STD_VER >= 26 /// Contains the values used in basic_format_arg. /// @@ -285,7 +285,7 @@ class _LIBCPP_NO_SPECIALIZATIONS basic_format_arg { _LIBCPP_HIDE_FROM_ABI explicit operator bool() const noexcept { return __type_ != __format::__arg_t::__none; } -# if _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER +# if _LIBCPP_STD_VER >= 26 // This function is user facing, so it must wrap the non-standard types of // the "variant" in a handle to stay conforming. See __arg_t for more details. 
@@ -329,7 +329,7 @@ class _LIBCPP_NO_SPECIALIZATIONS basic_format_arg { } } -# endif // _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER +# endif // _LIBCPP_STD_VER >= 26 private: using char_type = typename _Context::char_type; @@ -371,11 +371,8 @@ class basic_format_arg<_Context>::handle { // This function is user facing, so it must wrap the non-standard types of // the "variant" in a handle to stay conforming. See __arg_t for more details. template -# if _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER -_LIBCPP_DEPRECATED_IN_CXX26 -# endif - _LIBCPP_HIDE_FROM_ABI decltype(auto) - visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) { +_LIBCPP_DEPRECATED_IN_CXX26 _LIBCPP_HIDE_FROM_ABI decltype(auto) +visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) { switch (__arg.__type_) { # if _LIBCPP_HAS_INT128 case __format::__arg_t::__i128: { @@ -387,7 +384,7 @@ _LIBCPP_DEPRECATED_IN_CXX26 typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u128_}; return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h}); } -# endif // _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER +# endif // _LIBCPP_HAS_INT128 default: return std::__visit_format_arg(std::forward<_Visitor>(__vis), __arg); } diff --git a/libcxx/include/__format/format_context.h b/libcxx/include/__format/format_context.h index e672ee7ad0581..1771dd34b82fb 100644 --- a/libcxx/include/__format/format_context.h +++ b/libcxx/include/__format/format_context.h @@ -175,13 +175,13 @@ class basic_format_context::__itera __format::__determine_arg_t(), __basic_format_arg_value(__arg)}; }; -# if _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER +# if _LIBCPP_STD_VER >= 26 return static_cast<_Context*>(__c)->arg(__id).visit(std::move(__visitor)); # else _LIBCPP_SUPPRESS_DEPRECATED_PUSH return std::visit_format_arg(std::move(__visitor), static_cast<_Context*>(__c)->arg(__id)); 
_LIBCPP_SUPPRESS_DEPRECATED_POP -# endif // _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER +# endif // _LIBCPP_STD_VER >= 26 }) { } diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table index 5432abb4ab39d..e1897949a47e6 100644 --- a/libcxx/include/__hash_table +++ b/libcxx/include/__hash_table @@ -83,18 +83,6 @@ struct __hash_node_base { typedef _NodePtr __node_pointer; typedef __node_base_pointer __next_pointer; -// TODO(LLVM 22): Remove this check -#ifndef _LIBCPP_ABI_FIX_UNORDERED_NODE_POINTER_UB - static_assert(sizeof(__node_base_pointer) == sizeof(__node_pointer) && _LIBCPP_ALIGNOF(__node_base_pointer) == - _LIBCPP_ALIGNOF(__node_pointer), - "It looks like you are using std::__hash_table (an implementation detail for the unordered containers) " - "with a fancy pointer type that thas a different representation depending on whether it points to a " - "__hash_table base pointer or a __hash_table node pointer (both of which are implementation details of " - "the standard library). This means that your ABI is being broken between LLVM 19 and LLVM 20. If you " - "don't care about your ABI being broken, define the _LIBCPP_ABI_TREE_REMOVE_NODE_POINTER_UB macro to " - "silence this diagnostic."); -#endif - __next_pointer __next_; _LIBCPP_HIDE_FROM_ABI __next_pointer __ptr() _NOEXCEPT { diff --git a/libcxx/include/__iterator/concepts.h b/libcxx/include/__iterator/concepts.h index f38688734b38a..3b43920443636 100644 --- a/libcxx/include/__iterator/concepts.h +++ b/libcxx/include/__iterator/concepts.h @@ -117,15 +117,12 @@ template concept __signed_integer_like = signed_integral<_Tp>; template -concept weakly_incrementable = - // TODO: remove this once the clang bug is fixed (https://llvm.org/PR48173). - !same_as<_Ip, bool> && // Currently, clang does not handle bool correctly. 
- movable<_Ip> && requires(_Ip __i) { - typename iter_difference_t<_Ip>; - requires __signed_integer_like>; - { ++__i } -> same_as<_Ip&>; // not required to be equality-preserving - __i++; // not required to be equality-preserving - }; +concept weakly_incrementable = movable<_Ip> && requires(_Ip __i) { + typename iter_difference_t<_Ip>; + requires __signed_integer_like>; + { ++__i } -> same_as<_Ip&>; // not required to be equality-preserving + __i++; // not required to be equality-preserving +}; // [iterator.concept.inc] template diff --git a/libcxx/include/__math/traits.h b/libcxx/include/__math/traits.h index 00db2a8289fb3..ff22cee7305d7 100644 --- a/libcxx/include/__math/traits.h +++ b/libcxx/include/__math/traits.h @@ -25,33 +25,26 @@ namespace __math { // signbit -// TODO(LLVM 22): Remove conditional once support for Clang 19 is dropped. -#if defined(_LIBCPP_COMPILER_GCC) || __has_constexpr_builtin(__builtin_signbit) -# define _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_CONSTEXPR_SINCE_CXX23 -#else -# define _LIBCPP_SIGNBIT_CONSTEXPR -#endif - // The universal C runtime (UCRT) in the WinSDK provides floating point overloads // for std::signbit(). By defining our overloads as templates, we can work around // this issue as templates are less preferred than non-template functions. 
template -[[__nodiscard__]] inline _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_HIDE_FROM_ABI bool signbit(float __x) _NOEXCEPT { +[[__nodiscard__]] inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool signbit(float __x) _NOEXCEPT { return __builtin_signbit(__x); } template -[[__nodiscard__]] inline _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_HIDE_FROM_ABI bool signbit(double __x) _NOEXCEPT { +[[__nodiscard__]] inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool signbit(double __x) _NOEXCEPT { return __builtin_signbit(__x); } template -[[__nodiscard__]] inline _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_HIDE_FROM_ABI bool signbit(long double __x) _NOEXCEPT { +[[__nodiscard__]] inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool signbit(long double __x) _NOEXCEPT { return __builtin_signbit(__x); } template ::value, int> = 0> -[[__nodiscard__]] inline _LIBCPP_SIGNBIT_CONSTEXPR _LIBCPP_HIDE_FROM_ABI bool signbit(_A1 __x) _NOEXCEPT { +[[__nodiscard__]] inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool signbit(_A1 __x) _NOEXCEPT { return __x < 0; } diff --git a/libcxx/include/__ranges/transform_view.h b/libcxx/include/__ranges/transform_view.h index ae85dfa452d72..ab1adf9cdbe68 100644 --- a/libcxx/include/__ranges/transform_view.h +++ b/libcxx/include/__ranges/transform_view.h @@ -13,7 +13,6 @@ #include <__compare/three_way_comparable.h> #include <__concepts/constructible.h> #include <__concepts/convertible_to.h> -#include <__concepts/copyable.h> #include <__concepts/derived_from.h> #include <__concepts/equality_comparable.h> #include <__concepts/invocable.h> @@ -64,7 +63,7 @@ concept __regular_invocable_with_range_ref = regular_invocable<_Fn, range_refere template concept __transform_view_constraints = view<_View> && is_object_v<_Fn> && regular_invocable<_Fn&, range_reference_t<_View>> && - __is_referenceable_v>>; + __referenceable>>; # if _LIBCPP_STD_VER >= 23 template diff --git a/libcxx/include/__tree b/libcxx/include/__tree index 
0738c8c6a5e2b..694796922c914 100644 --- a/libcxx/include/__tree +++ b/libcxx/include/__tree @@ -823,18 +823,6 @@ public: using __node_allocator _LIBCPP_NODEBUG = __rebind_alloc<__alloc_traits, __node>; using __node_traits _LIBCPP_NODEBUG = allocator_traits<__node_allocator>; -// TODO(LLVM 22): Remove this check -#ifndef _LIBCPP_ABI_TREE_REMOVE_NODE_POINTER_UB - static_assert(sizeof(__node_base_pointer) == sizeof(__end_node_pointer) && _LIBCPP_ALIGNOF(__node_base_pointer) == - _LIBCPP_ALIGNOF(__end_node_pointer), - "It looks like you are using std::__tree (an implementation detail for (multi)map/set) with a fancy " - "pointer type that thas a different representation depending on whether it points to a __tree base " - "pointer or a __tree node pointer (both of which are implementation details of the standard library). " - "This means that your ABI is being broken between LLVM 19 and LLVM 20. If you don't care about your " - "ABI being broken, define the _LIBCPP_ABI_TREE_REMOVE_NODE_POINTER_UB macro to silence this " - "diagnostic."); -#endif - private: // check for sane allocator pointer rebinding semantics. Rebinding the // allocator for a new pointer type should be exactly the same as rebinding diff --git a/libcxx/include/__type_traits/reference_constructs_from_temporary.h b/libcxx/include/__type_traits/reference_constructs_from_temporary.h index 2ff549b4e15ce..3d097ce90cb09 100644 --- a/libcxx/include/__type_traits/reference_constructs_from_temporary.h +++ b/libcxx/include/__type_traits/reference_constructs_from_temporary.h @@ -30,14 +30,8 @@ _LIBCPP_NO_SPECIALIZATIONS inline constexpr bool reference_constructs_from_tempo #endif -#if __has_builtin(__reference_constructs_from_temporary) template inline const bool __reference_constructs_from_temporary_v = __reference_constructs_from_temporary(_Tp, _Up); -#else -// TODO(LLVM 22): Remove this as all supported compilers should have __reference_constructs_from_temporary implemented. 
-template -inline const bool __reference_constructs_from_temporary_v = __reference_binds_to_temporary(_Tp, _Up); -#endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list index df7da20cfb611..88d863f494e86 100644 --- a/libcxx/include/forward_list +++ b/libcxx/include/forward_list @@ -284,17 +284,6 @@ struct __forward_node_traits { typedef _NodePtr __node_pointer; typedef __forward_begin_node<_NodePtr> __begin_node; typedef __rebind_pointer_t<_NodePtr, __begin_node> __begin_node_pointer; - -// TODO(LLVM 22): Remove this check -# ifndef _LIBCPP_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB - static_assert(sizeof(__begin_node_pointer) == sizeof(__node_pointer) && _LIBCPP_ALIGNOF(__begin_node_pointer) == - _LIBCPP_ALIGNOF(__node_pointer), - "It looks like you are using std::forward_list with a fancy pointer type that thas a different " - "representation depending on whether it points to a forward_list base pointer or a forward_list node " - "pointer (both of which are implementation details of the standard library). This means that your ABI " - "is being broken between LLVM 19 and LLVM 20. 
If you don't care about your ABI being broken, define " - "the _LIBCPP_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB macro to silence this diagnostic."); -# endif }; template diff --git a/libcxx/include/list b/libcxx/include/list index c5c2a8508999c..0ff85d2ebcb86 100644 --- a/libcxx/include/list +++ b/libcxx/include/list @@ -276,17 +276,6 @@ template struct __list_node_pointer_traits { typedef __rebind_pointer_t<_VoidPtr, __list_node<_Tp, _VoidPtr> > __node_pointer; typedef __rebind_pointer_t<_VoidPtr, __list_node_base<_Tp, _VoidPtr> > __base_pointer; - -// TODO(LLVM 22): Remove this check -# ifndef _LIBCPP_ABI_LIST_REMOVE_NODE_POINTER_UB - static_assert(sizeof(__node_pointer) == sizeof(__node_pointer) && _LIBCPP_ALIGNOF(__base_pointer) == - _LIBCPP_ALIGNOF(__node_pointer), - "It looks like you are using std::list with a fancy pointer type that thas a different representation " - "depending on whether it points to a list base pointer or a list node pointer (both of which are " - "implementation details of the standard library). This means that your ABI is being broken between " - "LLVM 19 and LLVM 20. If you don't care about your ABI being broken, define the " - "_LIBCPP_ABI_LIST_REMOVE_NODE_POINTER_UB macro to silence this diagnostic."); -# endif }; template diff --git a/libcxx/include/tuple b/libcxx/include/tuple index 5f3bb72e0678b..466f501b5f4f8 100644 --- a/libcxx/include/tuple +++ b/libcxx/include/tuple @@ -301,7 +301,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool __tuple_compare_equal(c template >> inline constexpr bool __can_tuple_compare_equal = false; -// TODO(LLVM 22): Remove `tuple_size_v<_Tp> == tuple_size_v<_Up>` here once once LLVM-20 support ends +// TODO(LLVM 23): Remove `tuple_size_v<_Tp> == tuple_size_v<_Up>` here once once LLVM-20 support ends // because the resolution of CWG2369 landed in LLVM-21. 
template requires(tuple_size_v<_Tp> == tuple_size_v<_Up>) @@ -328,7 +328,7 @@ concept __tuple_like_no_tuple = __tuple_like<_Tp> && !__is_tuple_v<_Tp>; template struct __tuple_common_comparison_category_impl {}; -// TODO(LLVM 22): Remove `tuple_size_v<_Tp> == tuple_size_v<_Up>` here once once LLVM-20 support ends +// TODO(LLVM 23): Remove `tuple_size_v<_Tp> == tuple_size_v<_Up>` here once once LLVM-20 support ends // because the resolution of CWG2369 landed in LLVM-21. template requires(tuple_size_v<_Tp> == tuple_size_v<_Up>) && requires { diff --git a/libcxx/include/variant b/libcxx/include/variant index 9beef146f203c..8e958581a6b07 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -1299,7 +1299,7 @@ public: __impl_.__swap(__that.__impl_); } -# if _LIBCPP_STD_VER >= 26 && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER +# if _LIBCPP_STD_VER >= 26 // Helper class to implement [variant.visit]/10 // Constraints: The call to visit does not use an explicit template-argument-list // that begins with a type template-argument. 
diff --git a/libcxx/test/libcxx-03/utilities/meta/is_referenceable.compile.pass.cpp b/libcxx/test/libcxx-03/utilities/meta/is_referenceable.compile.pass.cpp index 093bbae289723..f39d1a5da41af 100644 --- a/libcxx/test/libcxx-03/utilities/meta/is_referenceable.compile.pass.cpp +++ b/libcxx/test/libcxx-03/utilities/meta/is_referenceable.compile.pass.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // -// __is_referenceable_v +// __libcpp_is_referenceable // // [defns.referenceable] defines "a referenceable type" as: // An object type, a function type that does not have cv-qualifiers diff --git a/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp b/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp index 3f17f21e8c108..20887b8cf2678 100644 --- a/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp +++ b/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp @@ -220,16 +220,9 @@ int main(int, char**) { ASSERT_CONSTEXPR_CXX23(std::isnormal(-1.0) == 1); ASSERT_CONSTEXPR_CXX23(std::isnormal(-1.0L) == 1); -// TODO(LLVM 22): Remove `__has_constexpr_builtin` conditional once support for Clang 19 is dropped. 
-#if !__has_constexpr_builtin(__builtin_signbit) - ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0f) == 1); - ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0) == 1); - ASSERT_NOT_CONSTEXPR_CXX23(std::signbit(-1.0L) == 1); -#else ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0f) == 1); ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0) == 1); ASSERT_CONSTEXPR_CXX23(std::signbit(-1.0L) == 1); -#endif ASSERT_NOT_CONSTEXPR_CXX23(std::isgreater(-1.0f, 0.0f) == 0); ASSERT_NOT_CONSTEXPR_CXX23(std::isgreater(-1.0, 0.0) == 0); diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp index 09ebd0069b3a9..3e9bdd98cd394 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp @@ -8,15 +8,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 -// With clang-cl, some warnings have a 'which is a Microsoft extension' suffix -// which break the tests. But #102851 will turn it into an error, making the test pass. -// However, upstream libcxx buildbots do not build clang from source while testing, so -// this tests still expected to fail on these bots. -// -// TODO(LLVM 22): Remove '0-1' from 'expected-error-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}}}' -// and remove 'expected-warning-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}, which is a Microsoft extension}}' -// once LLVM 22 releases. See https://llvm.org/PR104885. 
- // Test the mandates // template constexpr auto transform_error(F&& f) &; @@ -55,41 +46,39 @@ void test() { { std::expected e; e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}[expected.object.general] A program that instantiates the definition of template expected for {{.*}} is ill-formed.}} - // expected-error-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}}} + // expected-error-re@*:* {{union member {{.*}} has reference type {{.*}}}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}} + // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}[expected.object.general] A program that instantiates the definition of template expected for {{.*}} is ill-formed.}} - // expected-warning-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}, which is a Microsoft extension}} } // Test const& overload { const std::expected e; e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 0-2 {{no matching constructor for initialization of{{.*}}}} + // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 0-2 {{no matching constructor for initialization of{{.*}}}} + // 
expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}} } // Test && overload { std::expected e; std::move(e).transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 0-2 {{no matching constructor for initialization of{{.*}}}} + // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}} std::move(e).transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 0-2 {{no matching constructor for initialization of{{.*}}}} + // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}} } // Test const&& overload { const std::expected e; std::move(e).transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 0-2 {{no matching constructor for initialization of{{.*}}}} + // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}} std::move(e).transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 0-2 {{no matching constructor for initialization of{{.*}}}} + // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}} } } // clang-format on diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp index 9fd7452af64fb..c5acc27af03ea 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp +++ 
b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp @@ -8,16 +8,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 -// With clang-cl, some warnings have a 'which is a Microsoft extension' suffix -// which break the tests. But #102851 will turn it into an error, making the test pass. -// However, upstream libcxx buildbots do not build clang from source while testing, so -// this tests still expected to fail on these bots. -// -// TODO(LLVM 22): Remove '0-1' from 'expected-error-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}}}' -// and remove 'expected-warning-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}, which is a Microsoft extension}}' -// and remove 'expected-error-re@*:* 0-1 {{call to deleted constructor of {{.*}}}}' -// once LLVM 22 releases. See See https://llvm.org/PR104885. - // Test the mandates // template constexpr auto transform_error(F&& f) &; @@ -56,43 +46,36 @@ void test() { { std::expected e; e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}} + // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}A program that instantiates expected with a E that is not a valid argument for unexpected is ill-formed}} - // expected-error-re@*:* 0-1 {{call to deleted constructor of {{.*}}}} - // expected-error-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}}} + // expected-error-re@*:* {{union member {{.*}} has reference type {{.*}}}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}} + // expected-error-re@*:* 
{{no matching constructor for initialization of{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}A program that instantiates expected with a E that is not a valid argument for unexpected is ill-formed}} - // expected-warning-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}, which is a Microsoft extension}} } // Test const& overload { const std::expected e; e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}} + // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}} - // expected-error-re@*:* 0-1 {{call to deleted constructor of {{.*}}}} + // expected-error-re@*:* {{no matching constructor for initialization of{{.*}}}} } // Test && overload { std::expected e; std::move(e).transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}} std::move(e).transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}} } // Test const&& overload { const std::expected e; std::move(e).transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 0-1 {{no matching constructor for 
initialization of{{.*}}}} std::move(e).transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} - // expected-error-re@*:* 0-1 {{no matching constructor for initialization of{{.*}}}} } } // clang-format on diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp index 20e0a5ed66bd0..68fe8b6de41d6 100644 --- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp +++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp @@ -8,8 +8,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME -// The tested functionality needs deducing this. -// XFAIL: apple-clang // diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp index 8a79dd4d50f20..4ae63e896caed 100644 --- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp +++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp @@ -8,8 +8,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME -// The tested functionality needs deducing this. 
-// XFAIL: apple-clang // diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp index 146ceba58872e..77df72d3c4c6c 100644 --- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp +++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp @@ -8,7 +8,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME -// XFAIL: apple-clang // diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp index d99675a71f321..9b7c8a7f4f8b4 100644 --- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp +++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp @@ -9,6 +9,8 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // template @@ -25,10 +27,6 @@ #include "make_string.h" #include "min_allocator.h" -#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER) -TEST_CLANG_DIAGNOSTIC_IGNORED("-Wdeprecated-declarations") -#endif - template void test(From value) { auto store = std::make_format_args(value); diff --git a/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp index c7dd82d726b3a..cbddc4f437a53 100644 --- a/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp +++ b/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp @@ -32,7 +32,7 @@ void test(From value) { else assert(false); }; -#if TEST_STD_VER >= 26 && 
defined(TEST_HAS_EXPLICIT_THIS_PARAMETER) +#if TEST_STD_VER >= 26 format_args.get(0).visit(visitor); #else std::visit_format_arg(visitor, format_args.get(0)); @@ -47,7 +47,7 @@ void test_handle(T value) { std::basic_format_args format_args{store}; auto visitor = [](auto a) { assert((std::is_same_v::handle>)); }; -#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER) +#if TEST_STD_VER >= 26 format_args.get(0).visit(visitor); #else std::visit_format_arg(visitor, format_args.get(0)); @@ -73,7 +73,7 @@ void test_string_view(From value) { else assert(false); }; -#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER) +#if TEST_STD_VER >= 26 format_args.get(0).visit(visitor); #else std::visit_format_arg(visitor, format_args.get(0)); diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR20855_tuple_ref_binding_diagnostics.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR20855_tuple_ref_binding_diagnostics.pass.cpp index d78de0eec8e53..0f6a6734264c3 100644 --- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR20855_tuple_ref_binding_diagnostics.pass.cpp +++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR20855_tuple_ref_binding_diagnostics.pass.cpp @@ -16,17 +16,6 @@ #include #include #include -#include "test_macros.h" - -#if TEST_HAS_BUILTIN(__reference_constructs_from_temporary) -# define ASSERT_REFERENCE_BINDS_TEMPORARY(...) static_assert(__reference_constructs_from_temporary(__VA_ARGS__), "") -# define ASSERT_NOT_REFERENCE_BINDS_TEMPORARY(...) \ - static_assert(!__reference_constructs_from_temporary(__VA_ARGS__), "") -#else -// TODO(LLVM 22): Remove this as all support compilers should have __reference_constructs_from_temporary implemented. -# define ASSERT_REFERENCE_BINDS_TEMPORARY(...) static_assert(__reference_binds_to_temporary(__VA_ARGS__), "") -# define ASSERT_NOT_REFERENCE_BINDS_TEMPORARY(...) 
static_assert(!__reference_binds_to_temporary(__VA_ARGS__), "") -#endif template struct ConvertsTo { @@ -42,17 +31,6 @@ struct ConvertsTo { struct Base {}; struct Derived : Base {}; - -static_assert(std::is_same::value, ""); -ASSERT_REFERENCE_BINDS_TEMPORARY(std::string const&, decltype("abc")); -ASSERT_REFERENCE_BINDS_TEMPORARY(std::string const&, decltype(("abc"))); -ASSERT_REFERENCE_BINDS_TEMPORARY(std::string const&, const char*&&); - -ASSERT_NOT_REFERENCE_BINDS_TEMPORARY(int&, const ConvertsTo&); -ASSERT_NOT_REFERENCE_BINDS_TEMPORARY(const int&, ConvertsTo&); -ASSERT_NOT_REFERENCE_BINDS_TEMPORARY(Base&, Derived&); - - static_assert(std::is_constructible>::value, ""); static_assert(std::is_constructible>::value, ""); diff --git a/libcxx/test/std/utilities/variant/variant.visit.member/robust_against_adl.pass.cpp b/libcxx/test/std/utilities/variant/variant.visit.member/robust_against_adl.pass.cpp index 7be7c7ff9122b..38cf34a9c699c 100644 --- a/libcxx/test/std/utilities/variant/variant.visit.member/robust_against_adl.pass.cpp +++ b/libcxx/test/std/utilities/variant/variant.visit.member/robust_against_adl.pass.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 -// XFAIL: apple-clang // diff --git a/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp b/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp index f68112d30fc35..aeb1297c136ae 100644 --- a/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp +++ b/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp @@ -7,8 +7,6 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 -// The tested functionality needs deducing this. 
-// XFAIL: apple-clang // diff --git a/libcxx/test/std/utilities/variant/variant.visit.member/visit_return_type.pass.cpp b/libcxx/test/std/utilities/variant/variant.visit.member/visit_return_type.pass.cpp index 90320ae518c34..7ca05908ab340 100644 --- a/libcxx/test/std/utilities/variant/variant.visit.member/visit_return_type.pass.cpp +++ b/libcxx/test/std/utilities/variant/variant.visit.member/visit_return_type.pass.cpp @@ -7,8 +7,6 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 -// The tested functionality needs deducing this. -// XFAIL: apple-clang // diff --git a/libcxx/test/support/test_basic_format_arg.h b/libcxx/test/support/test_basic_format_arg.h index f51f6e97cbed0..99cd558c3c5bf 100644 --- a/libcxx/test/support/test_basic_format_arg.h +++ b/libcxx/test/support/test_basic_format_arg.h @@ -21,7 +21,7 @@ bool test_basic_format_arg(std::basic_format_arg arg, T expected) { else return false; }; -#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER) +#if TEST_STD_VER >= 26 return arg.visit(std::move(visitor)); #else return std::visit_format_arg(std::move(visitor), arg); diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h index c4e1600572456..8d88d6fad7d0b 100644 --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -531,13 +531,6 @@ inline Tp const& DoNotOptimize(Tp const& value) { # define TEST_IF_AIX(arg_true, arg_false) arg_false #endif -// Clang-18 has support for deducing this, but it does not set the FTM. -#ifdef _LIBCPP_USE_FROZEN_CXX03_HEADERS -// This is a C++20 featue, so we don't care whether the compiler could support it -#elif defined(_LIBCPP_VERSION) && _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER -# define TEST_HAS_EXPLICIT_THIS_PARAMETER -#endif - // Placement `operator new`/`operator new[]` are not yet constexpr in C++26 // when using MS ABI, because they are from . 
#if defined(__cpp_lib_constexpr_new) && __cpp_lib_constexpr_new >= 202406L From b126bceb0ad8e6d48cd0c8322e0f7539386e11cf Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 30 Oct 2025 10:33:44 +0100 Subject: [PATCH 170/539] [GVN] Add tests for pointer replacement with different addr size (NFC) --- llvm/test/Transforms/GVN/assume-equal.ll | 44 ++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/llvm/test/Transforms/GVN/assume-equal.ll b/llvm/test/Transforms/GVN/assume-equal.ll index bbbc5c58584a6..a38980169fc52 100644 --- a/llvm/test/Transforms/GVN/assume-equal.ll +++ b/llvm/test/Transforms/GVN/assume-equal.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -passes=gvn -S | FileCheck %s +target datalayout = "p1:64:64:64:32" + %struct.A = type { ptr } @_ZTV1A = available_externally unnamed_addr constant [4 x ptr] [ptr null, ptr @_ZTI1A, ptr @_ZN1A3fooEv, ptr @_ZN1A3barEv], align 8 @_ZTI1A = external constant ptr @@ -372,6 +374,20 @@ define i1 @assume_ptr_eq_different_prov_does_not_matter_icmp(ptr %p, ptr %p2) { ret i1 %c } +define i1 @assume_ptr_eq_different_prov_does_not_matter_icmp_addrsize(ptr addrspace(1) %p, ptr addrspace(1) %p2) { +; CHECK-LABEL: define i1 @assume_ptr_eq_different_prov_does_not_matter_icmp_addrsize( +; CHECK-SAME: ptr addrspace(1) [[P:%.*]], ptr addrspace(1) [[P2:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(1) [[P]], [[P2]] +; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: [[C:%.*]] = icmp eq ptr addrspace(1) [[P]], null +; CHECK-NEXT: ret i1 [[C]] +; + %cmp = icmp eq ptr addrspace(1) %p, %p2 + call void @llvm.assume(i1 %cmp) + %c = icmp eq ptr addrspace(1) %p2, null + ret i1 %c +} + ; This is not correct, as it may change the provenance exposed by ptrtoint. ; We still allow it for now. 
define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoint(ptr %p, ptr %p2) { @@ -388,6 +404,20 @@ define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoint(ptr %p, ptr %p ret i64 %int } +define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoint_addrsize(ptr addrspace(1) %p, ptr addrspace(1) %p2) { +; CHECK-LABEL: define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoint_addrsize( +; CHECK-SAME: ptr addrspace(1) [[P:%.*]], ptr addrspace(1) [[P2:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(1) [[P]], [[P2]] +; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: [[INT:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +; CHECK-NEXT: ret i64 [[INT]] +; + %cmp = icmp eq ptr addrspace(1) %p, %p2 + call void @llvm.assume(i1 %cmp) + %int = ptrtoint ptr addrspace(1) %p2 to i64 + ret i64 %int +} + define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoaddr(ptr %p, ptr %p2) { ; CHECK-LABEL: define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoaddr( ; CHECK-SAME: ptr [[P:%.*]], ptr [[P2:%.*]]) { @@ -402,6 +432,20 @@ define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoaddr(ptr %p, ptr % ret i64 %int } +define i32 @assume_ptr_eq_different_prov_does_not_matter_ptrtoaddr_addrsize(ptr addrspace(1) %p, ptr addrspace(1) %p2) { +; CHECK-LABEL: define i32 @assume_ptr_eq_different_prov_does_not_matter_ptrtoaddr_addrsize( +; CHECK-SAME: ptr addrspace(1) [[P:%.*]], ptr addrspace(1) [[P2:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(1) [[P]], [[P2]] +; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: [[INT:%.*]] = ptrtoaddr ptr addrspace(1) [[P]] to i32 +; CHECK-NEXT: ret i32 [[INT]] +; + %cmp = icmp eq ptr addrspace(1) %p, %p2 + call void @llvm.assume(i1 %cmp) + %int = ptrtoaddr ptr addrspace(1) %p2 to i32 + ret i32 %int +} + define i8 @assume_ptr_eq_same_prov(ptr %p, i64 %x) { ; CHECK-LABEL: define i8 @assume_ptr_eq_same_prov( ; CHECK-SAME: ptr [[P:%.*]], i64 [[X:%.*]]) { From 
488d47837e209449f3552cbad7849558b82b24d1 Mon Sep 17 00:00:00 2001 From: Pankaj Dwivedi Date: Thu, 30 Oct 2025 15:05:25 +0530 Subject: [PATCH 171/539] [AMDGPU] insert eof white space (#165673) --- llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp index b5e2d76db662e..65e6ed9d1d428 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -188,4 +188,4 @@ INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() { return new AMDGPUUniformIntrinsicCombineLegacy(); -} \ No newline at end of file +} From f1cff17977dc5f184439562697723af30c99df80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Thu, 30 Oct 2025 10:46:37 +0100 Subject: [PATCH 172/539] [ORC] Fix missing include for MemoryAccess interface (NFC) (#165576) MemoryAccess base class was included from Core.h when it was a subclass of ExecutorProcessControl, but this changed in 0faa181434cf959110651fe974bef31e7390eba8 --- llvm/include/llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h index c69b6f736651e..86207265021c5 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// Implements ExecutorProcessControl::MemoryAccess by making calls to +// Implements the MemoryAccess interface by making calls to // ExecutorProcessControl::callWrapperAsync. 
// // This simplifies the implementaton of new ExecutorProcessControl instances, @@ -19,6 +19,7 @@ #define LLVM_EXECUTIONENGINE_ORC_EPCGENERICMEMORYACCESS_H #include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/MemoryAccess.h" namespace llvm { namespace orc { From 82cc57a41a30631b105ed597d4bc68cb622f7e07 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Thu, 30 Oct 2025 11:16:45 +0100 Subject: [PATCH 173/539] [clang][NFC] Make ellipse strings constexpr (#165680) Also rename map to Map, remove the m_ prefix from member variables and fix the naming of the existing color variables. --- clang/lib/Frontend/TextDiagnostic.cpp | 171 +++++++++++++------------- 1 file changed, 86 insertions(+), 85 deletions(-) diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp index c33d8f8ca9ebd..aea3e72d92a84 100644 --- a/clang/lib/Frontend/TextDiagnostic.cpp +++ b/clang/lib/Frontend/TextDiagnostic.cpp @@ -22,22 +22,16 @@ using namespace clang; -static const enum raw_ostream::Colors noteColor = raw_ostream::CYAN; -static const enum raw_ostream::Colors remarkColor = - raw_ostream::BLUE; -static const enum raw_ostream::Colors fixitColor = - raw_ostream::GREEN; -static const enum raw_ostream::Colors caretColor = - raw_ostream::GREEN; -static const enum raw_ostream::Colors warningColor = - raw_ostream::MAGENTA; -static const enum raw_ostream::Colors templateColor = - raw_ostream::CYAN; -static const enum raw_ostream::Colors errorColor = raw_ostream::RED; -static const enum raw_ostream::Colors fatalColor = raw_ostream::RED; +static constexpr raw_ostream::Colors NoteColor = raw_ostream::CYAN; +static constexpr raw_ostream::Colors RemarkColor = raw_ostream::BLUE; +static constexpr raw_ostream::Colors FixitColor = raw_ostream::GREEN; +static constexpr raw_ostream::Colors CaretColor = raw_ostream::GREEN; +static constexpr raw_ostream::Colors WarningColor = raw_ostream::MAGENTA; +static constexpr raw_ostream::Colors TemplateColor = 
raw_ostream::CYAN; +static constexpr raw_ostream::Colors ErrorColor = raw_ostream::RED; +static constexpr raw_ostream::Colors FatalColor = raw_ostream::RED; // Used for changing only the bold attribute. -static const enum raw_ostream::Colors savedColor = - raw_ostream::SAVEDCOLOR; +static constexpr raw_ostream::Colors SavedColor = raw_ostream::SAVEDCOLOR; // Magenta is taken for 'warning'. Red is already 'error' and 'cyan' // is already taken for 'note'. Green is already used to underline @@ -95,11 +89,11 @@ static void applyTemplateHighlighting(raw_ostream &OS, StringRef Str, Str = Str.substr(Pos + 1); if (Normal) - OS.changeColor(templateColor, true); + OS.changeColor(TemplateColor, true); else { OS.resetColor(); if (Bold) - OS.changeColor(savedColor, true); + OS.changeColor(SavedColor, true); } Normal = !Normal; } @@ -289,46 +283,46 @@ static void genColumnByteMapping(StringRef SourceLine, unsigned TabStop, namespace { struct SourceColumnMap { SourceColumnMap(StringRef SourceLine, unsigned TabStop) - : m_SourceLine(SourceLine) { + : SourceLine(SourceLine) { - genColumnByteMapping(SourceLine, TabStop, m_columnToByte, m_byteToColumn); + genColumnByteMapping(SourceLine, TabStop, ColumnToByte, ByteToColumn); - assert(m_byteToColumn.size()==SourceLine.size()+1); - assert(0 < m_byteToColumn.size() && 0 < m_columnToByte.size()); - assert(m_byteToColumn.size() == - static_cast(m_columnToByte.back().V + 1)); - assert(static_cast(m_byteToColumn.back().V + 1) == - m_columnToByte.size()); + assert(ByteToColumn.size() == SourceLine.size() + 1); + assert(0 < ByteToColumn.size() && 0 < ColumnToByte.size()); + assert(ByteToColumn.size() == + static_cast(ColumnToByte.back().V + 1)); + assert(static_cast(ByteToColumn.back().V + 1) == + ColumnToByte.size()); } - Columns columns() const { return m_byteToColumn.back(); } - Bytes bytes() const { return m_columnToByte.back(); } + Columns columns() const { return ByteToColumn.back(); } + Bytes bytes() const { return 
ColumnToByte.back(); } /// Map a byte to the column which it is at the start of, or return -1 /// if it is not at the start of a column (for a UTF-8 trailing byte). Columns byteToColumn(Bytes N) const { - assert(0 <= N.V && N.V < static_cast(m_byteToColumn.size())); - return m_byteToColumn[N.V]; + assert(0 <= N.V && N.V < static_cast(ByteToColumn.size())); + return ByteToColumn[N.V]; } /// Map a byte to the first column which contains it. Columns byteToContainingColumn(Bytes N) const { - assert(0 <= N.V && N.V < static_cast(m_byteToColumn.size())); - while (!m_byteToColumn[N.V].isValid()) + assert(0 <= N.V && N.V < static_cast(ByteToColumn.size())); + while (!ByteToColumn[N.V].isValid()) --N.V; - return m_byteToColumn[N.V]; + return ByteToColumn[N.V]; } /// Map a column to the byte which starts the column, or return -1 if /// the column the second or subsequent column of an expanded tab or similar /// multi-column entity. Bytes columnToByte(Columns N) const { - assert(0 <= N.V && N.V < static_cast(m_columnToByte.size())); - return m_columnToByte[N.V]; + assert(0 <= N.V && N.V < static_cast(ColumnToByte.size())); + return ColumnToByte[N.V]; } /// Map from a byte index to the next byte which starts a column. Bytes startOfNextColumn(Bytes N) const { - assert(0 <= N.V && N.V < static_cast(m_byteToColumn.size() - 1)); + assert(0 <= N.V && N.V < static_cast(ByteToColumn.size() - 1)); N = N.next(); while (!byteToColumn(N).isValid()) N = N.next(); @@ -337,21 +331,19 @@ struct SourceColumnMap { /// Map from a byte index to the previous byte which starts a column. 
Bytes startOfPreviousColumn(Bytes N) const { - assert(0 < N.V && N.V < static_cast(m_byteToColumn.size())); + assert(0 < N.V && N.V < static_cast(ByteToColumn.size())); N = N.prev(); while (!byteToColumn(N).isValid()) N = N.prev(); return N; } - StringRef getSourceLine() const { - return m_SourceLine; - } + StringRef getSourceLine() const { return SourceLine; } private: - StringRef m_SourceLine; - SmallVector m_byteToColumn; - SmallVector m_columnToByte; + StringRef SourceLine; + SmallVector ByteToColumn; + SmallVector ColumnToByte; }; } // end anonymous namespace @@ -361,12 +353,12 @@ static void selectInterestingSourceRegion(std::string &SourceLine, std::string &CaretLine, std::string &FixItInsertionLine, Columns NonGutterColumns, - const SourceColumnMap &map) { + const SourceColumnMap &Map) { Columns CaretColumns = Columns(CaretLine.size()); Columns FixItColumns = Columns(llvm::sys::locale::columnWidth(FixItInsertionLine)); Columns MaxColumns = - std::max({map.columns().V, CaretColumns.V, FixItColumns.V}); + std::max({Map.columns().V, CaretColumns.V, FixItColumns.V}); // if the number of columns is less than the desired number we're done if (MaxColumns <= NonGutterColumns) return; @@ -415,14 +407,14 @@ static void selectInterestingSourceRegion(std::string &SourceLine, // CaretEnd may have been set at the middle of a character // If it's not at a character's first column then advance it past the current // character. 
- while (CaretEnd < map.columns() && !map.columnToByte(CaretEnd).isValid()) + while (CaretEnd < Map.columns() && !Map.columnToByte(CaretEnd).isValid()) CaretEnd = CaretEnd.next(); assert( - (CaretStart > map.columns() || map.columnToByte(CaretStart).isValid()) && + (CaretStart > Map.columns() || Map.columnToByte(CaretStart).isValid()) && "CaretStart must not point to a column in the middle of a source" " line character"); - assert((CaretEnd > map.columns() || map.columnToByte(CaretEnd).isValid()) && + assert((CaretEnd > Map.columns() || Map.columnToByte(CaretEnd).isValid()) && "CaretEnd must not point to a column in the middle of a source line" " character"); @@ -431,20 +423,19 @@ static void selectInterestingSourceRegion(std::string &SourceLine, // number of columns we have, try to grow the slice to encompass // more context. - Bytes SourceStart = map.columnToByte(std::min(CaretStart.V, map.columns().V)); - Bytes SourceEnd = map.columnToByte(std::min(CaretEnd.V, map.columns().V)); + Bytes SourceStart = Map.columnToByte(std::min(CaretStart.V, Map.columns().V)); + Bytes SourceEnd = Map.columnToByte(std::min(CaretEnd.V, Map.columns().V)); Columns CaretColumnsOutsideSource = CaretEnd - CaretStart - - (map.byteToColumn(SourceEnd) - map.byteToColumn(SourceStart)); + (Map.byteToColumn(SourceEnd) - Map.byteToColumn(SourceStart)); - char const *front_ellipse = " ..."; - char const *front_space = " "; - char const *back_ellipse = "..."; - Columns EllipsesColumns = - Columns(strlen(front_ellipse) + strlen(back_ellipse)); + constexpr StringRef FrontEllipse = " ..."; + constexpr StringRef FrontSpace = " "; + constexpr StringRef BackEllipse = "..."; + Columns EllipsesColumns = Columns(FrontEllipse.size() + BackEllipse.size()); - Columns TargetColumns = Columns(NonGutterColumns); + Columns TargetColumns = NonGutterColumns; // Give us extra room for the ellipses // and any of the caret line that extends past the source if (TargetColumns > EllipsesColumns + 
CaretColumnsOutsideSource) @@ -454,25 +445,25 @@ static void selectInterestingSourceRegion(std::string &SourceLine, bool ExpandedRegion = false; if (SourceStart > 0) { - Bytes NewStart = map.startOfPreviousColumn(SourceStart); + Bytes NewStart = Map.startOfPreviousColumn(SourceStart); // Skip over any whitespace we see here; we're looking for // another bit of interesting text. // FIXME: Detect non-ASCII whitespace characters too. while (NewStart > 0 && isWhitespace(SourceLine[NewStart.V])) - NewStart = map.startOfPreviousColumn(NewStart); + NewStart = Map.startOfPreviousColumn(NewStart); // Skip over this bit of "interesting" text. while (NewStart > 0) { - Bytes Prev = map.startOfPreviousColumn(NewStart); + Bytes Prev = Map.startOfPreviousColumn(NewStart); if (isWhitespace(SourceLine[Prev.V])) break; NewStart = Prev; } - assert(map.byteToColumn(NewStart).isValid()); + assert(Map.byteToColumn(NewStart).isValid()); Columns NewColumns = - map.byteToColumn(SourceEnd) - map.byteToColumn(NewStart); + Map.byteToColumn(SourceEnd) - Map.byteToColumn(NewStart); if (NewColumns <= TargetColumns) { SourceStart = NewStart; ExpandedRegion = true; @@ -480,21 +471,21 @@ static void selectInterestingSourceRegion(std::string &SourceLine, } if (SourceEnd < SourceLine.size()) { - Bytes NewEnd = map.startOfNextColumn(SourceEnd); + Bytes NewEnd = Map.startOfNextColumn(SourceEnd); // Skip over any whitespace we see here; we're looking for // another bit of interesting text. // FIXME: Detect non-ASCII whitespace characters too. while (NewEnd < SourceLine.size() && isWhitespace(SourceLine[NewEnd.V])) - NewEnd = map.startOfNextColumn(NewEnd); + NewEnd = Map.startOfNextColumn(NewEnd); // Skip over this bit of "interesting" text. 
while (NewEnd < SourceLine.size() && isWhitespace(SourceLine[NewEnd.V])) - NewEnd = map.startOfNextColumn(NewEnd); + NewEnd = Map.startOfNextColumn(NewEnd); - assert(map.byteToColumn(NewEnd).isValid()); + assert(Map.byteToColumn(NewEnd).isValid()); Columns NewColumns = - map.byteToColumn(NewEnd) - map.byteToColumn(SourceStart); + Map.byteToColumn(NewEnd) - Map.byteToColumn(SourceStart); if (NewColumns <= TargetColumns) { SourceEnd = NewEnd; ExpandedRegion = true; @@ -505,8 +496,8 @@ static void selectInterestingSourceRegion(std::string &SourceLine, break; } - CaretStart = map.byteToColumn(SourceStart); - CaretEnd = map.byteToColumn(SourceEnd) + CaretColumnsOutsideSource; + CaretStart = Map.byteToColumn(SourceStart); + CaretEnd = Map.byteToColumn(SourceEnd) + CaretColumnsOutsideSource; // [CaretStart, CaretEnd) is the slice we want. Update the various // output lines to show only this slice. @@ -516,8 +507,8 @@ static void selectInterestingSourceRegion(std::string &SourceLine, assert(CaretStart <= CaretEnd); Columns BackColumnsRemoved = - map.byteToColumn(Bytes{static_cast(SourceLine.size())}) - - map.byteToColumn(SourceEnd); + Map.byteToColumn(Bytes{static_cast(SourceLine.size())}) - + Map.byteToColumn(SourceEnd); Columns FrontColumnsRemoved = CaretStart; Columns ColumnsKept = CaretEnd - CaretStart; @@ -527,19 +518,19 @@ static void selectInterestingSourceRegion(std::string &SourceLine, // The line needs some truncation, and we'd prefer to keep the front // if possible, so remove the back - if (BackColumnsRemoved > Columns(strlen(back_ellipse))) - SourceLine.replace(SourceEnd.V, std::string::npos, back_ellipse); + if (BackColumnsRemoved > Columns(BackEllipse.size())) + SourceLine.replace(SourceEnd.V, std::string::npos, BackEllipse); // If that's enough then we're done if (FrontColumnsRemoved + ColumnsKept <= Columns(NonGutterColumns)) return; // Otherwise remove the front as well - if (FrontColumnsRemoved > Columns(strlen(front_ellipse))) { - SourceLine.replace(0, 
SourceStart.V, front_ellipse); - CaretLine.replace(0, CaretStart.V, front_space); + if (FrontColumnsRemoved > Columns(FrontEllipse.size())) { + SourceLine.replace(0, SourceStart.V, FrontEllipse); + CaretLine.replace(0, CaretStart.V, FrontSpace); if (!FixItInsertionLine.empty()) - FixItInsertionLine.replace(0, CaretStart.V, front_space); + FixItInsertionLine.replace(0, CaretStart.V, FrontSpace); } } @@ -733,11 +724,21 @@ TextDiagnostic::printDiagnosticLevel(raw_ostream &OS, switch (Level) { case DiagnosticsEngine::Ignored: llvm_unreachable("Invalid diagnostic type"); - case DiagnosticsEngine::Note: OS.changeColor(noteColor, true); break; - case DiagnosticsEngine::Remark: OS.changeColor(remarkColor, true); break; - case DiagnosticsEngine::Warning: OS.changeColor(warningColor, true); break; - case DiagnosticsEngine::Error: OS.changeColor(errorColor, true); break; - case DiagnosticsEngine::Fatal: OS.changeColor(fatalColor, true); break; + case DiagnosticsEngine::Note: + OS.changeColor(NoteColor, true); + break; + case DiagnosticsEngine::Remark: + OS.changeColor(RemarkColor, true); + break; + case DiagnosticsEngine::Warning: + OS.changeColor(WarningColor, true); + break; + case DiagnosticsEngine::Error: + OS.changeColor(ErrorColor, true); + break; + case DiagnosticsEngine::Fatal: + OS.changeColor(FatalColor, true); + break; } } @@ -765,7 +766,7 @@ void TextDiagnostic::printDiagnosticMessage(raw_ostream &OS, if (ShowColors && !IsSupplemental) { // Print primary diagnostic messages in bold and without color, to visually // indicate the transition from continuation notes and other output. 
- OS.changeColor(savedColor, true); + OS.changeColor(SavedColor, true); Bold = true; } @@ -843,7 +844,7 @@ void TextDiagnostic::emitDiagnosticLoc(FullSourceLoc Loc, PresumedLoc PLoc, return; if (DiagOpts.ShowColors) - OS.changeColor(savedColor, true); + OS.changeColor(SavedColor, true); emitFilename(PLoc.getFilename(), Loc.getManager()); switch (DiagOpts.getFormat()) { @@ -1470,7 +1471,7 @@ void TextDiagnostic::emitSnippetAndCaret( if (!CaretLine.empty()) { indentForLineNumbers(); if (DiagOpts.ShowColors) - OS.changeColor(caretColor, true); + OS.changeColor(CaretColor, true); OS << CaretLine << '\n'; if (DiagOpts.ShowColors) OS.resetColor(); @@ -1480,7 +1481,7 @@ void TextDiagnostic::emitSnippetAndCaret( indentForLineNumbers(); if (DiagOpts.ShowColors) // Print fixit line in color - OS.changeColor(fixitColor, false); + OS.changeColor(FixitColor, false); if (DiagOpts.ShowSourceRanges) OS << ' '; OS << FixItInsertionLine << '\n'; From b5fe7c90afff3fc40dadd515e10123f96e06dec9 Mon Sep 17 00:00:00 2001 From: Ritanya-B-Bharadwaj Date: Thu, 30 Oct 2025 15:48:13 +0530 Subject: [PATCH 174/539] [clang][OpenMP] New OpenMP 6.0 threadset clause (#135807) Initial parsing/sema/codegen support for threadset clause in task and taskloop directives [Section 14.8 in in OpenMP 6.0 spec] --------- --- clang/docs/OpenMPSupport.rst | 1324 ++++++++--------- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/AST/OpenMPClause.h | 80 + clang/include/clang/AST/RecursiveASTVisitor.h | 6 + clang/include/clang/Basic/OpenMPKinds.def | 8 +- clang/include/clang/Basic/OpenMPKinds.h | 7 + clang/include/clang/Sema/SemaOpenMP.h | 6 + clang/lib/AST/OpenMPClause.cpp | 8 + clang/lib/AST/StmtProfile.cpp | 2 + clang/lib/Basic/OpenMPKinds.cpp | 19 + clang/lib/CodeGen/CGOpenMPRuntime.cpp | 6 + clang/lib/Parse/ParseOpenMP.cpp | 1 + clang/lib/Sema/SemaOpenMP.cpp | 21 + clang/lib/Sema/TreeTransform.h | 7 + clang/lib/Serialization/ASTReader.cpp | 14 + clang/lib/Serialization/ASTWriter.cpp | 6 + 
clang/test/OpenMP/task_ast_print.cpp | 26 +- clang/test/OpenMP/task_codegen.cpp | 33 + clang/test/OpenMP/task_threadset_messages.cpp | 99 ++ clang/test/OpenMP/taskloop_ast_print.cpp | 16 + clang/test/OpenMP/taskloop_codegen.cpp | 53 + clang/tools/libclang/CIndex.cpp | 2 + flang/include/flang/Lower/OpenMP/Clauses.h | 1 + flang/include/flang/Parser/dump-parse-tree.h | 2 + flang/include/flang/Parser/parse-tree.h | 8 + flang/lib/Lower/OpenMP/Clauses.cpp | 15 + flang/lib/Semantics/check-omp-structure.cpp | 1 + llvm/include/llvm/Frontend/OpenMP/ClauseT.h | 14 +- llvm/include/llvm/Frontend/OpenMP/OMP.td | 6 + 29 files changed, 1118 insertions(+), 674 deletions(-) create mode 100755 clang/test/OpenMP/task_threadset_messages.cpp diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 61b5babbd18a8..10a8d095fede3 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -1,662 +1,662 @@ -.. raw:: html - - - -.. role:: none -.. role:: part -.. role:: good - -.. contents:: - :local: - -============== -OpenMP Support -============== - -Clang fully supports OpenMP 4.5, almost all of 5.0 and most of 5.1/2. -Clang supports offloading to X86_64, AArch64, PPC64[LE], NVIDIA GPUs (all models) and AMD GPUs (all models). - -In addition, the LLVM OpenMP runtime `libomp` supports the OpenMP Tools -Interface (OMPT) on x86, x86_64, AArch64, and PPC64 on Linux, Windows, and macOS. -OMPT is also supported for NVIDIA and AMD GPUs. - -For the list of supported features from OpenMP 5.0 and 5.1 -see `OpenMP implementation details`_ and `OpenMP 51 implementation details`_. - -General improvements -==================== -- New collapse clause scheme to avoid expensive remainder operations. - Compute loop index variables after collapsing a loop nest via the - collapse clause by replacing the expensive remainder operation with - multiplications and additions. 
- -- When using the collapse clause on a loop nest the default behavior - is to automatically extend the representation of the loop counter to - 64 bits for the cases where the sizes of the collapsed loops are not - known at compile time. To prevent this conservative choice and use - at most 32 bits, compile your program with the - `-fopenmp-optimistic-collapse`. - - -GPU devices support -=================== - -Data-sharing modes ------------------- - -Clang supports two data-sharing models for Cuda devices: `Generic` and `Cuda` -modes. The default mode is `Generic`. `Cuda` mode can give an additional -performance and can be activated using the `-fopenmp-cuda-mode` flag. In -`Generic` mode all local variables that can be shared in the parallel regions -are stored in the global memory. In `Cuda` mode local variables are not shared -between the threads and it is user responsibility to share the required data -between the threads in the parallel regions. Often, the optimizer is able to -reduce the cost of `Generic` mode to the level of `Cuda` mode, but the flag, -as well as other assumption flags, can be used for tuning. - -Features not supported or with limited support for Cuda devices ---------------------------------------------------------------- - -- Cancellation constructs are not supported. - -- Doacross loop nest is not supported. - -- User-defined reductions are supported only for trivial types. - -- Nested parallelism: inner parallel regions are executed sequentially. - -- Debug information for OpenMP target regions is supported, but sometimes it may - be required to manually specify the address class of the inspected variables. - In some cases the local variables are actually allocated in the global memory, - but the debug info may be not aware of it. - - -.. 
_OpenMP implementation details: - -OpenMP 5.0 Implementation Details -================================= - -The following table provides a quick overview over various OpenMP 5.0 features -and their implementation status. Please post on the -`Discourse forums (Runtimes - OpenMP category)`_ for more -information or if you want to help with the -implementation. - -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -|Category | Feature | Status | Reviews | -+==============================+==============================================================+==========================+=======================================================================+ -| loop | support != in the canonical loop form | :good:`done` | D54441 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| loop | #pragma omp loop (directive) | :part:`partial` | D145823 (combined forms) | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| loop | #pragma omp loop bind | :part:`worked on` | D144634 (needs review) | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| loop | collapse imperfectly nested loop | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| loop | collapse non-rectangular nested loop | :good:`done` | | 
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| loop | C++ range-base for loop | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| loop | clause: if for SIMD directives | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| loop | inclusive scan (matching C++17 PSTL) | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| memory management | memory allocators | :good:`done` | r341687,r357929 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| memory management | allocate directive and allocate clause | :good:`done` | r355614,r335952 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| OMPD | OMPD interfaces | :good:`done` | https://reviews.llvm.org/D99914 (Supports only HOST(CPU) and Linux | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| OMPT | OMPT interfaces (callback support) | :good:`done` | | 
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| thread affinity | thread affinity | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| task | taskloop reduction | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| task | task affinity | :part:`not upstream` | https://github.com/jklinkenberg/openmp/tree/task-affinity | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| task | clause: depend on the taskwait construct | :good:`done` | D113540 (regular codegen only) | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| task | depend objects and detachable tasks | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| task | mutexinoutset dependence-type for tasks | :good:`done` | D53380,D57576 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| task | combined taskloop constructs | :good:`done` | | 
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| task | master taskloop | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| task | parallel master taskloop | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| task | master taskloop simd | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| task | parallel master taskloop simd | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| SIMD | atomic and simd constructs inside SIMD code | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| SIMD | SIMD nontemporal | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | infer target functions from initializers | :part:`worked on` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | infer target 
variables from initializers | :good:`done` | D146418 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | OMP_TARGET_OFFLOAD environment variable | :good:`done` | D50522 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | support full 'defaultmap' functionality | :good:`done` | D69204 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | device specific functions | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | clause: device_type | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | clause: extended device | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | clause: uses_allocators clause | :good:`done` | https://github.com/llvm/llvm-project/pull/157025 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | clause: in_reduction | :part:`worked on` | r308768 | 
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | omp_get_device_num() | :good:`done` | D54342,D128347 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | structure mapping of references | :none:`unclaimed` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | nested target declare | :good:`done` | D51378 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | implicitly map 'this' (this[:1]) | :good:`done` | D55982 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | allow access to the reference count (omp_target_is_present) | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | requires directive | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | clause: unified_shared_memory | :good:`done` | D52625,D52359 | 
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | clause: unified_address | :part:`partial` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | clause: reverse_offload | :part:`partial` | D52780,D155003 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | clause: atomic_default_mem_order | :good:`done` | D53513 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | clause: dynamic_allocators | :part:`unclaimed parts` | D53079 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | user-defined mappers | :good:`done` | D56326,D58638,D58523,D58074,D60972,D59474 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | map array-section with implicit mapper | :good:`done` | https://github.com/llvm/llvm-project/pull/101101 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | mapping lambda expression | :good:`done` | D51107 | 
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | clause: use_device_addr for target data | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | support close modifier on map clause | :good:`done` | D55719,D55892 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | teams construct on the host device | :good:`done` | r371553 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | support non-contiguous array sections for target update | :good:`done` | https://github.com/llvm/llvm-project/pull/144635 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | pointer attachment | :part:`being repaired` | @abhinavgaba (https://github.com/llvm/llvm-project/pull/153683) | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| atomic | hints for the atomic construct | :good:`done` | D51233 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| base language | C11 support | :good:`done` | | 
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| base language | C++11/14/17 support | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| base language | lambda support | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | array shaping | :good:`done` | D74144 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | library shutdown (omp_pause_resource[_all]) | :good:`done` | D55078 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | metadirectives | :part:`mostly done` | D91944, https://github.com/llvm/llvm-project/pull/128640 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | conditional modifier for lastprivate clause | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | iterator and multidependences | :good:`done` | | 
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | depobj directive and depobj dependency kind | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | user-defined function variants | :good:`done`. | D67294, D64095, D71847, D71830, D109635 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | pointer/reference to pointer based array reductions | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | prevent new type definitions in clauses | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| memory model | memory model update (seq_cst, acq_rel, release, acquire,...) | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ - - -.. _OpenMP 51 implementation details: - -OpenMP 5.1 Implementation Details -================================= - -The following table provides a quick overview over various OpenMP 5.1 features -and their implementation status. -Please post on the -`Discourse forums (Runtimes - OpenMP category)`_ for more -information or if you want to help with the -implementation. 
- -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -|Category | Feature | Status | Reviews | -+==============================+==============================================================+==========================+=======================================================================+ -| atomic | 'compare' clause on atomic construct | :good:`done` | D120290, D120007, D118632, D120200, D116261, D118547, D116637 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| atomic | 'fail' clause on atomic construct | :part:`worked on` | D123235 (in progress) | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| base language | C++ attribute specifier syntax | :good:`done` | D105648 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | 'present' map type modifier | :good:`done` | D83061, D83062, D84422 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | 'present' motion modifier | :good:`done` | D84711, D84712 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | 'present' in defaultmap clause | :good:`done` | D92427 | 
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | map clause reordering based on 'present' modifier | :none:`unclaimed` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | device-specific environment variables | :none:`unclaimed` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | omp_target_is_accessible routine | :good:`done` | https://github.com/llvm/llvm-project/pull/138294 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | omp_get_mapped_ptr routine | :good:`done` | D141545 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | new async target memory copy routines | :good:`done` | D136103 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | thread_limit clause on target construct | :part:`partial` | D141540 (offload), D152054 (host, in progress) | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | has_device_addr clause on target construct | :none:`unclaimed` | | 
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | iterators in map clause or motion clauses | :none:`unclaimed` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | indirect clause on declare target directive | :part:`In Progress` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | allow virtual functions calls for mapped object on device | :part:`partial` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | interop construct | :part:`partial` | parsing/sema done: D98558, D98834, D98815 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | assorted routines for querying interoperable properties | :part:`partial` | D106674 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| loop | Loop tiling transformation | :good:`done` | D76342 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| loop | Loop unrolling transformation | :good:`done` | D99459 | 
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| loop | 'reproducible'/'unconstrained' modifiers in 'order' clause | :part:`partial` | D127855 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| memory management | alignment for allocate directive and clause | :good:`done` | D115683 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| memory management | 'allocator' modifier for allocate clause | :good:`done` | https://github.com/llvm/llvm-project/pull/114883 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| memory management | 'align' modifier for allocate clause | :good:`done` | https://github.com/llvm/llvm-project/pull/121814 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| memory management | new memory management routines | :none:`unclaimed` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| memory management | changes to omp_alloctrait_key enum | :none:`unclaimed` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| memory 
model | seq_cst clause on flush construct | :good:`done` | https://github.com/llvm/llvm-project/pull/114072 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | 'omp_all_memory' keyword and use in 'depend' clause | :good:`done` | D125828, D126321 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | error directive | :good:`done` | D139166 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | scope construct | :good:`done` | D157933, https://github.com/llvm/llvm-project/pull/109197 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | routines for controlling and querying team regions | :part:`partial` | D95003 (libomp only) | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | changes to ompt_scope_endpoint_t enum | :none:`unclaimed` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | omp_display_env routine | :good:`done` | D74956 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | extended 
OMP_PLACES syntax                                    | :none:`unclaimed`        |                                                                       | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc                         | OMP_NUM_TEAMS and OMP_TEAMS_THREAD_LIMIT env vars            | :good:`done`             | D138769                                                               | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc                         | 'target_device' selector in context specifier                | :part:`worked on`        |                                                                       | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc                         | begin/end declare variant                                    | :good:`done`             | D71179                                                                | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc                         | dispatch construct and function variant argument adjustment  | :part:`worked on`        | D99537, D99679                                                        | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc                         | assumes directives                                           | :part:`worked on`        |                                                                       | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc                         | assume directive                                             | :good:`done`             |                                                                       | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc                         | nothing directive                                            | :good:`done`             | D123286                                                               | 
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | masked construct and related combined constructs | :good:`done` | D99995, D100514, PR-121741(parallel_masked_taskloop) | -| | | | PR-121746(parallel_masked_task_loop_simd),PR-121914(masked_taskloop) | -| | | | PR-121916(masked_taskloop_simd) | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc | default(firstprivate) & default(private) | :good:`done` | D75591 (firstprivate), D125912 (private) | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| other | deprecating master construct | :none:`unclaimed` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| OMPT | new barrier types added to ompt_sync_region_t enum | :none:`unclaimed` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| OMPT | async data transfers added to ompt_target_data_op_t enum | :none:`unclaimed` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| OMPT | new barrier state values added to ompt_state_t enum | :none:`unclaimed` | | 
-+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| OMPT | new 'emi' callbacks for external monitoring interfaces | :good:`done` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| OMPT | device tracing interface | :none:`in progress` | jplehr | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| task | 'strict' modifier for taskloop construct | :none:`unclaimed` | | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| task | inoutset in depend clause | :good:`done` | D97085, D118383 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| task | nowait clause on taskwait | :part:`partial` | parsing/sema done: D131830, D141531 | -+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ - - -.. _OpenMP 5.2 implementation details: - -OpenMP 5.2 Implementation Details -================================= - -The following table provides a quick overview of various OpenMP 5.2 features -and their implementation status. Please post on the -`Discourse forums (Runtimes - OpenMP category)`_ for more -information or if you want to help with the -implementation. 
- - - -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -|Feature | C/C++ Status | Fortran Status | Reviews | -+=============================================================+===========================+===========================+==========================================================================+ -| omp_in_explicit_task() | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| semantics of explicit_task_var and implicit_task_var | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| ompx sentinel for C/C++ directive extensions | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| ompx prefix for clause extensions | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| if clause on teams construct | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| step modifier added | :none:`unclaimed` | :none:`unclaimed` | | 
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| declare mapper: Add iterator modifier on map clause         | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| memspace and traits modifiers to uses_allocators            | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Add otherwise clause to metadirectives                      | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| doacross clause with support for omp_cur_iteration          | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| position of interop_type in init clause on interop          | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| implicit map type for target enter/exit data                | 
:none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| work OMPT type for work-sharing loop constructs | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| allocate and firstprivate on scope directive | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Change loop consistency for order clause | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Add memspace and traits modifiers to uses_allocators | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Keep original base pointer on map w/o matched candidate | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Pure procedure support for certain directives | :none:`N/A` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| ALLOCATE 
statement support for allocators | :none:`N/A` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| dispatch construct extension to support end directive | :none:`N/A` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ - - - -.. _OpenMP 5.2 Deprecations: - -OpenMP 5.2 Deprecations -======================= - - - -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| | C/C++ Status | Fortran Status | Reviews | -+=============================================================+===========================+===========================+==========================================================================+ -| Linear clause syntax | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| The minus operator | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Map clause modifiers without commas | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| The use of allocate directives with ALLOCATE statement | :good:`N/A` | :none:`unclaimed` | | 
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| uses_allocators list syntax | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| The default clause on metadirectives | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| The delimited form of the declare target directive | :none:`unclaimed` | :good:`N/A` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| The use of the to clause on the declare target directive | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| The syntax of the destroy clause on the depobj construct | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| keyword source and sink as task-dependence modifiers | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| interop types in any position on init clause of interop | 
:none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| ompd prefix usage for some ICVs | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ - -.. _OpenMP 6.0 implementation details: - -OpenMP 6.0 Implementation Details -================================= - -The following table provides a quick overview of various OpenMP 6.0 features -and their implementation status. Please post on the -`Discourse forums (Runtimes - OpenMP category)`_ for more -information or if you want to help with the -implementation. - - -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -|Feature | C/C++ Status | Fortran Status | Reviews | -+=============================================================+===========================+===========================+==========================================================================+ -| free-agent threads | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| threadset clause | :part:`in progress` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Recording of task graphs | :part:`in progress` | :part:`in progress` | clang: jtb20, flang: kparzysz | 
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Parallel inductions | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| init_complete for scan directive | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| loop interchange transformation | :good:`done` | :none:`unclaimed` | Clang (interchange): https://github.com/llvm/llvm-project/pull/93022 | -| | | | Clang (permutation): https://github.com/llvm/llvm-project/pull/92030 | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| loop reverse transformation | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/92916 | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| loop stripe transformation | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/119891 | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| loop fusion transformation | :part:`in progress` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/139293 | 
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| loop index set splitting transformation | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| loop transformation apply clause | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| loop fuse transformation | :good:`done` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| workdistribute construct | | :none:`in progress` | @skc7, @mjklemm | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| task_iteration | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| memscope clause for atomic and flush | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| transparent clause (hull tasks) | :none:`unclaimed` | :none:`unclaimed` | | 
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| rule-based compound directives | :part:`In Progress` | :part:`In Progress` | kparzysz | -| | | | Testing for Fortran missing | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| C23, C++23 | :none:`unclaimed` | | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Fortran 2023 | | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| decl attribute for declarative directives | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| C attribute syntax | :none:`unclaimed` | | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| pure directives in DO CONCURRENT | | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Optional argument for all clauses | :none:`partial` | :none:`In Progress` | Parse/Sema (nowait): https://github.com/llvm/llvm-project/pull/159628 | 
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Function references for locator list items | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| All clauses accept directive name modifier | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Extensions to depobj construct | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Extensions to atomic construct | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Private reductions | :good:`mostly` | :none:`unclaimed` | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938 | -| | | | Codegen: https://github.com/llvm/llvm-project/pull/134709 | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Self maps | :part:`partial` | :none:`unclaimed` | parsing/sema done: https://github.com/llvm/llvm-project/pull/129888 | 
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Release map type for declare mapper | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Extensions to interop construct | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| no_openmp_constructs | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/125933 | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| safe_sync and progress with identifier and API | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| OpenMP directives in concurrent loop regions | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/125621 | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| atomics constructs on concurrent loop regions | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/125621 | 
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Loop construct with DO CONCURRENT | | :part:`In Progress` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| device_type clause for target construct | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| nowait for ancestor target directives | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| New API for devices' num_teams/thread_limit | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Host and device environment variables | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| num_threads ICV and clause accepts list | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Numeric names for environment variables | :none:`unclaimed` | :none:`unclaimed` | | 
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Increment between places for OMP_PLACES | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| OMP_AVAILABLE_DEVICES envirable | :none:`unclaimed` | :none:`unclaimed` | (should wait for "Traits for default device envirable" being done) | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Traits for default device envirable | :part:`in progress` | :none:`unclaimed` | ro-i | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Optionally omit array length expression | :good:`done` | :none:`unclaimed` | (Parse) https://github.com/llvm/llvm-project/pull/148048, | -| | | | (Sema) https://github.com/llvm/llvm-project/pull/152786 | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Canonical loop sequences | :part:`in progress` | :part:`in progress` | Clang: https://github.com/llvm/llvm-project/pull/139293 | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Clarifications to Fortran map semantics | :none:`unclaimed` | :none:`unclaimed` | | 
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| default clause at target construct | :part:`In Progress` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| ref count update use_device_{ptr, addr} | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Clarifications to implicit reductions | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| ref modifier for map clauses | :part:`In Progress` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| map-type modifiers in arbitrary position | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/90499 | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Lift nesting restriction on concurrent loop | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/125621 | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| priority clause for 
target constructs | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| changes to target_data construct | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Non-const do_not_sync for nowait/nogroup | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| need_device_addr modifier for adjust_args clause | :part:`partial` | :none:`unclaimed` | Parsing/Sema: https://github.com/llvm/llvm-project/pull/143442 | -| | | | https://github.com/llvm/llvm-project/pull/149586 | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Prescriptive num_threads | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/160659 | -| | | | https://github.com/llvm/llvm-project/pull/146403 | -| | | | https://github.com/llvm/llvm-project/pull/146404 | -| | | | https://github.com/llvm/llvm-project/pull/146405 | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Message and severity clauses | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/146093 | 
-+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Local clause on declare target | :part:`In Progress` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| groupprivate directive | :part:`In Progress` | :part:`partial` | Flang: kparzysz, mjklemm | -| | | | | -| | | | Flang parser: https://github.com/llvm/llvm-project/pull/153807 | -| | | | Flang sema: https://github.com/llvm/llvm-project/pull/154779 | -| | | | Clang parse/sema: https://github.com/llvm/llvm-project/pull/158134 | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| variable-category on default clause | :good:`done` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Changes to omp_target_is_accessible | :part:`In Progress` | :part:`In Progress` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| defaultmap implicit-behavior 'storage' | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/158336 | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| defaultmap implicit-behavior 'private' | :good:`done` | :none:`unclaimed` | 
https://github.com/llvm/llvm-project/pull/158712 | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ - -.. _OpenMP 6.1 implementation details: - -OpenMP 6.1 Implementation Details (Experimental) -================================================ - -The following table provides a quick overview over various OpenMP 6.1 features -and their implementation status. Since OpenMP 6.1 has not yet been released, the -following features are experimental and are subject to change at any time. -Please post on the `Discourse forums (Runtimes - OpenMP category)`_ for more -information or if you want to help with the -implementation. - -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -|Feature | C/C++ Status | Fortran Status | Reviews | -+=============================================================+===========================+===========================+==========================================================================+ -| dyn_groupprivate clause | :part:`In Progress` | :part:`In Progress` | C/C++: kevinsala (https://github.com/llvm/llvm-project/pull/152651 | -| | | | https://github.com/llvm/llvm-project/pull/152830 | -| | | | https://github.com/llvm/llvm-project/pull/152831) | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| loop flatten transformation | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| loop grid/tile modifiers for sizes clause | 
:none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| attach map-type modifier | :part:`In Progress` | :none:`unclaimed` | C/C++: @abhinavgaba; | -| | | | RT: @abhinavgaba (https://github.com/llvm/llvm-project/pull/149036, | -| | | | https://github.com/llvm/llvm-project/pull/158370) | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ - - -OpenMP Extensions -================= - -The following table provides a quick overview over various OpenMP -extensions and their implementation status. These extensions are not -currently defined by any standard, so links to associated LLVM -documentation are provided. As these extensions mature, they will be -considered for standardization. Please post on the -`Discourse forums (Runtimes - OpenMP category)`_ to provide feedback. 
- -+------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+ -|Category | Feature | Status | Reviews | -+==============================+===================================================================================+==========================+========================================================+ -| atomic extension | `'atomic' strictly nested within 'teams' | :good:`prototyped` | D126323 | -| | `_ | | | -+------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+ -| device extension | `'ompx_hold' map type modifier | :good:`prototyped` | D106509, D106510 | -| | `_ | | | -+------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+ -| device extension | `'ompx_bare' clause on 'target teams' construct | :good:`prototyped` | #66844, #70612 | -| | `_ | | | -+------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+ -| device extension | Multi-dim 'num_teams' and 'thread_limit' clause on 'target teams ompx_bare' | :good:`partial` | #99732, #101407, #102715 | -| | construct | | | -+------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+ - -.. _Discourse forums (Runtimes - OpenMP category): https://discourse.llvm.org/c/runtimes/openmp/35 +.. raw:: html + + + +.. role:: none +.. role:: part +.. role:: good + +.. 
contents:: + :local: + +============== +OpenMP Support +============== + +Clang fully supports OpenMP 4.5, almost all of 5.0 and most of 5.1/2. +Clang supports offloading to X86_64, AArch64, PPC64[LE], NVIDIA GPUs (all models) and AMD GPUs (all models). + +In addition, the LLVM OpenMP runtime `libomp` supports the OpenMP Tools +Interface (OMPT) on x86, x86_64, AArch64, and PPC64 on Linux, Windows, and macOS. +OMPT is also supported for NVIDIA and AMD GPUs. + +For the list of supported features from OpenMP 5.0 and 5.1 +see `OpenMP implementation details`_ and `OpenMP 51 implementation details`_. + +General improvements +==================== +- New collapse clause scheme to avoid expensive remainder operations. + Compute loop index variables after collapsing a loop nest via the + collapse clause by replacing the expensive remainder operation with + multiplications and additions. + +- When using the collapse clause on a loop nest the default behavior + is to automatically extend the representation of the loop counter to + 64 bits for the cases where the sizes of the collapsed loops are not + known at compile time. To prevent this conservative choice and use + at most 32 bits, compile your program with the + `-fopenmp-optimistic-collapse`. + + +GPU devices support +=================== + +Data-sharing modes +------------------ + +Clang supports two data-sharing models for Cuda devices: `Generic` and `Cuda` +modes. The default mode is `Generic`. `Cuda` mode can give an additional +performance and can be activated using the `-fopenmp-cuda-mode` flag. In +`Generic` mode all local variables that can be shared in the parallel regions +are stored in the global memory. In `Cuda` mode local variables are not shared +between the threads and it is user responsibility to share the required data +between the threads in the parallel regions. 
Often, the optimizer is able to +reduce the cost of `Generic` mode to the level of `Cuda` mode, but the flag, +as well as other assumption flags, can be used for tuning. + +Features not supported or with limited support for Cuda devices +--------------------------------------------------------------- + +- Cancellation constructs are not supported. + +- Doacross loop nest is not supported. + +- User-defined reductions are supported only for trivial types. + +- Nested parallelism: inner parallel regions are executed sequentially. + +- Debug information for OpenMP target regions is supported, but sometimes it may + be required to manually specify the address class of the inspected variables. + In some cases the local variables are actually allocated in the global memory, + but the debug info may be not aware of it. + + +.. _OpenMP implementation details: + +OpenMP 5.0 Implementation Details +================================= + +The following table provides a quick overview over various OpenMP 5.0 features +and their implementation status. Please post on the +`Discourse forums (Runtimes - OpenMP category)`_ for more +information or if you want to help with the +implementation. 
+ ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +|Category | Feature | Status | Reviews | ++==============================+==============================================================+==========================+=======================================================================+ +| loop | support != in the canonical loop form | :good:`done` | D54441 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| loop | #pragma omp loop (directive) | :part:`partial` | D145823 (combined forms) | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| loop | #pragma omp loop bind | :part:`worked on` | D144634 (needs review) | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| loop | collapse imperfectly nested loop | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| loop | collapse non-rectangular nested loop | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| loop | C++ range-base for loop | :good:`done` | | 
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| loop | clause: if for SIMD directives | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| loop | inclusive scan (matching C++17 PSTL) | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| memory management | memory allocators | :good:`done` | r341687,r357929 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| memory management | allocate directive and allocate clause | :good:`done` | r355614,r335952 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| OMPD | OMPD interfaces | :good:`done` | https://reviews.llvm.org/D99914 (Supports only HOST(CPU) and Linux | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| OMPT | OMPT interfaces (callback support) | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| thread affinity | thread affinity | :good:`done` | | 
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| task | taskloop reduction | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| task | task affinity | :part:`not upstream` | https://github.com/jklinkenberg/openmp/tree/task-affinity | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| task | clause: depend on the taskwait construct | :good:`done` | D113540 (regular codegen only) | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| task | depend objects and detachable tasks | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| task | mutexinoutset dependence-type for tasks | :good:`done` | D53380,D57576 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| task | combined taskloop constructs | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| task | master taskloop | :good:`done` | | 
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| task | parallel master taskloop | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| task | master taskloop simd | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| task | parallel master taskloop simd | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| SIMD | atomic and simd constructs inside SIMD code | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| SIMD | SIMD nontemporal | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | infer target functions from initializers | :part:`worked on` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | infer target variables from initializers | :good:`done` | D146418 | 
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | OMP_TARGET_OFFLOAD environment variable | :good:`done` | D50522 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | support full 'defaultmap' functionality | :good:`done` | D69204 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | device specific functions | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | clause: device_type | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | clause: extended device | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | clause: uses_allocators clause | :good:`done` | https://github.com/llvm/llvm-project/pull/157025 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | clause: in_reduction | :part:`worked on` | r308768 | 
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | omp_get_device_num() | :good:`done` | D54342,D128347 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | structure mapping of references | :none:`unclaimed` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | nested target declare | :good:`done` | D51378 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | implicitly map 'this' (this[:1]) | :good:`done` | D55982 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | allow access to the reference count (omp_target_is_present) | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | requires directive | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | clause: unified_shared_memory | :good:`done` | D52625,D52359 | 
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | clause: unified_address | :part:`partial` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | clause: reverse_offload | :part:`partial` | D52780,D155003 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | clause: atomic_default_mem_order | :good:`done` | D53513 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | clause: dynamic_allocators | :part:`unclaimed parts` | D53079 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | user-defined mappers | :good:`done` | D56326,D58638,D58523,D58074,D60972,D59474 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | map array-section with implicit mapper | :good:`done` | https://github.com/llvm/llvm-project/pull/101101 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | mapping lambda expression | :good:`done` | D51107 | 
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | clause: use_device_addr for target data | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | support close modifier on map clause | :good:`done` | D55719,D55892 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | teams construct on the host device | :good:`done` | r371553 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | support non-contiguous array sections for target update | :good:`done` | https://github.com/llvm/llvm-project/pull/144635 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | pointer attachment | :part:`being repaired` | @abhinavgaba (https://github.com/llvm/llvm-project/pull/153683) | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| atomic | hints for the atomic construct | :good:`done` | D51233 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| base language | C11 support | :good:`done` | | 
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| base language | C++11/14/17 support | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| base language | lambda support | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | array shaping | :good:`done` | D74144 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | library shutdown (omp_pause_resource[_all]) | :good:`done` | D55078 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | metadirectives | :part:`mostly done` | D91944, https://github.com/llvm/llvm-project/pull/128640 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | conditional modifier for lastprivate clause | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | iterator and multidependences | :good:`done` | | 
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | depobj directive and depobj dependency kind | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | user-defined function variants | :good:`done` | D67294, D64095, D71847, D71830, D109635 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | pointer/reference to pointer based array reductions | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | prevent new type definitions in clauses | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| memory model | memory model update (seq_cst, acq_rel, release, acquire,...) | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ + + +.. _OpenMP 51 implementation details: + +OpenMP 5.1 Implementation Details +================================= + +The following table provides a quick overview of various OpenMP 5.1 features +and their implementation status. +Please post on the +`Discourse forums (Runtimes - OpenMP category)`_ for more +information or if you want to help with the +implementation. 
+ ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +|Category | Feature | Status | Reviews | ++==============================+==============================================================+==========================+=======================================================================+ +| atomic | 'compare' clause on atomic construct | :good:`done` | D120290, D120007, D118632, D120200, D116261, D118547, D116637 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| atomic | 'fail' clause on atomic construct | :part:`worked on` | D123235 (in progress) | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| base language | C++ attribute specifier syntax | :good:`done` | D105648 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | 'present' map type modifier | :good:`done` | D83061, D83062, D84422 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | 'present' motion modifier | :good:`done` | D84711, D84712 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | 'present' in defaultmap clause | :good:`done` | D92427 | 
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | map clause reordering based on 'present' modifier | :none:`unclaimed` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | device-specific environment variables | :none:`unclaimed` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | omp_target_is_accessible routine | :good:`done` | https://github.com/llvm/llvm-project/pull/138294 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | omp_get_mapped_ptr routine | :good:`done` | D141545 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | new async target memory copy routines | :good:`done` | D136103 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | thread_limit clause on target construct | :part:`partial` | D141540 (offload), D152054 (host, in progress) | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | has_device_addr clause on target construct | :none:`unclaimed` | | 
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | iterators in map clause or motion clauses | :none:`unclaimed` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | indirect clause on declare target directive | :part:`In Progress` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | allow virtual functions calls for mapped object on device | :part:`partial` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | interop construct | :part:`partial` | parsing/sema done: D98558, D98834, D98815 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device | assorted routines for querying interoperable properties | :part:`partial` | D106674 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| loop | Loop tiling transformation | :good:`done` | D76342 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| loop | Loop unrolling transformation | :good:`done` | D99459 | 
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| loop | 'reproducible'/'unconstrained' modifiers in 'order' clause | :part:`partial` | D127855 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| memory management | alignment for allocate directive and clause | :good:`done` | D115683 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| memory management | 'allocator' modifier for allocate clause | :good:`done` | https://github.com/llvm/llvm-project/pull/114883 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| memory management | 'align' modifier for allocate clause | :good:`done` | https://github.com/llvm/llvm-project/pull/121814 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| memory management | new memory management routines | :none:`unclaimed` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| memory management | changes to omp_alloctrait_key enum | :none:`unclaimed` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| memory 
model | seq_cst clause on flush construct | :good:`done` | https://github.com/llvm/llvm-project/pull/114072 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | 'omp_all_memory' keyword and use in 'depend' clause | :good:`done` | D125828, D126321 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | error directive | :good:`done` | D139166 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | scope construct | :good:`done` | D157933, https://github.com/llvm/llvm-project/pull/109197 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | routines for controlling and querying team regions | :part:`partial` | D95003 (libomp only) | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | changes to ompt_scope_endpoint_t enum | :none:`unclaimed` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | omp_display_env routine | :good:`done` | D74956 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | extended 
OMP_PLACES syntax | :none:`unclaimed` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | OMP_NUM_TEAMS and OMP_TEAMS_THREAD_LIMIT env vars | :good:`done` | D138769 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | 'target_device' selector in context specifier | :part:`worked on` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | begin/end declare variant | :good:`done` | D71179 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | dispatch construct and function variant argument adjustment | :part:`worked on` | D99537, D99679 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | assumes directives | :part:`worked on` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | assume directive | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | nothing directive | :good:`done` | D123286 | 
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | masked construct and related combined constructs | :good:`done` | D99995, D100514, PR-121741(parallel_masked_taskloop) | +| | | | PR-121746(parallel_masked_task_loop_simd),PR-121914(masked_taskloop) | +| | | | PR-121916(masked_taskloop_simd) | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| misc | default(firstprivate) & default(private) | :good:`done` | D75591 (firstprivate), D125912 (private) | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| other | deprecating master construct | :none:`unclaimed` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| OMPT | new barrier types added to ompt_sync_region_t enum | :none:`unclaimed` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| OMPT | async data transfers added to ompt_target_data_op_t enum | :none:`unclaimed` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| OMPT | new barrier state values added to ompt_state_t enum | :none:`unclaimed` | | 
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| OMPT | new 'emi' callbacks for external monitoring interfaces | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| OMPT | device tracing interface | :none:`in progress` | jplehr | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| task | 'strict' modifier for taskloop construct | :none:`unclaimed` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| task | inoutset in depend clause | :good:`done` | D97085, D118383 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| task | nowait clause on taskwait | :part:`partial` | parsing/sema done: D131830, D141531 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ + + +.. _OpenMP 5.2 implementation details: + +OpenMP 5.2 Implementation Details +================================= + +The following table provides a quick overview of various OpenMP 5.2 features +and their implementation status. Please post on the +`Discourse forums (Runtimes - OpenMP category)`_ for more +information or if you want to help with the +implementation. 
+ + + ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +|Feature | C/C++ Status | Fortran Status | Reviews | ++=============================================================+===========================+===========================+==========================================================================+ +| omp_in_explicit_task() | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| semantics of explicit_task_var and implicit_task_var | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| ompx sentinel for C/C++ directive extensions | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| ompx prefix for clause extensions | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| if clause on teams construct | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| step modifier added | :none:`unclaimed` | :none:`unclaimed` | | 
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| declare mapper: Add iterator modifier on map clause | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| declare mapper: Add iterator modifier on map clause | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| memspace and traits modifiers to uses allocator i | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Add otherwise clause to metadirectives | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| doacross clause with support for omp_cur_iteration | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| position of interop_type in init clause on iterop | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| implicit map type for target enter/exit data | 
:none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| work OMPT type for work-sharing loop constructs | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| allocate and firstprivate on scope directive | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Change loop consistency for order clause | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Add memspace and traits modifiers to uses_allocators | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Keep original base pointer on map w/o matched candidate | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Pure procedure support for certain directives | :none:`N/A` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| ALLOCATE 
statement support for allocators | :none:`N/A` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| dispatch construct extension to support end directive | :none:`N/A` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ + + + +.. _OpenMP 5.2 Deprecations: + +OpenMP 5.2 Deprecations +======================= + + + ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| | C/C++ Status | Fortran Status | Reviews | ++=============================================================+===========================+===========================+==========================================================================+ +| Linear clause syntax | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| The minus operator | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Map clause modifiers without commas | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| The use of allocate directives with ALLOCATE statement | :good:`N/A` | :none:`unclaimed` | | 
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| uses_allocators list syntax | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| The default clause on metadirectives | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| The delimited form of the declare target directive | :none:`unclaimed` | :good:`N/A` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| The use of the to clause on the declare target directive | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| The syntax of the destroy clause on the depobj construct | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| keyword source and sink as task-dependence modifiers | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| interop types in any position on init clause of interop | 
:none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| ompd prefix usage for some ICVs | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ + +.. _OpenMP 6.0 implementation details: + +OpenMP 6.0 Implementation Details +================================= + +The following table provides a quick overview of various OpenMP 6.0 features +and their implementation status. Please post on the +`Discourse forums (Runtimes - OpenMP category)`_ for more +information or if you want to help with the +implementation. + + ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +|Feature | C/C++ Status | Fortran Status | Reviews | ++=============================================================+===========================+===========================+==========================================================================+ +| free-agent threads | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| threadset clause | :part:`partial` | :none:`unclaimed` | Parse/Sema/Codegen : https://github.com/llvm/llvm-project/pull/13580 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Recording of task graphs | :part:`in progress` | :part:`in progress` | 
clang: jtb20, flang: kparzysz | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Parallel inductions | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| init_complete for scan directive | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| loop interchange transformation | :good:`done` | :none:`unclaimed` | Clang (interchange): https://github.com/llvm/llvm-project/pull/93022 | +| | | | Clang (permutation): https://github.com/llvm/llvm-project/pull/92030 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| loop reverse transformation | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/92916 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| loop stripe transformation | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/119891 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| loop fusion transformation | :part:`in progress` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/139293 | 
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| loop index set splitting transformation                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| loop transformation apply clause                            | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| loop fuse transformation                                    | :good:`done`              | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| workdistribute construct                                    |                           | :part:`in progress`       | @skc7, @mjklemm                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| task_iteration                                              | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| memscope clause for atomic and flush                        | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| transparent clause (hull tasks)                             | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| rule-based compound directives                              | :part:`In Progress`       | :part:`In Progress`       | kparzysz                                                                 |
+|                                                             |                           |                           | Testing for Fortran missing                                              |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| C23, C++23                                                  | :none:`unclaimed`         |                           |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Fortran 2023                                                |                           | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| decl attribute for declarative directives                   | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| C attribute syntax                                          | :none:`unclaimed`         |                           |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| pure directives in DO CONCURRENT                            |                           | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| Optional argument for all clauses                           | :part:`partial`           | :part:`In Progress`       | Parse/Sema (nowait): https://github.com/llvm/llvm-project/pull/159628    |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Function references for locator list items | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| All clauses accept directive name modifier | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Extensions to depobj construct | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Extensions to atomic construct | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Private reductions | :good:`mostly` | :none:`unclaimed` | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938 | +| | | | Codegen: https://github.com/llvm/llvm-project/pull/134709 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Self maps | :part:`partial` | :none:`unclaimed` | parsing/sema done: https://github.com/llvm/llvm-project/pull/129888 | 
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Release map type for declare mapper | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Extensions to interop construct | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| no_openmp_constructs | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/125933 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| safe_sync and progress with identifier and API | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| OpenMP directives in concurrent loop regions | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/125621 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| atomics constructs on concurrent loop regions | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/125621 | 
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Loop construct with DO CONCURRENT | | :part:`In Progress` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| device_type clause for target construct | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| nowait for ancestor target directives | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| New API for devices' num_teams/thread_limit | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Host and device environment variables | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| num_threads ICV and clause accepts list | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Numeric names for environment variables | :none:`unclaimed` | :none:`unclaimed` | | 
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Increment between places for OMP_PLACES | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| OMP_AVAILABLE_DEVICES envirable | :none:`unclaimed` | :none:`unclaimed` | (should wait for "Traits for default device envirable" being done) | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Traits for default device envirable | :part:`in progress` | :none:`unclaimed` | ro-i | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Optionally omit array length expression | :good:`done` | :none:`unclaimed` | (Parse) https://github.com/llvm/llvm-project/pull/148048, | +| | | | (Sema) https://github.com/llvm/llvm-project/pull/152786 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Canonical loop sequences | :part:`in progress` | :part:`in progress` | Clang: https://github.com/llvm/llvm-project/pull/139293 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Clarifications to Fortran map semantics | :none:`unclaimed` | :none:`unclaimed` | | 
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| default clause at target construct | :part:`In Progress` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| ref count update use_device_{ptr, addr} | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Clarifications to implicit reductions | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| ref modifier for map clauses | :part:`In Progress` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| map-type modifiers in arbitrary position | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/90499 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Lift nesting restriction on concurrent loop | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/125621 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| priority clause for 
target constructs | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| changes to target_data construct | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Non-const do_not_sync for nowait/nogroup | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| need_device_addr modifier for adjust_args clause | :part:`partial` | :none:`unclaimed` | Parsing/Sema: https://github.com/llvm/llvm-project/pull/143442 | +| | | | https://github.com/llvm/llvm-project/pull/149586 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Prescriptive num_threads | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/160659 | +| | | | https://github.com/llvm/llvm-project/pull/146403 | +| | | | https://github.com/llvm/llvm-project/pull/146404 | +| | | | https://github.com/llvm/llvm-project/pull/146405 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Message and severity clauses | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/146093 | 
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Local clause on declare target | :part:`In Progress` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| groupprivate directive | :part:`In Progress` | :part:`partial` | Flang: kparzysz, mjklemm | +| | | | | +| | | | Flang parser: https://github.com/llvm/llvm-project/pull/153807 | +| | | | Flang sema: https://github.com/llvm/llvm-project/pull/154779 | +| | | | Clang parse/sema: https://github.com/llvm/llvm-project/pull/158134 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| variable-category on default clause | :good:`done` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| Changes to omp_target_is_accessible | :part:`In Progress` | :part:`In Progress` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| defaultmap implicit-behavior 'storage' | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/158336 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| defaultmap implicit-behavior 'private' | :good:`done` | :none:`unclaimed` | 
https://github.com/llvm/llvm-project/pull/158712 | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ + +.. _OpenMP 6.1 implementation details: + +OpenMP 6.1 Implementation Details (Experimental) +================================================ + +The following table provides a quick overview over various OpenMP 6.1 features +and their implementation status. Since OpenMP 6.1 has not yet been released, the +following features are experimental and are subject to change at any time. +Please post on the `Discourse forums (Runtimes - OpenMP category)`_ for more +information or if you want to help with the +implementation. + ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +|Feature | C/C++ Status | Fortran Status | Reviews | ++=============================================================+===========================+===========================+==========================================================================+ +| dyn_groupprivate clause | :part:`In Progress` | :part:`In Progress` | C/C++: kevinsala (https://github.com/llvm/llvm-project/pull/152651 | +| | | | https://github.com/llvm/llvm-project/pull/152830 | +| | | | https://github.com/llvm/llvm-project/pull/152831) | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| loop flatten transformation | :none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| loop grid/tile modifiers for sizes clause | 
:none:`unclaimed` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| attach map-type modifier | :part:`In Progress` | :none:`unclaimed` | C/C++: @abhinavgaba; | +| | | | RT: @abhinavgaba (https://github.com/llvm/llvm-project/pull/149036, | +| | | | https://github.com/llvm/llvm-project/pull/158370) | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ + + +OpenMP Extensions +================= + +The following table provides a quick overview over various OpenMP +extensions and their implementation status. These extensions are not +currently defined by any standard, so links to associated LLVM +documentation are provided. As these extensions mature, they will be +considered for standardization. Please post on the +`Discourse forums (Runtimes - OpenMP category)`_ to provide feedback. 
+ ++------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+ +|Category | Feature | Status | Reviews | ++==============================+===================================================================================+==========================+========================================================+ +| atomic extension | `'atomic' strictly nested within 'teams' | :good:`prototyped` | D126323 | +| | `_ | | | ++------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+ +| device extension | `'ompx_hold' map type modifier | :good:`prototyped` | D106509, D106510 | +| | `_ | | | ++------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+ +| device extension | `'ompx_bare' clause on 'target teams' construct | :good:`prototyped` | #66844, #70612 | +| | `_ | | | ++------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+ +| device extension | Multi-dim 'num_teams' and 'thread_limit' clause on 'target teams ompx_bare' | :good:`partial` | #99732, #101407, #102715 | +| | construct | | | ++------------------------------+-----------------------------------------------------------------------------------+--------------------------+--------------------------------------------------------+ + +.. 
_Discourse forums (Runtimes - OpenMP category): https://discourse.llvm.org/c/runtimes/openmp/35 diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index add1582344a0e..8435f367029a5 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -661,6 +661,7 @@ OpenMP Support modifier in the ``adjust_args`` clause. - Allow array length to be omitted in array section subscript expression. - Fixed non-contiguous strided update in the ``omp target update`` directive with the ``from`` clause. +- Added support for threadset clause in task and taskloop directives. - Properly handle array section/assumed-size array privatization in C/C++. - Added support to handle new syntax of the ``uses_allocators`` clause. - Added support for ``variable-category`` modifier in ``default clause``. diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index bc791e46e7c92..4f507485968cd 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -1424,6 +1424,86 @@ class OMPDefaultClause : public OMPClause { } }; +/// This represents 'threadset' clause in the '#pragma omp task ...' directive. +/// +/// \code +/// #pragma omp task threadset(omp_pool) +/// \endcode +/// In this example directive '#pragma omp task' has simple 'threadset' +/// clause with kind 'omp_pool'. +class OMPThreadsetClause final : public OMPClause { + friend class OMPClauseReader; + + /// Location of '('. + SourceLocation LParenLoc; + + /// A kind of the 'threadset' clause. + OpenMPThreadsetKind Kind = OMPC_THREADSET_unknown; + + /// Start location of the kind in source code. + SourceLocation KindLoc; + + /// Set kind of the clauses. + /// + /// \param K Argument of clause. + void setThreadsetKind(OpenMPThreadsetKind K) { Kind = K; } + + /// Set argument location. + /// + /// \param KLoc Argument location. 
+ void setThreadsetKindLoc(SourceLocation KLoc) { KindLoc = KLoc; } + +public: + /// Build 'threadset' clause with argument \a A ('omp_team' or 'omp_pool'). + /// + /// \param A Argument of the clause ('omp_team' or 'omp_pool'). + /// \param ALoc Starting location of the argument. + /// \param StartLoc Starting location of the clause. + /// \param LParenLoc Location of '('. + /// \param EndLoc Ending location of the clause. + OMPThreadsetClause(OpenMPThreadsetKind A, SourceLocation ALoc, + SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation EndLoc) + : OMPClause(llvm::omp::OMPC_threadset, StartLoc, EndLoc), + LParenLoc(LParenLoc), Kind(A), KindLoc(ALoc) {} + + /// Build an empty clause. + OMPThreadsetClause() + : OMPClause(llvm::omp::OMPC_threadset, SourceLocation(), + SourceLocation()) {} + + /// Sets the location of '('. + void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; } + + /// Returns the location of '('. + SourceLocation getLParenLoc() const { return LParenLoc; } + + /// Returns kind of the clause. + OpenMPThreadsetKind getThreadsetKind() const { return Kind; } + + /// Returns location of clause kind. + SourceLocation getThreadsetKindLoc() const { return KindLoc; } + + child_range children() { + return child_range(child_iterator(), child_iterator()); + } + + const_child_range children() const { + return const_child_range(const_child_iterator(), const_child_iterator()); + } + + child_range used_children() { + return child_range(child_iterator(), child_iterator()); + } + const_child_range used_children() const { + return const_child_range(const_child_iterator(), const_child_iterator()); + } + + static bool classof(const OMPClause *T) { + return T->getClauseKind() == llvm::omp::OMPC_threadset; + } +}; + /// This represents 'proc_bind' clause in the '#pragma omp ...' /// directive. 
/// diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 32b2b6bdb989c..8cb0a657023b4 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -3523,6 +3523,12 @@ bool RecursiveASTVisitor::VisitOMPDefaultClause(OMPDefaultClause *) { return true; } +template +bool RecursiveASTVisitor::VisitOMPThreadsetClause( + OMPThreadsetClause *) { + return true; +} + template bool RecursiveASTVisitor::VisitOMPProcBindClause(OMPProcBindClause *) { return true; diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def index 202d06fa1fcaa..328a0747a82a8 100644 --- a/clang/include/clang/Basic/OpenMPKinds.def +++ b/clang/include/clang/Basic/OpenMPKinds.def @@ -98,6 +98,9 @@ #ifndef OPENMP_ALLOCATE_MODIFIER #define OPENMP_ALLOCATE_MODIFIER(Name) #endif +#ifndef OPENMP_THREADSET_KIND +#define OPENMP_THREADSET_KIND(Name) +#endif // Static attributes for 'schedule' clause. OPENMP_SCHEDULE_KIND(static) @@ -255,6 +258,9 @@ OPENMP_DOACROSS_MODIFIER(sink) OPENMP_DOACROSS_MODIFIER(sink_omp_cur_iteration) OPENMP_DOACROSS_MODIFIER(source_omp_cur_iteration) +OPENMP_THREADSET_KIND(omp_pool) +OPENMP_THREADSET_KIND(omp_team) + #undef OPENMP_NUMTASKS_MODIFIER #undef OPENMP_NUMTHREADS_MODIFIER #undef OPENMP_GRAINSIZE_MODIFIER @@ -284,4 +290,4 @@ OPENMP_DOACROSS_MODIFIER(source_omp_cur_iteration) #undef OPENMP_DEFAULTMAP_MODIFIER #undef OPENMP_DOACROSS_MODIFIER #undef OPENMP_ALLOCATE_MODIFIER - +#undef OPENMP_THREADSET_KIND diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h index ed89a31e2684b..c9ddbcd6d46c1 100644 --- a/clang/include/clang/Basic/OpenMPKinds.h +++ b/clang/include/clang/Basic/OpenMPKinds.h @@ -250,6 +250,13 @@ enum OpenMPAllocateClauseModifier { OMPC_ALLOCATE_unknown }; +/// OpenMP modifiers for 'threadset' clause. 
+enum OpenMPThreadsetKind { +#define OPENMP_THREADSET_KIND(Name) OMPC_THREADSET_##Name, +#include "clang/Basic/OpenMPKinds.def" + OMPC_THREADSET_unknown +}; + /// Number of allowed allocate-modifiers. static constexpr unsigned NumberOfOMPAllocateClauseModifiers = OMPC_ALLOCATE_unknown; diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h index f9baeed03c347..ba12b403d9b9a 100644 --- a/clang/include/clang/Sema/SemaOpenMP.h +++ b/clang/include/clang/Sema/SemaOpenMP.h @@ -975,6 +975,12 @@ class SemaOpenMP : public SemaBase { OpenMPDefaultClauseVariableCategory VCKind, SourceLocation VCKindLoc, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc); + /// Called on well-formed 'threadset' clause. + OMPClause *ActOnOpenMPThreadsetClause(OpenMPThreadsetKind Kind, + SourceLocation KindLoc, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); /// Called on well-formed 'proc_bind' clause. OMPClause *ActOnOpenMPProcBindClause(llvm::omp::ProcBindKind Kind, SourceLocation KindLoc, diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index 791df7ee1c3d4..59d94590e04d1 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -124,6 +124,7 @@ const OMPClauseWithPreInit *OMPClauseWithPreInit::get(const OMPClause *C) { case OMPC_nowait: case OMPC_untied: case OMPC_mergeable: + case OMPC_threadset: case OMPC_threadprivate: case OMPC_groupprivate: case OMPC_flush: @@ -2035,6 +2036,13 @@ void OMPClausePrinter::VisitOMPDefaultClause(OMPDefaultClause *Node) { OS << ")"; } +void OMPClausePrinter::VisitOMPThreadsetClause(OMPThreadsetClause *Node) { + OS << "threadset(" + << getOpenMPSimpleClauseTypeName(OMPC_threadset, + unsigned(Node->getThreadsetKind())) + << ")"; +} + void OMPClausePrinter::VisitOMPProcBindClause(OMPProcBindClause *Node) { OS << "proc_bind(" << getOpenMPSimpleClauseTypeName(OMPC_proc_bind, diff --git a/clang/lib/AST/StmtProfile.cpp 
b/clang/lib/AST/StmtProfile.cpp index 05b64ccda0d01..c909e1bcecd38 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -546,6 +546,8 @@ void OMPClauseProfiler::VisitOMPNocontextClause(const OMPNocontextClause *C) { void OMPClauseProfiler::VisitOMPDefaultClause(const OMPDefaultClause *C) { } +void OMPClauseProfiler::VisitOMPThreadsetClause(const OMPThreadsetClause *C) {} + void OMPClauseProfiler::VisitOMPProcBindClause(const OMPProcBindClause *C) { } void OMPClauseProfiler::VisitOMPUnifiedAddressClause( diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp index 64b2bff063340..3d41f2d197b81 100644 --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -210,6 +210,15 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str, #define OPENMP_ALLOCATE_MODIFIER(Name) .Case(#Name, OMPC_ALLOCATE_##Name) #include "clang/Basic/OpenMPKinds.def" .Default(OMPC_ALLOCATE_unknown); + case OMPC_threadset: { + unsigned Type = llvm::StringSwitch(Str) +#define OPENMP_THREADSET_KIND(Name) .Case(#Name, OMPC_THREADSET_##Name) +#include "clang/Basic/OpenMPKinds.def" + .Default(OMPC_THREADSET_unknown); + if (LangOpts.OpenMP < 60) + return OMPC_THREADSET_unknown; + return Type; + } case OMPC_num_threads: { unsigned Type = llvm::StringSwitch(Str) #define OPENMP_NUMTHREADS_MODIFIER(Name) .Case(#Name, OMPC_NUMTHREADS_##Name) @@ -565,6 +574,16 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind, #include "clang/Basic/OpenMPKinds.def" } llvm_unreachable("Invalid OpenMP 'num_threads' clause modifier"); + case OMPC_threadset: + switch (Type) { + case OMPC_THREADSET_unknown: + return "unknown"; +#define OPENMP_THREADSET_KIND(Name) \ + case OMPC_THREADSET_##Name: \ + return #Name; +#include "clang/Basic/OpenMPKinds.def" + } + llvm_unreachable("Invalid OpenMP 'threadset' clause modifier"); case OMPC_unknown: case OMPC_threadprivate: case OMPC_groupprivate: diff --git 
a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 66fea920812c2..121de42248e3b 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -3731,6 +3731,7 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc, DestructorsFlag = 0x8, PriorityFlag = 0x20, DetachableFlag = 0x40, + FreeAgentFlag = 0x80, }; unsigned Flags = Data.Tied ? TiedFlag : 0; bool NeedsCleanup = false; @@ -3740,6 +3741,11 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc, if (NeedsCleanup) Flags = Flags | DestructorsFlag; } + if (const auto *Clause = D.getSingleClause()) { + OpenMPThreadsetKind Kind = Clause->getThreadsetKind(); + if (Kind == OMPC_THREADSET_omp_pool) + Flags = Flags | FreeAgentFlag; + } if (Data.Priority.getInt()) Flags = Flags | PriorityFlag; if (D.hasClausesOfKind()) diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 25199c739ace9..31bc941e6a015 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -3221,6 +3221,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, else Clause = ParseOpenMPSingleExprClause(CKind, WrongDirective); break; + case OMPC_threadset: case OMPC_fail: case OMPC_proc_bind: case OMPC_atomic_default_mem_order: diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 6d5cb0fcaea24..256f9521b3a7e 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -17216,6 +17216,10 @@ OMPClause *SemaOpenMP::ActOnOpenMPSimpleClause( static_cast(Argument), ArgumentLoc, StartLoc, LParenLoc, EndLoc); break; + case OMPC_threadset: + Res = ActOnOpenMPThreadsetClause(static_cast(Argument), + ArgumentLoc, StartLoc, LParenLoc, EndLoc); + break; case OMPC_if: case OMPC_final: case OMPC_num_threads: @@ -17355,6 +17359,23 @@ OMPClause *SemaOpenMP::ActOnOpenMPDefaultClause( OMPDefaultClause(M, MLoc, VCKind, VCKindLoc, StartLoc, LParenLoc, 
EndLoc); } +OMPClause *SemaOpenMP::ActOnOpenMPThreadsetClause(OpenMPThreadsetKind Kind, + SourceLocation KindLoc, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { + if (Kind == OMPC_THREADSET_unknown) { + Diag(KindLoc, diag::err_omp_unexpected_clause_value) + << getListOfPossibleValues(OMPC_threadset, /*First=*/0, + /*Last=*/unsigned(OMPC_THREADSET_unknown)) + << getOpenMPClauseName(OMPC_threadset); + return nullptr; + } + + return new (getASTContext()) + OMPThreadsetClause(Kind, KindLoc, StartLoc, LParenLoc, EndLoc); +} + OMPClause *SemaOpenMP::ActOnOpenMPProcBindClause(ProcBindKind Kind, SourceLocation KindKwLoc, SourceLocation StartLoc, diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 0c8c1d18d317e..8c20078e97a13 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -10622,6 +10622,13 @@ TreeTransform::TransformOMPDefaultClause(OMPDefaultClause *C) { C->getEndLoc()); } +template +OMPClause * +TreeTransform::TransformOMPThreadsetClause(OMPThreadsetClause *C) { + // No need to rebuild this clause, no template-dependent parameters. + return C; +} + template OMPClause * TreeTransform::TransformOMPProcBindClause(OMPProcBindClause *C) { diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index c1b5cb730e4a4..e3106f8d8e13c 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -11255,6 +11255,9 @@ OMPClause *OMPClauseReader::readClause() { case llvm::omp::OMPC_mergeable: C = new (Context) OMPMergeableClause(); break; + case llvm::omp::OMPC_threadset: + C = new (Context) OMPThreadsetClause(); + break; case llvm::omp::OMPC_read: C = new (Context) OMPReadClause(); break; @@ -11658,6 +11661,17 @@ void OMPClauseReader::VisitOMPDefaultClause(OMPDefaultClause *C) { C->setDefaultVariableCategoryLocation(Record.readSourceLocation()); } +// Read the parameter of threadset clause. 
This will have been saved when +// OMPClauseWriter is called. +void OMPClauseReader::VisitOMPThreadsetClause(OMPThreadsetClause *C) { + C->setLParenLoc(Record.readSourceLocation()); + SourceLocation ThreadsetKindLoc = Record.readSourceLocation(); + C->setThreadsetKindLoc(ThreadsetKindLoc); + OpenMPThreadsetKind TKind = + static_cast(Record.readInt()); + C->setThreadsetKind(TKind); +} + void OMPClauseReader::VisitOMPProcBindClause(OMPProcBindClause *C) { C->setProcBindKind(static_cast(Record.readInt())); C->setLParenLoc(Record.readSourceLocation()); diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 377e3966874f3..3ac338e013deb 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -7913,6 +7913,12 @@ void OMPClauseWriter::VisitOMPDefaultClause(OMPDefaultClause *C) { Record.AddSourceLocation(C->getDefaultVCLoc()); } +void OMPClauseWriter::VisitOMPThreadsetClause(OMPThreadsetClause *C) { + Record.AddSourceLocation(C->getLParenLoc()); + Record.AddSourceLocation(C->getThreadsetKindLoc()); + Record.writeEnum(C->getThreadsetKind()); +} + void OMPClauseWriter::VisitOMPProcBindClause(OMPProcBindClause *C) { Record.push_back(unsigned(C->getProcBindKind())); Record.AddSourceLocation(C->getLParenLoc()); diff --git a/clang/test/OpenMP/task_ast_print.cpp b/clang/test/OpenMP/task_ast_print.cpp index 30fb7ab75cc87..b059f187156ee 100644 --- a/clang/test/OpenMP/task_ast_print.cpp +++ b/clang/test/OpenMP/task_ast_print.cpp @@ -1,8 +1,10 @@ // RUN: %clang_cc1 -verify -Wno-vla -fopenmp -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -verify -Wno-vla -fopenmp -fopenmp-version=60 -DOMP60 -ast-print %s | FileCheck %s --check-prefix=CHECK60 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s // RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd -ast-print %s | FileCheck %s +// RUN: %clang_cc1 
-verify -Wno-vla -fopenmp-simd -fopenmp-version=60 -DOMP60 -ast-print %s | FileCheck %s --check-prefix=CHECK60 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s // RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -ast-dump %s | FileCheck %s --check-prefix=DUMP @@ -101,8 +103,8 @@ T tmain(T argc, T *argv) { a = 2; #pragma omp task default(none), private(argc, b) firstprivate(argv) shared(d) if (argc > 0) final(S::TS > 0) priority(argc) affinity(argc, argv[b:argc], arr[:], ([argc][sizeof(T)])argv) foo(); -#pragma omp taskgroup task_reduction(-: argc) -#pragma omp task if (C) mergeable priority(C) in_reduction(-: argc) +#pragma omp taskgroup task_reduction(+: argc) +#pragma omp task if (C) mergeable priority(C) in_reduction(+: argc) foo(); return 0; } @@ -119,8 +121,8 @@ T tmain(T argc, T *argv) { // CHECK-NEXT: a = 2; // CHECK-NEXT: #pragma omp task default(none) private(argc,b) firstprivate(argv) shared(d) if(argc > 0) final(S::TS > 0) priority(argc) affinity(argc,argv[b:argc],arr[:],([argc][sizeof(T)])argv) // CHECK-NEXT: foo() -// CHECK-NEXT: #pragma omp taskgroup task_reduction(-: argc) -// CHECK-NEXT: #pragma omp task if(C) mergeable priority(C) in_reduction(-: argc) +// CHECK-NEXT: #pragma omp taskgroup task_reduction(+: argc) +// CHECK-NEXT: #pragma omp task if(C) mergeable priority(C) in_reduction(+: argc) // CHECK-NEXT: foo() // CHECK: template<> int tmain(int argc, int *argv) { // CHECK-NEXT: int b = argc, c, d, e, f, g; @@ -134,8 +136,8 @@ T tmain(T argc, T *argv) { // CHECK-NEXT: a = 2; // CHECK-NEXT: #pragma omp task default(none) private(argc,b) firstprivate(argv) shared(d) if(argc > 0) final(S::TS > 0) priority(argc) affinity(argc,argv[b:argc],arr[:],([argc][sizeof(int)])argv) // CHECK-NEXT: foo() -// CHECK-NEXT: #pragma omp taskgroup task_reduction(-: argc) -// CHECK-NEXT: #pragma omp task if(5) mergeable priority(5) 
in_reduction(-: argc) +// CHECK-NEXT: #pragma omp taskgroup task_reduction(+: argc) +// CHECK-NEXT: #pragma omp task if(5) mergeable priority(5) in_reduction(+: argc) // CHECK-NEXT: foo() // CHECK: template<> long tmain(long argc, long *argv) { // CHECK-NEXT: long b = argc, c, d, e, f, g; @@ -149,8 +151,8 @@ T tmain(T argc, T *argv) { // CHECK-NEXT: a = 2; // CHECK-NEXT: #pragma omp task default(none) private(argc,b) firstprivate(argv) shared(d) if(argc > 0) final(S::TS > 0) priority(argc) affinity(argc,argv[b:argc],arr[:],([argc][sizeof(long)])argv) // CHECK-NEXT: foo() -// CHECK-NEXT: #pragma omp taskgroup task_reduction(-: argc) -// CHECK-NEXT: #pragma omp task if(1) mergeable priority(1) in_reduction(-: argc) +// CHECK-NEXT: #pragma omp taskgroup task_reduction(+: argc) +// CHECK-NEXT: #pragma omp task if(1) mergeable priority(1) in_reduction(+: argc) // CHECK-NEXT: foo() enum Enum {}; @@ -199,6 +201,14 @@ int main(int argc, char **argv) { #pragma omp task depend(inout: omp_all_memory) foo(); // CHECK-NEXT: foo(); +#ifdef OMP60 +#pragma omp task threadset(omp_pool) +#pragma omp task threadset(omp_team) + foo(); +#endif + // CHECK60: #pragma omp task threadset(omp_pool) + // CHECK60: #pragma omp task threadset(omp_team) + // CHECK60-NEXT: foo(); return tmain(b, &b) + tmain(x, &x); } diff --git a/clang/test/OpenMP/task_codegen.cpp b/clang/test/OpenMP/task_codegen.cpp index c3e6d9e6b1cf7..ba8e6945de9d0 100644 --- a/clang/test/OpenMP/task_codegen.cpp +++ b/clang/test/OpenMP/task_codegen.cpp @@ -41,6 +41,9 @@ // RUN: -emit-llvm -o - -DOMP51 | FileCheck %s \ // RUN: --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -verify -Wno-vla -triple x86_64-apple-darwin10 -fopenmp -fopenmp-version=60 -DOMP60 -fopenmp-enable-irbuilder -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK6 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=60 -DOMP60 -fopenmp-enable-irbuilder -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp 
-fopenmp-version=60 -DOMP60 -fopenmp-enable-irbuilder -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK6 // expected-no-diagnostics #ifndef HEADER @@ -65,6 +68,7 @@ struct S { S(const S &s) : a(s.a) {} ~S() {} }; + int a; int main() { char b; @@ -147,6 +151,7 @@ int main() { + // s1 = S(); @@ -215,6 +220,19 @@ void test_omp_all_memory() } } #endif // OMP51 + +#ifdef OMP60 +void test_threadset() +{ +#pragma omp task threadset(omp_team) + { + } +#pragma omp task threadset(omp_pool) + { + } +} +#endif // OMP60 + #endif // CHECK1-LABEL: define {{[^@]+}}@main // CHECK1-SAME: () #[[ATTR0:[0-9]+]] { @@ -10243,3 +10261,18 @@ void test_omp_all_memory() // CHECK4-51-NEXT: call void @__cxx_global_var_init() // CHECK4-51-NEXT: ret void // +// CHECK6-LABEL: define void @_Z14test_threadsetv() +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON_23:%.*]], align 1 +// CHECK6-NEXT: [[AGG_CAPTURED2:%.*]] = alloca [[STRUCT_ANON_25:%.*]], align 1 +// CHECK6-NEXT: call i32 @__kmpc_global_thread_num(ptr @[[GLOB_PTR:[0-9]+]]) +// CHECK6-NEXT: [[TMP0:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %omp_global_thread_num, i32 1, i64 40, i64 1, ptr @.omp_task_entry..[[ENTRY1:[0-9]+]]) +// CHECK6-NEXT: getelementptr inbounds nuw %struct.kmp_task_t_with_privates{{.*}}, ptr %0, i32 0, i32 0 +// CHECK6-NEXT: call i32 @__kmpc_global_thread_num(ptr @[[GLOB_PTR:[0-9]+]]) +// CHECK6-NEXT: call i32 @__kmpc_omp_task(ptr @1, i32 %omp_global_thread_num1, ptr %0) +// CHECK6-NEXT: call i32 @__kmpc_global_thread_num(ptr @[[GLOB_PTR2:[0-9]+]]) +// CHECK6-NEXT: [[TMP3:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %omp_global_thread_num3, i32 129, i64 40, i64 1, ptr @.omp_task_entry..[[ENTRY2:[0-9]+]]) +// CHECK6-NEXT: getelementptr inbounds nuw %struct.kmp_task_t_with_privates{{.*}}, ptr %3, i32 0, i32 0 +// CHECK6-NEXT: call i32 @__kmpc_global_thread_num(ptr @[[GLOB_PTR2:[0-9]+]]) +// 
CHECK6-NEXT: call i32 @__kmpc_omp_task(ptr @1, i32 %omp_global_thread_num4, ptr %3) +// CHECK6-NEXT: ret void diff --git a/clang/test/OpenMP/task_threadset_messages.cpp b/clang/test/OpenMP/task_threadset_messages.cpp new file mode 100755 index 0000000000000..f553a2da17ab9 --- /dev/null +++ b/clang/test/OpenMP/task_threadset_messages.cpp @@ -0,0 +1,99 @@ +// RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -std=c++11 -ferror-limit 200 -o - %s +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -std=c++11 -ferror-limit 200 -o - %s +// RUN: %clang_cc1 -verify=expected,omp51 -fopenmp -fopenmp-version=51 -std=c++11 -ferror-limit 200 -o - %s +// RUN: %clang_cc1 -verify=expected -DOMP60 -fopenmp -fopenmp-version=60 -std=c++11 -ferror-limit 200 -o - %s + +// RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -std=c++11 -ferror-limit 200 -o - %s +// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -std=c++11 -ferror-limit 200 -o - %s +// RUN: %clang_cc1 -verify=expected,omp51 -fopenmp-simd -fopenmp-version=51 -std=c++11 -ferror-limit 200 -o - %s +// RUN: %clang_cc1 -verify=expected -DOMP60 -fopenmp-simd -fopenmp-version=60 -std=c++11 -ferror-limit 200 -o - %s + +#ifdef OMP60 +struct ComplexStruct { + int data[10]; + struct InnerStruct { + float value; + } inner; +}; + +// Template class with member functions using 'threadset'. +template +class TemplateClass { +public: + void foo() { + #pragma omp task threadset(omp_pool) + { + T temp; + } + } + void bar() { + #pragma omp taskloop threadset(omp_team) + for (int i = 0; i < 10; ++i) {} + } +}; + +// Valid uses of 'threadset' with 'omp_pool' and 'omp_team' in task directive. 
+void test_task_threadset_valid() { + int a; + #pragma omp task threadset(omp_pool) + #pragma omp task threadset(omp_team) + #pragma omp task threadset(omp_pool) if(1) + #pragma omp task threadset(omp_team) priority(5) + #pragma omp task threadset(omp_pool) depend(out: a) + #pragma omp parallel + { + #pragma omp task threadset(omp_pool) + { + #pragma omp taskloop threadset(omp_team) + for (int i = 0; i < 5; ++i) {} + } + } + + TemplateClass obj; + obj.foo(); + obj.bar(); +} + +// Invalid uses of 'threadset' with incorrect arguments in task directive. +void test_task_threadset_invalid_args() { + #pragma omp task threadset(invalid_arg) // expected-error {{expected 'omp_pool' or 'omp_team' in OpenMP clause 'threadset'}} + #pragma omp task threadset(123) // expected-error {{expected 'omp_pool' or 'omp_team' in OpenMP clause 'threadset'}} + #pragma omp task threadset(omp_pool, omp_team) // expected-error {{expected ')'}} expected-note {{to match this '('}} + #pragma omp task threadset() // expected-error {{expected 'omp_pool' or 'omp_team' in OpenMP clause 'threadset'}} + {} +} + +// Valid uses of 'threadset' with 'omp_pool' and 'omp_team' in taskloop directive. +void test_taskloop_threadset_valid() { + #pragma omp taskloop threadset(omp_pool) + for (int i = 0; i < 10; ++i) {} + #pragma omp taskloop threadset(omp_team) + for (int i = 0; i < 10; ++i) {} + #pragma omp taskloop threadset(omp_pool) grainsize(5) + for (int i = 0; i < 10; ++i) {} + #pragma omp taskloop threadset(omp_team) num_tasks(2) + for (int i = 0; i < 10; ++i) {} +} + +// Invalid uses of 'threadset' with incorrect arguments in taskloop directive. 
+void test_taskloop_threadset_invalid_args() { + #pragma omp taskloop threadset(invalid_arg) // expected-error {{expected 'omp_pool' or 'omp_team' in OpenMP clause 'threadset'}} + for (int i = 0; i < 10; ++i) {} + #pragma omp taskloop threadset(123) // expected-error {{expected 'omp_pool' or 'omp_team' in OpenMP clause 'threadset'}} + for (int i = 0; i < 10; ++i) {} + #pragma omp taskloop threadset(omp_pool, omp_team) // expected-error {{expected ')'}} expected-note {{to match this '('}} + for (int i = 0; i < 10; ++i) {} + #pragma omp taskloop threadset() // expected-error {{expected 'omp_pool' or 'omp_team' in OpenMP clause 'threadset'}} + for (int i = 0; i < 10; ++i) {} +} + +#else +void test_threadset_not_supported() { + #pragma omp task threadset(omp_pool) // omp45-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp task'}} omp50-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp task'}} omp51-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp task'}} + #pragma omp task threadset(omp_team) // omp45-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp task'}} omp50-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp task'}} omp51-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp task'}} + #pragma omp taskloop threadset(omp_team) // omp45-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp taskloop'}} omp50-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp taskloop'}} omp51-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp taskloop'}} + for (int i = 0; i < 10; ++i) {} + #pragma omp taskloop threadset(omp_pool) // omp45-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp taskloop'}} omp50-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp taskloop'}} omp51-error {{unexpected OpenMP clause 'threadset' in directive '#pragma omp taskloop'}} + for (int i 
= 0; i < 10; ++i) {} +} +#endif diff --git a/clang/test/OpenMP/taskloop_ast_print.cpp b/clang/test/OpenMP/taskloop_ast_print.cpp index 1b6d7240fa66c..e4bf20af5d78e 100644 --- a/clang/test/OpenMP/taskloop_ast_print.cpp +++ b/clang/test/OpenMP/taskloop_ast_print.cpp @@ -1,8 +1,10 @@ // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=60 -DOMP60 -ast-print %s | FileCheck %s --check-prefix=CHECK60 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=60 -DOMP60 -ast-print %s | FileCheck %s --check-prefix=CHECK60 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s // expected-no-diagnostics @@ -87,6 +89,20 @@ int main(int argc, char **argv) { // CHECK-NEXT: #pragma omp cancel taskgroup // CHECK-NEXT: #pragma omp cancellation point taskgroup // CHECK-NEXT: foo(); +#ifdef OMP60 +#pragma omp taskloop threadset(omp_team) + for (int i = 0; i < 10; ++i) { +#pragma omp taskloop threadset(omp_pool) + for (int j = 0; j < 10; ++j) { + foo(); + } +} +#endif + // CHECK60: #pragma omp taskloop threadset(omp_team) + // CHECK60-NEXT: for (int i = 0; i < 10; ++i) { + // CHECK60: #pragma omp taskloop threadset(omp_pool) + // CHECK60-NEXT: for (int j = 0; j < 10; ++j) { + // CHECK60-NEXT: foo(); return (tmain(argc) + tmain(argv[0][0])); } diff --git a/clang/test/OpenMP/taskloop_codegen.cpp b/clang/test/OpenMP/taskloop_codegen.cpp index 69f8d3b160bfd..d1197607a2684 100644 --- a/clang/test/OpenMP/taskloop_codegen.cpp +++ b/clang/test/OpenMP/taskloop_codegen.cpp @@ -5,7 +5,12 @@ // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -x c++ -emit-llvm %s -o - | 
FileCheck --check-prefix SIMD-ONLY0 %s // RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s + // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} + +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-version=60 -DOMP60 -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK6 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=60 -DOMP60 -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=60 -DOMP60 -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK6 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -241,4 +246,52 @@ void taskloop_with_class() { } } +#ifdef OMP60 +void test_threadset() +{ +#pragma omp taskloop threadset(omp_team) + for (int i = 0; i < 10; ++i) { + } +#pragma omp taskloop threadset(omp_pool) + for (int i = 0; i < 10; ++i) { + } +} +#endif // OMP60 +// CHECK6-LABEL: define void @_Z14test_threadsetv() +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON_14:%.*]], align 1 +// CHECK6-NEXT: %[[TMP:.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[AGG_CAPTURED1:%.*]] = alloca [[STRUCT_ANON_16:%.*]], align 1 +// CHECK6-NEXT: %[[TMP2:.*]] = alloca i32, align 4 +// CHECK6-NEXT: %[[TID0:.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB_PTR:[0-9]+]]) +// CHECK6-NEXT: call void @__kmpc_taskgroup(ptr @1, i32 %[[TID0:.*]]) +// CHECK6-NEXT: %[[TID1:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[TID0:.*]], i32 1, i64 80, i64 1, ptr @.omp_task_entry..[[ENTRY1:[0-9]+]]) +// CHECK6-NEXT: %[[TID2:.*]] = getelementptr inbounds nuw %struct.kmp_task_t_with_privates{{.*}}, ptr %[[TID1:.*]], i32 0, i32 0 +// CHECK6-NEXT: %[[TID3:.*]] = getelementptr inbounds nuw %struct.kmp_task_t{{.*}}, ptr %[[TID2:.*]], i32 0, i32 5 
+// CHECK6-NEXT: store i64 0, ptr %[[TID3:.*]], align 8 +// CHECK6-NEXT: %[[TID4:.*]] = getelementptr inbounds nuw %struct.kmp_task_t{{.*}}, ptr %[[TID2:.*]], i32 0, i32 6 +// CHECK6-NEXT: store i64 9, ptr %[[TID4:.*]], align 8 +// CHECK6-NEXT: %[[TID5:.*]] = getelementptr inbounds nuw %struct.kmp_task_t{{.*}}, ptr %[[TID2:.*]], i32 0, i32 7 +// CHECK6-NEXT: store i64 1, ptr %[[TID5:.*]], align 8 +// CHECK6-NEXT: %[[TID6:.*]] = getelementptr inbounds nuw %struct.kmp_task_t{{.*}}, ptr %[[TID2:.*]], i32 0, i32 9 +// CHECK6-NEXT: call void @llvm.memset.p0.i64(ptr align 8 %[[TID6:.*]], i8 0, i64 8, i1 false) +// CHECK6-NEXT: %[[TID7:.*]] = load i64, ptr %[[TID5:.*]], align 8 +// CHECK6-NEXT: call void @__kmpc_taskloop(ptr @1, i32 %[[TID0:.*]], ptr %[[TID1:.*]], i32 1, ptr %[[TID3:.*]], ptr %4, i64 %[[TID7:.*]], i32 1, i32 0, i64 0, ptr null) +// CHECK6-NEXT: call void @__kmpc_end_taskgroup(ptr @1, i32 %[[TID0:.*]]) +// CHECK6-NEXT: call void @__kmpc_taskgroup(ptr @1, i32 %[[TID0:.*]]) +// CHECK6-NEXT: %[[TID8:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[TID0:.*]], i32 129, i64 80, i64 1, ptr @.omp_task_entry..[[ENTRY1:[0-9]+]]) +// CHECK6-NEXT: %[[TID9:.*]] = getelementptr inbounds nuw %struct.kmp_task_t_with_privates{{.*}}, ptr %[[TID8:.*]], i32 0, i32 0 +// CHECK6-NEXT: %[[TID10:.*]] = getelementptr inbounds nuw %struct.kmp_task_t{{.*}}, ptr %[[TID9:.*]], i32 0, i32 5 +// CHECK6-NEXT: store i64 0, ptr %[[TID10:.*]], align 8 +// CHECK6-NEXT: %[[TID11:.*]] = getelementptr inbounds nuw %struct.kmp_task_t{{.*}}, ptr %[[TID9:.*]], i32 0, i32 6 +// CHECK6-NEXT: store i64 9, ptr %[[TID11:.*]], align 8 +// CHECK6-NEXT: %[[TID12:.*]] = getelementptr inbounds nuw %struct.kmp_task_t{{.*}}, ptr %[[TID9:.*]], i32 0, i32 7 +// CHECK6-NEXT: store i64 1, ptr %[[TID12:.*]], align 8 +// CHECK6-NEXT: %[[TID13:.*]] = getelementptr inbounds nuw %struct.kmp_task_t{{.*}}, ptr %[[TID9:.*]], i32 0, i32 9 +// CHECK6-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TID13:.*]], i8 0, 
i64 8, i1 false) +// CHECK6-NEXT: %[[TID14:.*]] = load i64, ptr [[TID12:.*]], align 8 +// CHECK6-NEXT: call void @__kmpc_taskloop(ptr @1, i32 %[[TID0:.*]], ptr %[[TID8:.*]], i32 1, ptr %[[TID10:.*]], ptr %[[TID11:.*]], i64 %[[TID14:.*]], i32 1, i32 0, i64 0, ptr null) +// CHECK6-NEXT: call void @__kmpc_end_taskgroup(ptr @1, i32 %[[TID0:.*]]) +// CHECK6-NEXT: ret void + #endif diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index fc27fd29da933..08776d9bcabfc 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -2406,6 +2406,8 @@ void OMPClauseEnqueue::VisitOMPCompareClause(const OMPCompareClause *) {} void OMPClauseEnqueue::VisitOMPFailClause(const OMPFailClause *) {} +void OMPClauseEnqueue::VisitOMPThreadsetClause(const OMPThreadsetClause *) {} + void OMPClauseEnqueue::VisitOMPAbsentClause(const OMPAbsentClause *) {} void OMPClauseEnqueue::VisitOMPHoldsClause(const OMPHoldsClause *) {} diff --git a/flang/include/flang/Lower/OpenMP/Clauses.h b/flang/include/flang/Lower/OpenMP/Clauses.h index 74924661d9a03..688d01704370d 100644 --- a/flang/include/flang/Lower/OpenMP/Clauses.h +++ b/flang/include/flang/Lower/OpenMP/Clauses.h @@ -294,6 +294,7 @@ using Permutation = tomp::clause::PermutationT; using TaskReduction = tomp::clause::TaskReductionT; using ThreadLimit = tomp::clause::ThreadLimitT; using Threads = tomp::clause::ThreadsT; +using Threadset = tomp::clause::ThreadsetT; using Transparent = tomp::clause::TransparentT; using To = tomp::clause::ToT; using UnifiedAddress = tomp::clause::UnifiedAddressT; diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index bb970691c85c9..a7398a4ef970f 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -685,6 +685,8 @@ class ParseTreeDumper { NODE_ENUM(OmpTaskDependenceType, Value) NODE(parser, OmpTaskReductionClause) NODE(OmpTaskReductionClause, Modifier) + 
NODE(parser, OmpThreadsetClause) + NODE_ENUM(OmpThreadsetClause, ThreadsetPolicy) NODE(parser, OmpToClause) NODE(OmpToClause, Modifier) NODE(parser, OmpTraitProperty) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index c3a8c2eab15f2..375790af90b74 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -4825,6 +4825,14 @@ struct OmpTaskReductionClause { std::tuple t; }; +// Ref: [6.0:442] +// threadset-clause -> +// THREADSET(omp_pool|omp_team) +struct OmpThreadsetClause { + ENUM_CLASS(ThreadsetPolicy, Omp_Pool, Omp_Team) + WRAPPER_CLASS_BOILERPLATE(OmpThreadsetClause, ThreadsetPolicy); +}; + // Ref: [4.5:107-109], [5.0:176-180], [5.1:205-210], [5.2:167-168] // // to-clause (in DECLARE TARGET) -> diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index d39f9dda92a28..0f60b47991004 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -1482,6 +1482,21 @@ ThreadLimit make(const parser::OmpClause::ThreadLimit &inp, return ThreadLimit{/*Threadlim=*/makeExpr(inp.v, semaCtx)}; } +Threadset make(const parser::OmpClause::Threadset &inp, + semantics::SemanticsContext &semaCtx) { + // inp.v -> parser::OmpThreadsetClause + using wrapped = parser::OmpThreadsetClause; + + CLAUSET_ENUM_CONVERT( // + convert, wrapped::ThreadsetPolicy, Threadset::ThreadsetPolicy, + // clang-format off + MS(Omp_Pool, Omp_Pool) + MS(Omp_Team, Omp_Team) + // clang-format on + ); + return Threadset{/*ThreadsetPolicy=*/convert(inp.v.v)}; +} + // Threadprivate: empty // Threads: empty diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index e094458f001e3..aaaf1ec5d4626 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -3390,6 +3390,7 @@ CHECK_SIMPLE_CLAUSE(Read, OMPC_read) CHECK_SIMPLE_CLAUSE(Threadprivate, OMPC_threadprivate) 
CHECK_SIMPLE_CLAUSE(Groupprivate, OMPC_groupprivate) CHECK_SIMPLE_CLAUSE(Threads, OMPC_threads) +CHECK_SIMPLE_CLAUSE(Threadset, OMPC_threadset) CHECK_SIMPLE_CLAUSE(Inbranch, OMPC_inbranch) CHECK_SIMPLE_CLAUSE(Link, OMPC_link) CHECK_SIMPLE_CLAUSE(Indirect, OMPC_indirect) diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h index 87b95200b2459..d7f0e3a3d49da 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h @@ -1167,6 +1167,14 @@ struct ThreadsT { using EmptyTrait = std::true_type; }; +// V6.0: [14.8] `threadset` clause +template // +struct ThreadsetT { + ENUM(ThreadsetPolicy, Omp_Pool, Omp_Team); + using WrapperTrait = std::true_type; + ThreadsetPolicy v; +}; + // V5.2: [5.9.1] `to` clause template // struct ToT { @@ -1352,9 +1360,9 @@ using WrapperClausesT = std::variant< ProcBindT, ReverseOffloadT, SafelenT, SelfMapsT, SeverityT, SharedT, SimdlenT, SizesT, PermutationT, ThreadLimitT, - UnifiedAddressT, UnifiedSharedMemoryT, UniformT, - UpdateT, UseDeviceAddrT, UseDevicePtrT, - UsesAllocatorsT>; + ThreadsetT, UnifiedAddressT, + UnifiedSharedMemoryT, UniformT, UpdateT, + UseDeviceAddrT, UseDevicePtrT, UsesAllocatorsT>; template using UnionOfAllClausesT = typename type::Union< // diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index 61a1a05f6e904..208609f64f418 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -539,6 +539,10 @@ def OMPC_GroupPrivate : Clause<[Spelling<"groupprivate">]> { def OMPC_Threads : Clause<[Spelling<"threads">]> { let clangClass = "OMPThreadsClause"; } +def OMPC_Threadset : Clause<[Spelling<"threadset">]> { + let clangClass = "OMPThreadsetClause"; + let flangClass = "OmpThreadsetClause"; +} def OMPC_To : Clause<[Spelling<"to">]> { let clangClass = "OMPToClause"; let flangClass = "OmpToClause"; @@ -1254,6 +1258,7 @@ def OMP_Task : 
Directive<[Spelling<"task">]> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, ]; @@ -1297,6 +1302,7 @@ def OMP_TaskLoop : Directive<[Spelling<"taskloop">]> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, ]; From 61d1bd980814db50c917a5e2a25dae5a261d4fb2 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 30 Oct 2025 10:26:27 +0000 Subject: [PATCH 175/539] Revert "[lldb-dap] Improving consistency of tests by removing concurrency." (#165688) Reverts llvm/llvm-project#165496 Due to flaky failures on Arm 32-bit since this change. Detailed in https://github.com/llvm/llvm-project/pull/165496#issuecomment-3467209089. --- .../test/tools/lldb-dap/dap_server.py | 206 +++++++++++------- .../test/tools/lldb-dap/lldbdap_testcase.py | 2 +- .../TestDAP_breakpointEvents.py | 30 +-- .../tools/lldb-dap/launch/TestDAP_launch.py | 2 +- .../module-event/TestDAP_module_event.py | 88 ++++---- .../tools/lldb-dap/module/TestDAP_module.py | 8 +- .../restart/TestDAP_restart_console.py | 24 +- .../lldb-dap/send-event/TestDAP_sendEvent.py | 2 +- 8 files changed, 203 insertions(+), 159 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index 8f3652172dfdf..d892c01f0bc71 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -10,8 +10,8 @@ import subprocess import signal import sys +import threading import warnings -import selectors import time from typing import ( Any, @@ -139,6 +139,35 @@ def dump_memory(base_addr, data, num_per_line, outfile): outfile.write("\n") +def read_packet( + f: IO[bytes], trace_file: Optional[IO[str]] = None +) -> Optional[ProtocolMessage]: + """Decode a JSON packet that starts with the content length and is + followed by the JSON bytes from a file 
'f'. Returns None on EOF. + """ + line = f.readline().decode("utf-8") + if len(line) == 0: + return None # EOF. + + # Watch for line that starts with the prefix + prefix = "Content-Length: " + if line.startswith(prefix): + # Decode length of JSON bytes + length = int(line[len(prefix) :]) + # Skip empty line + separator = f.readline().decode() + if separator != "": + Exception("malformed DAP content header, unexpected line: " + separator) + # Read JSON bytes + json_str = f.read(length).decode() + if trace_file: + trace_file.write("from adapter:\n%s\n" % (json_str)) + # Decode the JSON bytes into a python dictionary + return json.loads(json_str) + + raise Exception("unexpected malformed message from lldb-dap: " + line) + + def packet_type_is(packet, packet_type): return "type" in packet and packet["type"] == packet_type @@ -170,8 +199,16 @@ def __init__( self.log_file = log_file self.send = send self.recv = recv - self.selector = selectors.DefaultSelector() - self.selector.register(recv, selectors.EVENT_READ) + + # Packets that have been received and processed but have not yet been + # requested by a test case. + self._pending_packets: List[Optional[ProtocolMessage]] = [] + # Received packets that have not yet been processed. + self._recv_packets: List[Optional[ProtocolMessage]] = [] + # Used as a mutex for _recv_packets and for notify when _recv_packets + # changes. 
+ self._recv_condition = threading.Condition() + self._recv_thread = threading.Thread(target=self._read_packet_thread) # session state self.init_commands = init_commands @@ -197,6 +234,9 @@ def __init__( # keyed by breakpoint id self.resolved_breakpoints: dict[str, Breakpoint] = {} + # trigger enqueue thread + self._recv_thread.start() + @classmethod def encode_content(cls, s: str) -> bytes: return ("Content-Length: %u\r\n\r\n%s" % (len(s), s)).encode("utf-8") @@ -212,46 +252,17 @@ def validate_response(cls, command, response): f"seq mismatch in response {command['seq']} != {response['request_seq']}" ) - def _read_packet( - self, - timeout: float = DEFAULT_TIMEOUT, - ) -> Optional[ProtocolMessage]: - """Decode a JSON packet that starts with the content length and is - followed by the JSON bytes from self.recv. Returns None on EOF. - """ - - ready = self.selector.select(timeout) - if not ready: - warnings.warn( - "timeout occurred waiting for a packet, check if the test has a" - " negative assertion and see if it can be inverted.", - stacklevel=4, - ) - return None # timeout - - line = self.recv.readline().decode("utf-8") - if len(line) == 0: - return None # EOF. - - # Watch for line that starts with the prefix - prefix = "Content-Length: " - if line.startswith(prefix): - # Decode length of JSON bytes - length = int(line[len(prefix) :]) - # Skip empty line - separator = self.recv.readline().decode() - if separator != "": - Exception("malformed DAP content header, unexpected line: " + separator) - # Read JSON bytes - json_str = self.recv.read(length).decode() - if self.trace_file: - self.trace_file.write( - "%s from adapter:\n%s\n" % (time.time(), json_str) - ) - # Decode the JSON bytes into a python dictionary - return json.loads(json_str) - - raise Exception("unexpected malformed message from lldb-dap: " + line) + def _read_packet_thread(self): + try: + while True: + packet = read_packet(self.recv, trace_file=self.trace_file) + # `packet` will be `None` on EOF. 
We want to pass it down to + # handle_recv_packet anyway so the main thread can handle unexpected + # termination of lldb-dap and stop waiting for new packets. + if not self._handle_recv_packet(packet): + break + finally: + dump_dap_log(self.log_file) def get_modules( self, start_module: Optional[int] = None, module_count: Optional[int] = None @@ -299,6 +310,34 @@ def collect_output( output += self.get_output(category, clear=clear) return output + def _enqueue_recv_packet(self, packet: Optional[ProtocolMessage]): + with self.recv_condition: + self.recv_packets.append(packet) + self.recv_condition.notify() + + def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool: + """Handles an incoming packet. + + Called by the read thread that is waiting for all incoming packets + to store the incoming packet in "self._recv_packets" in a thread safe + way. This function will then signal the "self._recv_condition" to + indicate a new packet is available. + + Args: + packet: A new packet to store. + + Returns: + True if the caller should keep calling this function for more + packets. + """ + with self._recv_condition: + self._recv_packets.append(packet) + self._recv_condition.notify() + # packet is None on EOF + return packet is not None and not ( + packet["type"] == "response" and packet["command"] == "disconnect" + ) + def _recv_packet( self, *, @@ -322,34 +361,46 @@ def _recv_packet( The first matching packet for the given predicate, if specified, otherwise None. 
""" - deadline = time.time() + timeout - - while time.time() < deadline: - packet = self._read_packet(timeout=deadline - time.time()) - if packet is None: - return None - self._process_recv_packet(packet) - if not predicate or predicate(packet): - return packet - - def _process_recv_packet(self, packet) -> None: + assert ( + threading.current_thread != self._recv_thread + ), "Must not be called from the _recv_thread" + + def process_until_match(): + self._process_recv_packets() + for i, packet in enumerate(self._pending_packets): + if packet is None: + # We need to return a truthy value to break out of the + # wait_for, use `EOFError` as an indicator of EOF. + return EOFError() + if predicate and predicate(packet): + self._pending_packets.pop(i) + return packet + + with self._recv_condition: + packet = self._recv_condition.wait_for(process_until_match, timeout) + return None if isinstance(packet, EOFError) else packet + + def _process_recv_packets(self) -> None: """Process received packets, updating the session state.""" - if packet and ("seq" not in packet or packet["seq"] == 0): - warnings.warn( - f"received a malformed packet, expected 'seq != 0' for {packet!r}" - ) - # Handle events that may modify any stateful properties of - # the DAP session. - if packet and packet["type"] == "event": - self._handle_event(packet) - elif packet and packet["type"] == "request": - # Handle reverse requests and keep processing. - self._handle_reverse_request(packet) + with self._recv_condition: + for packet in self._recv_packets: + if packet and ("seq" not in packet or packet["seq"] == 0): + warnings.warn( + f"received a malformed packet, expected 'seq != 0' for {packet!r}" + ) + # Handle events that may modify any stateful properties of + # the DAP session. + if packet and packet["type"] == "event": + self._handle_event(packet) + elif packet and packet["type"] == "request": + # Handle reverse requests and keep processing. 
+ self._handle_reverse_request(packet) + # Move the packet to the pending queue. + self._pending_packets.append(packet) + self._recv_packets.clear() def _handle_event(self, packet: Event) -> None: """Handle any events that modify debug session state we track.""" - self.events.append(packet) - event = packet["event"] body: Optional[Dict] = packet.get("body", None) @@ -402,8 +453,6 @@ def _handle_event(self, packet: Event) -> None: self.invalidated_event = packet elif event == "memory": self.memory_event = packet - elif event == "module": - self.module_events.append(packet) def _handle_reverse_request(self, request: Request) -> None: if request in self.reverse_requests: @@ -472,14 +521,18 @@ def send_packet(self, packet: ProtocolMessage) -> int: Returns the seq number of the request. """ - packet["seq"] = self.sequence - self.sequence += 1 + # Set the seq for requests. + if packet["type"] == "request": + packet["seq"] = self.sequence + self.sequence += 1 + else: + packet["seq"] = 0 # Encode our command dictionary as a JSON string json_str = json.dumps(packet, separators=(",", ":")) if self.trace_file: - self.trace_file.write("%s to adapter:\n%s\n" % (time.time(), json_str)) + self.trace_file.write("to adapter:\n%s\n" % (json_str)) length = len(json_str) if length > 0: @@ -860,8 +913,6 @@ def request_restart(self, restartArguments=None): if restartArguments: command_dict["arguments"] = restartArguments - # Clear state, the process is about to restart... - self._process_continued(True) response = self._send_recv(command_dict) # Caller must still call wait_for_stopped. 
return response @@ -1428,10 +1479,8 @@ def request_testGetTargetBreakpoints(self): def terminate(self): self.send.close() - self.recv.close() - self.selector.close() - if self.log_file: - dump_dap_log(self.log_file) + if self._recv_thread.is_alive(): + self._recv_thread.join() def request_setInstructionBreakpoints(self, memory_reference=[]): breakpoints = [] @@ -1528,7 +1577,6 @@ def launch( stdout=subprocess.PIPE, stderr=sys.stderr, env=adapter_env, - bufsize=0, ) if connection is None: diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index fd07324d2ddda..29935bb8046ff 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -416,7 +416,7 @@ def continue_to_next_stop(self): return self.dap_server.wait_for_stopped() def continue_to_breakpoint(self, breakpoint_id: str): - self.continue_to_breakpoints([breakpoint_id]) + self.continue_to_breakpoints((breakpoint_id)) def continue_to_breakpoints(self, breakpoint_ids): self.do_continue() diff --git a/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py b/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py index 7b78541fb4f8e..beab4d6c1f5a6 100644 --- a/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py +++ b/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py @@ -81,20 +81,24 @@ def test_breakpoint_events(self): breakpoint["verified"], "expect foo breakpoint to not be verified" ) + # Flush the breakpoint events. 
+ self.dap_server.wait_for_breakpoint_events() + # Continue to the breakpoint - self.continue_to_breakpoint(foo_bp_id) - self.continue_to_next_stop() # foo_bp2 - self.continue_to_breakpoint(main_bp_id) - self.continue_to_exit() + self.continue_to_breakpoints(dap_breakpoint_ids) - bp_events = [e for e in self.dap_server.events if e["event"] == "breakpoint"] + verified_breakpoint_ids = [] + unverified_breakpoint_ids = [] + for breakpoint_event in self.dap_server.wait_for_breakpoint_events(): + breakpoint = breakpoint_event["body"]["breakpoint"] + id = breakpoint["id"] + if breakpoint["verified"]: + verified_breakpoint_ids.append(id) + else: + unverified_breakpoint_ids.append(id) - main_bp_events = [ - e for e in bp_events if e["body"]["breakpoint"]["id"] == main_bp_id - ] - foo_bp_events = [ - e for e in bp_events if e["body"]["breakpoint"]["id"] == foo_bp_id - ] + self.assertIn(main_bp_id, unverified_breakpoint_ids) + self.assertIn(foo_bp_id, unverified_breakpoint_ids) - self.assertTrue(main_bp_events) - self.assertTrue(foo_bp_events) + self.assertIn(main_bp_id, verified_breakpoint_ids) + self.assertIn(foo_bp_id, verified_breakpoint_ids) diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py index dc6bf38303204..8db2316e73fc8 100644 --- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py +++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py @@ -156,7 +156,6 @@ def test_debuggerRoot(self): self.build_and_launch( program, debuggerRoot=program_parent_dir, initCommands=commands ) - self.continue_to_exit() output = self.get_console() self.assertTrue(output and len(output) > 0, "expect console output") lines = output.splitlines() @@ -172,6 +171,7 @@ def test_debuggerRoot(self): % (program_parent_dir, line[len(prefix) :]), ) self.assertTrue(found, "verified lldb-dap working directory") + self.continue_to_exit() def test_sourcePath(self): """ diff --git 
a/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py b/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py index 9d1d17b704f76..1f4afabbd161e 100644 --- a/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py +++ b/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py @@ -1,58 +1,58 @@ -""" -Test 'module' events for dynamically loaded libraries. -""" - +import dap_server from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil import lldbdap_testcase +import re class TestDAP_module_event(lldbdap_testcase.DAPTestCaseBase): - def lookup_module_id(self, name): - """Returns the identifier for the first module event starting with the given name.""" - for event in self.dap_server.module_events: - if self.get_dict_value(event, ["body", "module", "name"]).startswith(name): - return self.get_dict_value(event, ["body", "module", "id"]) - self.fail(f"No module events matching name={name}") - - def module_events(self, id): - """Finds all module events by identifier.""" - return [ - event - for event in self.dap_server.module_events - if self.get_dict_value(event, ["body", "module", "id"]) == id - ] - - def module_reasons(self, events): - """Returns the list of 'reason' values from the given events.""" - return [event["body"]["reason"] for event in events] - @skipIfWindows def test_module_event(self): - """ - Test that module events are fired on target load and when the list of - dynamic libraries updates while running. - """ program = self.getBuildArtifact("a.out") self.build_and_launch(program) - # We can analyze the order of events after the process exits. 
- self.continue_to_exit() - a_out_id = self.lookup_module_id("a.out") - a_out_events = self.module_events(id=a_out_id) + source = "main.cpp" + breakpoint1_line = line_number(source, "// breakpoint 1") + breakpoint2_line = line_number(source, "// breakpoint 2") + breakpoint3_line = line_number(source, "// breakpoint 3") - self.assertIn( - "new", - self.module_reasons(a_out_events), - "Expected a.out to load during the debug session.", + breakpoint_ids = self.set_source_breakpoints( + source, [breakpoint1_line, breakpoint2_line, breakpoint3_line] ) + self.continue_to_breakpoints(breakpoint_ids) + + # We're now stopped at breakpoint 1 before the dlopen. Flush all the module events. + event = self.dap_server.wait_for_event(["module"]) + while event is not None: + event = self.dap_server.wait_for_event(["module"]) + + # Continue to the second breakpoint, before the dlclose. + self.continue_to_breakpoints(breakpoint_ids) + + # Make sure we got a module event for libother. + event = self.dap_server.wait_for_event(["module"]) + self.assertIsNotNone(event, "didn't get a module event") + module_name = event["body"]["module"]["name"] + module_id = event["body"]["module"]["id"] + self.assertEqual(event["body"]["reason"], "new") + self.assertIn("libother", module_name) + + # Continue to the third breakpoint, after the dlclose. + self.continue_to_breakpoints(breakpoint_ids) + + # Make sure we got a module event for libother. + event = self.dap_server.wait_for_event(["module"]) + self.assertIsNotNone(event, "didn't get a module event") + reason = event["body"]["reason"] + self.assertEqual(reason, "removed") + self.assertEqual(event["body"]["module"]["id"], module_id) + + # The removed module event should omit everything but the module id and name + # as they are required fields. 
+ module_data = event["body"]["module"] + required_keys = ["id", "name"] + self.assertListEqual(list(module_data.keys()), required_keys) + self.assertEqual(module_data["name"], "", "expects empty name.") - libother_id = self.lookup_module_id( - "libother." # libother.so or libother.dylib based on OS. - ) - libother_events = self.module_events(id=libother_id) - self.assertEqual( - self.module_reasons(libother_events), - ["new", "removed"], - "Expected libother to be loaded then unloaded during the debug session.", - ) + self.continue_to_exit() diff --git a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py index 2d00c512721c6..0ed53dac5d869 100644 --- a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py +++ b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py @@ -64,18 +64,19 @@ def check_symbols_loaded_with_size(): self.assertEqual(program, program_module["path"]) self.assertIn("addressRange", program_module) - self.continue_to_exit() - # Collect all the module names we saw as events. module_new_names = [] module_changed_names = [] - for module_event in self.dap_server.module_events: + module_event = self.dap_server.wait_for_event(["module"]) + while module_event is not None: reason = module_event["body"]["reason"] if reason == "new": module_new_names.append(module_event["body"]["module"]["name"]) elif reason == "changed": module_changed_names.append(module_event["body"]["module"]["name"]) + module_event = self.dap_server.wait_for_event(["module"]) + # Make sure we got an event for every active module. self.assertNotEqual(len(module_new_names), 0) for module in active_modules: @@ -85,6 +86,7 @@ def check_symbols_loaded_with_size(): # symbols got added. 
self.assertNotEqual(len(module_changed_names), 0) self.assertIn(program_module["name"], module_changed_names) + self.continue_to_exit() @skipIfWindows def test_modules(self): diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py index fa62ec243f5c5..e1ad1425a993d 100644 --- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py +++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py @@ -30,11 +30,7 @@ def verify_stopped_on_entry(self, stopped_events: List[Dict[str, Any]]): if reason == "entry": seen_stopped_event += 1 - self.assertEqual( - seen_stopped_event, - 1, - f"expect only one stopped entry event in {stopped_events}", - ) + self.assertEqual(seen_stopped_event, 1, "expect only one stopped entry event.") @skipIfAsan @skipIfWindows @@ -96,13 +92,11 @@ def test_stopOnEntry(self): self.build_and_launch(program, console="integratedTerminal", stopOnEntry=True) [bp_main] = self.set_function_breakpoints(["main"]) - self.dap_server.request_configurationDone() - stopped_threads = list(self.dap_server.thread_stop_reasons.values()) + self.dap_server.request_continue() # sends configuration done + stopped_events = self.dap_server.wait_for_stopped() # We should be stopped at the entry point. - self.assertEqual( - len(stopped_threads), 1, "Expected the main thread to be stopped on entry." - ) - self.assertEqual(stopped_threads[0]["reason"], "entry") + self.assertGreaterEqual(len(stopped_events), 0, "expect stopped events") + self.verify_stopped_on_entry(stopped_events) # Then, if we continue, we should hit the breakpoint at main. self.dap_server.request_continue() @@ -111,12 +105,8 @@ def test_stopOnEntry(self): # Restart and check that we still get a stopped event before reaching # main. self.dap_server.request_restart() - stopped_threads = list(self.dap_server.thread_stop_reasons.values()) - # We should be stopped at the entry point. 
- self.assertEqual( - len(stopped_threads), 1, "Expected the main thread to be stopped on entry." - ) - self.assertEqual(stopped_threads[0]["reason"], "entry") + stopped_events = self.dap_server.wait_for_stopped() + self.verify_stopped_on_entry(stopped_events) # continue to main self.dap_server.request_continue() diff --git a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py index 0184020589176..a01845669666f 100644 --- a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py +++ b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py @@ -32,7 +32,7 @@ def test_send_event(self): ], ) self.set_source_breakpoints(source, [breakpoint_line]) - self.do_continue() + self.continue_to_next_stop() custom_event = self.dap_server.wait_for_event( filter=["my-custom-event-no-body"] From 00965d672bd1bf6995b544e8504ec8a8c58e1adf Mon Sep 17 00:00:00 2001 From: Ebuka Ezike Date: Thu, 30 Oct 2025 10:41:17 +0000 Subject: [PATCH 176/539] [lldb-dap][test] skip io_redirection in debug builds (#165593) Currently all `runInTerminal` test are skipped in debug builds because, when attaching it times out parsing the debug symbols of lldb-dap. Add this test since it is running in teminal. --- lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py index 8db2316e73fc8..ca881f1d817c5 100644 --- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py +++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py @@ -642,6 +642,7 @@ def test_stdio_redirection(self): @skipIfAsan @skipIfWindows @skipIf(oslist=["linux"], archs=no_match(["x86_64"])) + @skipIfBuildType(["debug"]) def test_stdio_redirection_and_console(self): """ Test stdio redirection and console. 
From 1d8ee1e516e15cfbb1deefa6ef4a91d9341969b3 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Thu, 30 Oct 2025 10:46:37 +0000 Subject: [PATCH 177/539] [Clang][AArch64] Lower NEON vaddv/vminv/vmaxv builtins to llvm.vector.reduce intrinsics. (#165400) This is the first step in removing some NEON reduction intrinsics that duplicate the behaviour of their llvm.vector.reduce counterpart. NOTE: The i8/i16 variants differ in that the NEON versions return an i32 result. However, this looks more about making their code generation convenient with SelectionDAG disgarding the extra bits. This is only relevant for the next phase because the Clang usage always truncate their result, making llvm.vector.reduce a drop in replacement. --- clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 249 +++---------------- clang/test/CodeGen/AArch64/neon-across.c | 132 ++++------ clang/test/CodeGen/AArch64/neon-intrinsics.c | 20 +- 3 files changed, 104 insertions(+), 297 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index 60f9b86333670..15fa78ddba715 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -1193,14 +1193,22 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = { NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType), NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType), NEONMAP1(vaddv_f32, aarch64_neon_faddv, AddRetType | Add1ArgType), - NEONMAP1(vaddv_s32, aarch64_neon_saddv, AddRetType | Add1ArgType), - NEONMAP1(vaddv_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType), + NEONMAP1(vaddv_s16, vector_reduce_add, Add1ArgType), + NEONMAP1(vaddv_s32, vector_reduce_add, Add1ArgType), + NEONMAP1(vaddv_s8, vector_reduce_add, Add1ArgType), + NEONMAP1(vaddv_u16, vector_reduce_add, Add1ArgType), + NEONMAP1(vaddv_u32, vector_reduce_add, Add1ArgType), + NEONMAP1(vaddv_u8, vector_reduce_add, Add1ArgType), NEONMAP1(vaddvq_f32, aarch64_neon_faddv, 
AddRetType | Add1ArgType), NEONMAP1(vaddvq_f64, aarch64_neon_faddv, AddRetType | Add1ArgType), - NEONMAP1(vaddvq_s32, aarch64_neon_saddv, AddRetType | Add1ArgType), - NEONMAP1(vaddvq_s64, aarch64_neon_saddv, AddRetType | Add1ArgType), - NEONMAP1(vaddvq_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType), - NEONMAP1(vaddvq_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType), + NEONMAP1(vaddvq_s16, vector_reduce_add, Add1ArgType), + NEONMAP1(vaddvq_s32, vector_reduce_add, Add1ArgType), + NEONMAP1(vaddvq_s64, vector_reduce_add, Add1ArgType), + NEONMAP1(vaddvq_s8, vector_reduce_add, Add1ArgType), + NEONMAP1(vaddvq_u16, vector_reduce_add, Add1ArgType), + NEONMAP1(vaddvq_u32, vector_reduce_add, Add1ArgType), + NEONMAP1(vaddvq_u64, vector_reduce_add, Add1ArgType), + NEONMAP1(vaddvq_u8, vector_reduce_add, Add1ArgType), NEONMAP1(vcaged_f64, aarch64_neon_facge, AddRetType | Add1ArgType), NEONMAP1(vcages_f32, aarch64_neon_facge, AddRetType | Add1ArgType), NEONMAP1(vcagtd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType), @@ -1243,27 +1251,43 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = { NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), NEONMAP1(vmaxnmvq_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), NEONMAP1(vmaxv_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType), - NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType), - NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType), + NEONMAP1(vmaxv_s16, vector_reduce_smax, Add1ArgType), + NEONMAP1(vmaxv_s32, vector_reduce_smax, Add1ArgType), + NEONMAP1(vmaxv_s8, vector_reduce_smax, Add1ArgType), + NEONMAP1(vmaxv_u16, vector_reduce_umax, Add1ArgType), + NEONMAP1(vmaxv_u32, vector_reduce_umax, Add1ArgType), + NEONMAP1(vmaxv_u8, vector_reduce_umax, Add1ArgType), NEONMAP1(vmaxvq_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType), NEONMAP1(vmaxvq_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType), - NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, AddRetType | 
Add1ArgType), - NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType), + NEONMAP1(vmaxvq_s16, vector_reduce_smax, Add1ArgType), + NEONMAP1(vmaxvq_s32, vector_reduce_smax, Add1ArgType), + NEONMAP1(vmaxvq_s8, vector_reduce_smax, Add1ArgType), + NEONMAP1(vmaxvq_u16, vector_reduce_umax, Add1ArgType), + NEONMAP1(vmaxvq_u32, vector_reduce_umax, Add1ArgType), + NEONMAP1(vmaxvq_u8, vector_reduce_umax, Add1ArgType), NEONMAP1(vminnmv_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType), NEONMAP1(vminnmvq_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType), NEONMAP1(vminnmvq_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType), NEONMAP1(vminv_f32, aarch64_neon_fminv, AddRetType | Add1ArgType), - NEONMAP1(vminv_s32, aarch64_neon_sminv, AddRetType | Add1ArgType), - NEONMAP1(vminv_u32, aarch64_neon_uminv, AddRetType | Add1ArgType), + NEONMAP1(vminv_s16, vector_reduce_smin, Add1ArgType), + NEONMAP1(vminv_s32, vector_reduce_smin, Add1ArgType), + NEONMAP1(vminv_s8, vector_reduce_smin, Add1ArgType), + NEONMAP1(vminv_u16, vector_reduce_umin, Add1ArgType), + NEONMAP1(vminv_u32, vector_reduce_umin, Add1ArgType), + NEONMAP1(vminv_u8, vector_reduce_umin, Add1ArgType), NEONMAP1(vminvq_f32, aarch64_neon_fminv, AddRetType | Add1ArgType), NEONMAP1(vminvq_f64, aarch64_neon_fminv, AddRetType | Add1ArgType), - NEONMAP1(vminvq_s32, aarch64_neon_sminv, AddRetType | Add1ArgType), - NEONMAP1(vminvq_u32, aarch64_neon_uminv, AddRetType | Add1ArgType), + NEONMAP1(vminvq_s16, vector_reduce_smin, Add1ArgType), + NEONMAP1(vminvq_s32, vector_reduce_smin, Add1ArgType), + NEONMAP1(vminvq_s8, vector_reduce_smin, Add1ArgType), + NEONMAP1(vminvq_u16, vector_reduce_umin, Add1ArgType), + NEONMAP1(vminvq_u32, vector_reduce_umin, Add1ArgType), + NEONMAP1(vminvq_u8, vector_reduce_umin, Add1ArgType), NEONMAP1(vmull_p64, aarch64_neon_pmull64, 0), NEONMAP1(vmulxd_f64, aarch64_neon_fmulx, Add1ArgType), NEONMAP1(vmulxs_f32, aarch64_neon_fmulx, Add1ArgType), - NEONMAP1(vpaddd_s64, aarch64_neon_uaddv, 
AddRetType | Add1ArgType), - NEONMAP1(vpaddd_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType), + NEONMAP1(vpaddd_s64, vector_reduce_add, Add1ArgType), + NEONMAP1(vpaddd_u64, vector_reduce_add, Add1ArgType), NEONMAP1(vpmaxnmqd_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), NEONMAP1(vpmaxnms_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), NEONMAP1(vpmaxqd_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType), @@ -7067,127 +7091,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Int = Intrinsic::bitreverse; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit"); } - case NEON::BI__builtin_neon_vaddv_u8: - // FIXME: These are handled by the AArch64 scalar code. - usgn = true; - [[fallthrough]]; - case NEON::BI__builtin_neon_vaddv_s8: { - Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int8Ty, 8); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv"); - return Builder.CreateTrunc(Ops[0], Int8Ty); - } - case NEON::BI__builtin_neon_vaddv_u16: - usgn = true; - [[fallthrough]]; - case NEON::BI__builtin_neon_vaddv_s16: { - Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int16Ty, 4); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv"); - return Builder.CreateTrunc(Ops[0], Int16Ty); - } - case NEON::BI__builtin_neon_vaddvq_u8: - usgn = true; - [[fallthrough]]; - case NEON::BI__builtin_neon_vaddvq_s8: { - Int = usgn ? 
Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int8Ty, 16); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv"); - return Builder.CreateTrunc(Ops[0], Int8Ty); - } - case NEON::BI__builtin_neon_vaddvq_u16: - usgn = true; - [[fallthrough]]; - case NEON::BI__builtin_neon_vaddvq_s16: { - Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int16Ty, 8); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv"); - return Builder.CreateTrunc(Ops[0], Int16Ty); - } - case NEON::BI__builtin_neon_vmaxv_u8: { - Int = Intrinsic::aarch64_neon_umaxv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int8Ty, 8); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); - return Builder.CreateTrunc(Ops[0], Int8Ty); - } - case NEON::BI__builtin_neon_vmaxv_u16: { - Int = Intrinsic::aarch64_neon_umaxv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int16Ty, 4); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); - return Builder.CreateTrunc(Ops[0], Int16Ty); - } - case NEON::BI__builtin_neon_vmaxvq_u8: { - Int = Intrinsic::aarch64_neon_umaxv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int8Ty, 16); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); - return Builder.CreateTrunc(Ops[0], Int8Ty); - } - case NEON::BI__builtin_neon_vmaxvq_u16: { - Int = Intrinsic::aarch64_neon_umaxv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int16Ty, 8); - llvm::Type *Tys[2] = { 
Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); - return Builder.CreateTrunc(Ops[0], Int16Ty); - } - case NEON::BI__builtin_neon_vmaxv_s8: { - Int = Intrinsic::aarch64_neon_smaxv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int8Ty, 8); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); - return Builder.CreateTrunc(Ops[0], Int8Ty); - } - case NEON::BI__builtin_neon_vmaxv_s16: { - Int = Intrinsic::aarch64_neon_smaxv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int16Ty, 4); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); - return Builder.CreateTrunc(Ops[0], Int16Ty); - } - case NEON::BI__builtin_neon_vmaxvq_s8: { - Int = Intrinsic::aarch64_neon_smaxv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int8Ty, 16); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); - return Builder.CreateTrunc(Ops[0], Int8Ty); - } - case NEON::BI__builtin_neon_vmaxvq_s16: { - Int = Intrinsic::aarch64_neon_smaxv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int16Ty, 8); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); - return Builder.CreateTrunc(Ops[0], Int16Ty); - } case NEON::BI__builtin_neon_vmaxv_f16: { Int = Intrinsic::aarch64_neon_fmaxv; Ty = HalfTy; @@ -7206,78 +7109,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); return Builder.CreateTrunc(Ops[0], HalfTy); } - case NEON::BI__builtin_neon_vminv_u8: { - Int = Intrinsic::aarch64_neon_uminv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int8Ty, 8); - 
llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); - return Builder.CreateTrunc(Ops[0], Int8Ty); - } - case NEON::BI__builtin_neon_vminv_u16: { - Int = Intrinsic::aarch64_neon_uminv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int16Ty, 4); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); - return Builder.CreateTrunc(Ops[0], Int16Ty); - } - case NEON::BI__builtin_neon_vminvq_u8: { - Int = Intrinsic::aarch64_neon_uminv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int8Ty, 16); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); - return Builder.CreateTrunc(Ops[0], Int8Ty); - } - case NEON::BI__builtin_neon_vminvq_u16: { - Int = Intrinsic::aarch64_neon_uminv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int16Ty, 8); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); - return Builder.CreateTrunc(Ops[0], Int16Ty); - } - case NEON::BI__builtin_neon_vminv_s8: { - Int = Intrinsic::aarch64_neon_sminv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int8Ty, 8); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); - return Builder.CreateTrunc(Ops[0], Int8Ty); - } - case NEON::BI__builtin_neon_vminv_s16: { - Int = Intrinsic::aarch64_neon_sminv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int16Ty, 4); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); - return Builder.CreateTrunc(Ops[0], Int16Ty); - } - case NEON::BI__builtin_neon_vminvq_s8: { - Int = 
Intrinsic::aarch64_neon_sminv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int8Ty, 16); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); - return Builder.CreateTrunc(Ops[0], Int8Ty); - } - case NEON::BI__builtin_neon_vminvq_s16: { - Int = Intrinsic::aarch64_neon_sminv; - Ty = Int32Ty; - VTy = llvm::FixedVectorType::get(Int16Ty, 8); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); - return Builder.CreateTrunc(Ops[0], Int16Ty); - } case NEON::BI__builtin_neon_vminv_f16: { Int = Intrinsic::aarch64_neon_fminv; Ty = HalfTy; diff --git a/clang/test/CodeGen/AArch64/neon-across.c b/clang/test/CodeGen/AArch64/neon-across.c index aa0387d89dfef..aae5097da7789 100644 --- a/clang/test/CodeGen/AArch64/neon-across.c +++ b/clang/test/CodeGen/AArch64/neon-across.c @@ -113,9 +113,8 @@ uint64_t test_vaddlvq_u32(uint32x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxv_s8 // CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8 -// CHECK-NEXT: ret i8 [[TMP0]] +// CHECK-NEXT: [[VMAXV_S8_I:%.*]] = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret i8 [[VMAXV_S8_I]] // int8_t test_vmaxv_s8(int8x8_t a) { return vmaxv_s8(a); @@ -124,9 +123,8 @@ int8_t test_vmaxv_s8(int8x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxv_s16 // CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16 -// CHECK-NEXT: ret i16 [[TMP0]] +// CHECK-NEXT: [[VMAXV_S16_I:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[A]]) +// 
CHECK-NEXT: ret i16 [[VMAXV_S16_I]] // int16_t test_vmaxv_s16(int16x4_t a) { return vmaxv_s16(a); @@ -135,9 +133,8 @@ int16_t test_vmaxv_s16(int16x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxv_u8 // CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8 -// CHECK-NEXT: ret i8 [[TMP0]] +// CHECK-NEXT: [[VMAXV_U8_I:%.*]] = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret i8 [[VMAXV_U8_I]] // uint8_t test_vmaxv_u8(uint8x8_t a) { return vmaxv_u8(a); @@ -146,9 +143,8 @@ uint8_t test_vmaxv_u8(uint8x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxv_u16 // CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16 -// CHECK-NEXT: ret i16 [[TMP0]] +// CHECK-NEXT: [[VMAXV_U16_I:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[A]]) +// CHECK-NEXT: ret i16 [[VMAXV_U16_I]] // uint16_t test_vmaxv_u16(uint16x4_t a) { return vmaxv_u16(a); @@ -157,9 +153,8 @@ uint16_t test_vmaxv_u16(uint16x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s8 // CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8 -// CHECK-NEXT: ret i8 [[TMP0]] +// CHECK-NEXT: [[VMAXVQ_S8_I:%.*]] = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret i8 [[VMAXVQ_S8_I]] // int8_t test_vmaxvq_s8(int8x16_t a) { return vmaxvq_s8(a); @@ -168,9 +163,8 @@ int8_t test_vmaxvq_s8(int8x16_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s16 // CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16 -// CHECK-NEXT: ret i16 [[TMP0]] +// CHECK-NEXT: [[VMAXVQ_S16_I:%.*]] = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> [[A]]) +// CHECK-NEXT: ret i16 [[VMAXVQ_S16_I]] // int16_t test_vmaxvq_s16(int16x8_t a) { return vmaxvq_s16(a); @@ -179,7 +173,7 @@ int16_t test_vmaxvq_s16(int16x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s32 // CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VMAXVQ_S32_I]] // int32_t test_vmaxvq_s32(int32x4_t a) { @@ -189,9 +183,8 @@ int32_t test_vmaxvq_s32(int32x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u8 // CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8 -// CHECK-NEXT: ret i8 [[TMP0]] +// CHECK-NEXT: [[VMAXVQ_U8_I:%.*]] = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret i8 [[VMAXVQ_U8_I]] // uint8_t test_vmaxvq_u8(uint8x16_t a) { return vmaxvq_u8(a); @@ -200,9 +193,8 @@ uint8_t test_vmaxvq_u8(uint8x16_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u16 // CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16 -// CHECK-NEXT: ret i16 [[TMP0]] +// CHECK-NEXT: [[VMAXVQ_U16_I:%.*]] = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[A]]) +// CHECK-NEXT: ret i16 [[VMAXVQ_U16_I]] // uint16_t test_vmaxvq_u16(uint16x8_t a) { return vmaxvq_u16(a); @@ -211,7 
+203,7 @@ uint16_t test_vmaxvq_u16(uint16x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u32 // CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VMAXVQ_U32_I]] // uint32_t test_vmaxvq_u32(uint32x4_t a) { @@ -221,9 +213,8 @@ uint32_t test_vmaxvq_u32(uint32x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminv_s8 // CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8 -// CHECK-NEXT: ret i8 [[TMP0]] +// CHECK-NEXT: [[VMINV_S8_I:%.*]] = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret i8 [[VMINV_S8_I]] // int8_t test_vminv_s8(int8x8_t a) { return vminv_s8(a); @@ -232,9 +223,8 @@ int8_t test_vminv_s8(int8x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminv_s16 // CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16 -// CHECK-NEXT: ret i16 [[TMP0]] +// CHECK-NEXT: [[VMINV_S16_I:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[A]]) +// CHECK-NEXT: ret i16 [[VMINV_S16_I]] // int16_t test_vminv_s16(int16x4_t a) { return vminv_s16(a); @@ -243,9 +233,8 @@ int16_t test_vminv_s16(int16x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminv_u8 // CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8 -// CHECK-NEXT: ret i8 [[TMP0]] +// CHECK-NEXT: [[VMINV_U8_I:%.*]] = call i8 
@llvm.vector.reduce.umin.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret i8 [[VMINV_U8_I]] // uint8_t test_vminv_u8(uint8x8_t a) { return vminv_u8(a); @@ -254,9 +243,8 @@ uint8_t test_vminv_u8(uint8x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminv_u16 // CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16 -// CHECK-NEXT: ret i16 [[TMP0]] +// CHECK-NEXT: [[VMINV_U16_I:%.*]] = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> [[A]]) +// CHECK-NEXT: ret i16 [[VMINV_U16_I]] // uint16_t test_vminv_u16(uint16x4_t a) { return vminv_u16(a); @@ -265,9 +253,8 @@ uint16_t test_vminv_u16(uint16x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminvq_s8 // CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8 -// CHECK-NEXT: ret i8 [[TMP0]] +// CHECK-NEXT: [[VMINVQ_S8_I:%.*]] = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret i8 [[VMINVQ_S8_I]] // int8_t test_vminvq_s8(int8x16_t a) { return vminvq_s8(a); @@ -276,9 +263,8 @@ int8_t test_vminvq_s8(int8x16_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminvq_s16 // CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16 -// CHECK-NEXT: ret i16 [[TMP0]] +// CHECK-NEXT: [[VMINVQ_S16_I:%.*]] = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> [[A]]) +// CHECK-NEXT: ret i16 [[VMINVQ_S16_I]] // int16_t test_vminvq_s16(int16x8_t a) { return vminvq_s16(a); @@ -287,7 +273,7 @@ int16_t test_vminvq_s16(int16x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminvq_s32 // CHECK-SAME: (<4 x i32> 
noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: [[VMINVQ_S32_I:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VMINVQ_S32_I]] // int32_t test_vminvq_s32(int32x4_t a) { @@ -297,9 +283,8 @@ int32_t test_vminvq_s32(int32x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminvq_u8 // CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8 -// CHECK-NEXT: ret i8 [[TMP0]] +// CHECK-NEXT: [[VMINVQ_U8_I:%.*]] = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret i8 [[VMINVQ_U8_I]] // uint8_t test_vminvq_u8(uint8x16_t a) { return vminvq_u8(a); @@ -308,9 +293,8 @@ uint8_t test_vminvq_u8(uint8x16_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminvq_u16 // CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16 -// CHECK-NEXT: ret i16 [[TMP0]] +// CHECK-NEXT: [[VMINVQ_U16_I:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A]]) +// CHECK-NEXT: ret i16 [[VMINVQ_U16_I]] // uint16_t test_vminvq_u16(uint16x8_t a) { return vminvq_u16(a); @@ -319,7 +303,7 @@ uint16_t test_vminvq_u16(uint16x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminvq_u32 // CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: [[VMINVQ_U32_I:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VMINVQ_U32_I]] // uint32_t test_vminvq_u32(uint32x4_t a) { @@ -329,9 +313,8 @@ uint32_t 
test_vminvq_u32(uint32x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vaddv_s8 // CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8 -// CHECK-NEXT: ret i8 [[TMP0]] +// CHECK-NEXT: [[VADDV_S8_I:%.*]] = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret i8 [[VADDV_S8_I]] // int8_t test_vaddv_s8(int8x8_t a) { return vaddv_s8(a); @@ -340,9 +323,8 @@ int8_t test_vaddv_s8(int8x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vaddv_s16 // CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16 -// CHECK-NEXT: ret i16 [[TMP0]] +// CHECK-NEXT: [[VADDV_S16_I:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[A]]) +// CHECK-NEXT: ret i16 [[VADDV_S16_I]] // int16_t test_vaddv_s16(int16x4_t a) { return vaddv_s16(a); @@ -351,9 +333,8 @@ int16_t test_vaddv_s16(int16x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vaddv_u8 // CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8 -// CHECK-NEXT: ret i8 [[TMP0]] +// CHECK-NEXT: [[VADDV_U8_I:%.*]] = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret i8 [[VADDV_U8_I]] // uint8_t test_vaddv_u8(uint8x8_t a) { return vaddv_u8(a); @@ -362,9 +343,8 @@ uint8_t test_vaddv_u8(uint8x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vaddv_u16 // CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16 -// 
CHECK-NEXT: ret i16 [[TMP0]] +// CHECK-NEXT: [[VADDV_U16_I:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[A]]) +// CHECK-NEXT: ret i16 [[VADDV_U16_I]] // uint16_t test_vaddv_u16(uint16x4_t a) { return vaddv_u16(a); @@ -373,9 +353,8 @@ uint16_t test_vaddv_u16(uint16x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s8 // CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8 -// CHECK-NEXT: ret i8 [[TMP0]] +// CHECK-NEXT: [[VADDVQ_S8_I:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret i8 [[VADDVQ_S8_I]] // int8_t test_vaddvq_s8(int8x16_t a) { return vaddvq_s8(a); @@ -384,9 +363,8 @@ int8_t test_vaddvq_s8(int8x16_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s16 // CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16 -// CHECK-NEXT: ret i16 [[TMP0]] +// CHECK-NEXT: [[VADDVQ_S16_I:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[A]]) +// CHECK-NEXT: ret i16 [[VADDVQ_S16_I]] // int16_t test_vaddvq_s16(int16x8_t a) { return vaddvq_s16(a); @@ -395,7 +373,7 @@ int16_t test_vaddvq_s16(int16x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s32 // CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: [[VADDVQ_S32_I:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VADDVQ_S32_I]] // int32_t test_vaddvq_s32(int32x4_t a) { @@ -405,9 +383,8 @@ int32_t test_vaddvq_s32(int32x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u8 // CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { 
// CHECK-NEXT: entry: -// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8 -// CHECK-NEXT: ret i8 [[TMP0]] +// CHECK-NEXT: [[VADDVQ_U8_I:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret i8 [[VADDVQ_U8_I]] // uint8_t test_vaddvq_u8(uint8x16_t a) { return vaddvq_u8(a); @@ -416,9 +393,8 @@ uint8_t test_vaddvq_u8(uint8x16_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u16 // CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[A]]) -// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16 -// CHECK-NEXT: ret i16 [[TMP0]] +// CHECK-NEXT: [[VADDVQ_U16_I:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[A]]) +// CHECK-NEXT: ret i16 [[VADDVQ_U16_I]] // uint16_t test_vaddvq_u16(uint16x8_t a) { return vaddvq_u16(a); @@ -427,7 +403,7 @@ uint16_t test_vaddvq_u16(uint16x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u32 // CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: [[VADDVQ_U32_I:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VADDVQ_U32_I]] // uint32_t test_vaddvq_u32(uint32x4_t a) { diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c index 035e1ca1b45e8..1c628bbba483f 100644 --- a/clang/test/CodeGen/AArch64/neon-intrinsics.c +++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c @@ -12643,7 +12643,7 @@ uint64_t test_vqrshld_u64(uint64_t a, int64_t b) { // CHECK-LABEL: define dso_local i64 @test_vpaddd_s64( // CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VPADDD_S64_I:%.*]] = call i64 
@llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A]]) +// CHECK-NEXT: [[VPADDD_S64_I:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[A]]) // CHECK-NEXT: ret i64 [[VPADDD_S64_I]] // int64_t test_vpaddd_s64(int64x2_t a) { @@ -23227,7 +23227,7 @@ uint64x2_t test_vpaddq_u64(uint64x2_t a, uint64x2_t b) { // CHECK-LABEL: define dso_local i64 @test_vpaddd_u64( // CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VPADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A]]) +// CHECK-NEXT: [[VPADDD_U64_I:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[A]]) // CHECK-NEXT: ret i64 [[VPADDD_U64_I]] // uint64_t test_vpaddd_u64(uint64x2_t a) { @@ -23237,7 +23237,7 @@ uint64_t test_vpaddd_u64(uint64x2_t a) { // CHECK-LABEL: define dso_local i64 @test_vaddvq_s64( // CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VADDVQ_S64_I:%.*]] = call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> [[A]]) +// CHECK-NEXT: [[VADDVQ_S64_I:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[A]]) // CHECK-NEXT: ret i64 [[VADDVQ_S64_I]] // int64_t test_vaddvq_s64(int64x2_t a) { @@ -23247,7 +23247,7 @@ int64_t test_vaddvq_s64(int64x2_t a) { // CHECK-LABEL: define dso_local i64 @test_vaddvq_u64( // CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VADDVQ_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A]]) +// CHECK-NEXT: [[VADDVQ_U64_I:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[A]]) // CHECK-NEXT: ret i64 [[VADDVQ_U64_I]] // uint64_t test_vaddvq_u64(uint64x2_t a) { @@ -23878,7 +23878,7 @@ float64x1_t test_vrsqrts_f64(float64x1_t a, float64x1_t b) { // CHECK-LABEL: define dso_local i32 @test_vminv_s32( // CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VMINV_S32_I:%.*]] = call i32 
@llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: [[VMINV_S32_I:%.*]] = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VMINV_S32_I]] // int32_t test_vminv_s32(int32x2_t a) { @@ -23888,7 +23888,7 @@ int32_t test_vminv_s32(int32x2_t a) { // CHECK-LABEL: define dso_local i32 @test_vminv_u32( // CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VMINV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: [[VMINV_U32_I:%.*]] = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VMINV_U32_I]] // uint32_t test_vminv_u32(uint32x2_t a) { @@ -23898,7 +23898,7 @@ uint32_t test_vminv_u32(uint32x2_t a) { // CHECK-LABEL: define dso_local i32 @test_vmaxv_s32( // CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VMAXV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: [[VMAXV_S32_I:%.*]] = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VMAXV_S32_I]] // int32_t test_vmaxv_s32(int32x2_t a) { @@ -23908,7 +23908,7 @@ int32_t test_vmaxv_s32(int32x2_t a) { // CHECK-LABEL: define dso_local i32 @test_vmaxv_u32( // CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VMAXV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: [[VMAXV_U32_I:%.*]] = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VMAXV_U32_I]] // uint32_t test_vmaxv_u32(uint32x2_t a) { @@ -23918,7 +23918,7 @@ uint32_t test_vmaxv_u32(uint32x2_t a) { // CHECK-LABEL: define dso_local i32 @test_vaddv_s32( // CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VADDV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: [[VADDV_S32_I:%.*]] 
= call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VADDV_S32_I]] // int32_t test_vaddv_s32(int32x2_t a) { @@ -23928,7 +23928,7 @@ int32_t test_vaddv_s32(int32x2_t a) { // CHECK-LABEL: define dso_local i32 @test_vaddv_u32( // CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VADDV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: [[VADDV_U32_I:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VADDV_U32_I]] // uint32_t test_vaddv_u32(uint32x2_t a) { From e3acab34c425a7d7741619d642501ec40387e824 Mon Sep 17 00:00:00 2001 From: Ryotaro Kasuga Date: Thu, 30 Oct 2025 19:49:57 +0900 Subject: [PATCH 178/539] [DA] Add tests where dependencies are missed due to overflow (NFC) (#164246) This patch adds test cases that demonstrate missing dependencies in DA caused by the lack of overflow handling. These issues will be addressed by properly inserting overflow checks and bailing out when one is detected. 
It covers the following dependence test functions: - Strong SIV - Weak-Crossing SIV - Weak-Zero SIV - Symbolic RDIV - GCD MIV It does NOT cover: - Exact SIV - Exact RDIV - Banerjee MIV --- .../DependenceAnalysis/gcd-miv-overflow.ll | 66 +++++++++ .../DependenceAnalysis/strong-siv-overflow.ll | 68 +++++++++ .../symbolic-rdiv-overflow.ll | 137 ++++++++++++++++++ .../weak-crossing-siv-overflow.ll | 125 ++++++++++++++++ .../weak-zero-siv-overflow.ll | 122 ++++++++++++++++ 5 files changed, 518 insertions(+) create mode 100644 llvm/test/Analysis/DependenceAnalysis/gcd-miv-overflow.ll create mode 100644 llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll create mode 100644 llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll create mode 100644 llvm/test/Analysis/DependenceAnalysis/weak-crossing-siv-overflow.ll create mode 100644 llvm/test/Analysis/DependenceAnalysis/weak-zero-siv-overflow.ll diff --git a/llvm/test/Analysis/DependenceAnalysis/gcd-miv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/gcd-miv-overflow.ll new file mode 100644 index 0000000000000..43f66dd7d0974 --- /dev/null +++ b/llvm/test/Analysis/DependenceAnalysis/gcd-miv-overflow.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -disable-output "-passes=print" 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ALL +; RUN: opt < %s -disable-output "-passes=print" -da-enable-dependence-test=gcd-miv 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-GCD-MIV + +; offset0 = 4; +; offset1 = 0; +; for (i = 0; i < 100; i++) { +; A[offset0] = 1; +; A[offset1] = 2; +; offset0 += 3*m; +; offset1 += 3; +; } +; +; FIXME: DependenceAnalysis currently detects no dependency between the two +; stores, but it does exist. E.g., consider `m` is 12297829382473034411, which +; is a modular multiplicative inverse of 3 under modulo 2^64. 
Then `offset0` is +; effectively `i + 4`, so accesses will be as follows: +; +; - A[offset0] : A[4], A[5], A[6], ... +; - A[offset1] : A[0], A[3], A[6], ... +; +; The root cause is that DA interprets `3*m` in non-modular arithmetic, which +; isn't necessarily true due to overflow. +; +define void @gcdmiv_coef_ovfl(ptr %A, i64 %m) { +; CHECK-ALL-LABEL: 'gcdmiv_coef_ovfl' +; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; +; CHECK-GCD-MIV-LABEL: 'gcdmiv_coef_ovfl' +; CHECK-GCD-MIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 +; CHECK-GCD-MIV-NEXT: da analyze - consistent output [*]! +; CHECK-GCD-MIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-GCD-MIV-NEXT: da analyze - none! +; CHECK-GCD-MIV-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-GCD-MIV-NEXT: da analyze - consistent output [*]! +; +entry: + %step = mul i64 3, %m + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ] + %offset.0 = phi i64 [ 4, %entry ] , [ %offset.0.next, %loop ] + %offset.1 = phi i64 [ 0, %entry ] , [ %offset.1.next, %loop ] + %gep.0 = getelementptr inbounds i8, ptr %A, i64 %offset.0 + %gep.1 = getelementptr inbounds i8, ptr %A, i64 %offset.1 + store i8 1, ptr %gep.0 + store i8 2, ptr %gep.1 + %i.inc = add nuw nsw i64 %i, 1 + %offset.0.next = add nsw i64 %offset.0, %step + %offset.1.next = add nsw i64 %offset.1, 3 + %ec = icmp eq i64 %i.inc, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll new file mode 100644 index 0000000000000..bf0fafcbfd6c9 --- /dev/null +++ b/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -disable-output "-passes=print" 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ALL +; RUN: opt < %s -disable-output "-passes=print" -da-enable-dependence-test=strong-siv 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-STRONG-SIV + +; for (i = 0; i < (1LL << 62); i++) { +; if (0 <= 2*i - 2) +; A[2*i - 2] = 1; +; +; if (0 <= 2*i - 4) +; A[2*i - 4] = 2; +; } +; +; FIXME: DependenceAnalysis currently detects no dependency between the two +; stores, but it does exist. For example, each store will access A[0] when i +; is 1 and 2 respectively. +; The root cause is that the product of the BTC and the coefficient +; ((1LL << 62) - 1 and 2) overflows in a signed sense. +define void @strongsiv_const_ovfl(ptr %A) { +; CHECK-LABEL: 'strongsiv_const_ovfl' +; CHECK-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 +; CHECK-NEXT: da analyze - none! +; CHECK-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-NEXT: da analyze - none! +; CHECK-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-NEXT: da analyze - none! 
+; +entry: + br label %loop.header + +loop.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.latch ] + %offset.0 = phi i64 [ -2, %entry ], [ %offset.0.next, %loop.latch ] + %offset.1 = phi i64 [ -4, %entry ], [ %offset.1.next, %loop.latch ] + %ec = icmp eq i64 %i, 4611686018427387904 + br i1 %ec, label %exit, label %loop.body + +loop.body: + %cond.0 = icmp sge i64 %offset.0, 0 + %cond.1 = icmp sge i64 %offset.1, 0 + br i1 %cond.0, label %if.then.0, label %loop.middle + +if.then.0: + %gep.0 = getelementptr inbounds i8, ptr %A, i64 %offset.0 + store i8 1, ptr %gep.0 + br label %loop.middle + +loop.middle: + br i1 %cond.1, label %if.then.1, label %loop.latch + +if.then.1: + %gep.1 = getelementptr inbounds i8, ptr %A, i64 %offset.1 + store i8 2, ptr %gep.1 + br label %loop.latch + +loop.latch: + %i.inc = add nuw nsw i64 %i, 1 + %offset.0.next = add nsw i64 %offset.0, 2 + %offset.1.next = add nsw i64 %offset.1, 2 + br label %loop.header + +exit: + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK-ALL: {{.*}} +; CHECK-STRONG-SIV: {{.*}} diff --git a/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll new file mode 100644 index 0000000000000..c5ff9884a0c62 --- /dev/null +++ b/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -disable-output "-passes=print" 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ALL +; RUN: opt < %s -disable-output "-passes=print" -da-enable-dependence-test=symbolic-rdiv 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-SYMBOLIC-RDIV + +; for (i = 0; i < (1LL << 62); i++) { +; if (0 <= 2*i - 2) +; A[2*i - 2] = 1; +; A[i] = 2; +; } +; +; FIXME: DependenceAnalysis currently detects no dependency between the two +; stores, but it does exist. For example, each store will access A[0] when i +; is 1 and 0 respectively. +; The root cause is that the product of the BTC and the coefficient +; ((1LL << 62) - 1 and 2) overflows in a signed sense. +define void @symbolicrdiv_prod_ovfl(ptr %A) { +; CHECK-ALL-LABEL: 'symbolicrdiv_prod_ovfl' +; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; +; CHECK-SYMBOLIC-RDIV-LABEL: 'symbolicrdiv_prod_ovfl' +; CHECK-SYMBOLIC-RDIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 +; CHECK-SYMBOLIC-RDIV-NEXT: da analyze - none! 
+; CHECK-SYMBOLIC-RDIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-SYMBOLIC-RDIV-NEXT: da analyze - none! +; CHECK-SYMBOLIC-RDIV-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-SYMBOLIC-RDIV-NEXT: da analyze - consistent output [*]! +; +entry: + br label %loop.header + +loop.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.latch ] + %offset = phi i64 [ -2, %entry ], [ %offset.next, %loop.latch ] + %ec = icmp eq i64 %i, 4611686018427387904 + br i1 %ec, label %exit, label %loop.body + +loop.body: + %cond = icmp sge i64 %offset, 0 + br i1 %cond, label %if.then, label %loop.latch + +if.then: + %gep.0 = getelementptr inbounds i8, ptr %A, i64 %offset + store i8 1, ptr %gep.0 + br label %loop.latch + +loop.latch: + %gep.1 = getelementptr inbounds i8, ptr %A, i64 %i + store i8 2, ptr %gep.1 + %i.inc = add nuw nsw i64 %i, 1 + %offset.next = add nsw i64 %offset, 2 + br label %loop.header + +exit: + ret void +} + +; offset0 = -4611686018427387904; // -2^62 +; offset1 = 4611686018427387904; // 2^62 +; for (i = 0; i < (1LL << 62) - 100; i++) { +; if (0 <= offset0) +; A[offset0] = 1; +; if (0 <= offset1) +; A[offset1] = 2; +; offset0 += 2; +; offset1 -= 1; +; } +; +; FIXME: DependenceAnalysis currently detects no dependency between the two +; stores, but it does exist. For example, +; +; memory access | i == 2^61 | i == 2^61 + 2^59 | i == 2^61 + 2^60 +; -------------------------|-----------|------------------|------------------- +; A[2*i - 2^62] (offset0) | | A[2^60] | A[2^61] +; A[-i + 2^62] (offset1) | A[2^61] | | A[2^60] +; +; The root cause is that the calculation of the differenct between the two +; constants (-2^62 and 2^62) overflows in a signed sense. 
+define void @symbolicrdiv_delta_ovfl(ptr %A) { +; CHECK-ALL-LABEL: 'symbolicrdiv_delta_ovfl' +; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; +; CHECK-SYMBOLIC-RDIV-LABEL: 'symbolicrdiv_delta_ovfl' +; CHECK-SYMBOLIC-RDIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 +; CHECK-SYMBOLIC-RDIV-NEXT: da analyze - consistent output [*]! +; CHECK-SYMBOLIC-RDIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-SYMBOLIC-RDIV-NEXT: da analyze - none! +; CHECK-SYMBOLIC-RDIV-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-SYMBOLIC-RDIV-NEXT: da analyze - consistent output [*]! 
+; +entry: + br label %loop.header + +loop.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.latch ] + %offset.0 = phi i64 [ -4611686018427387904, %entry ], [ %offset.0.next, %loop.latch ] + %offset.1 = phi i64 [ 4611686018427387904, %entry ], [ %offset.1.next, %loop.latch ] + %cond.0 = icmp sge i64 %offset.0, 0 + %cond.1 = icmp sge i64 %offset.1, 0 + br i1 %cond.0, label %if.then.0, label %loop.middle + +if.then.0: + %gep.0 = getelementptr inbounds i8, ptr %A, i64 %offset.0 + store i8 1, ptr %gep.0 + br label %loop.middle + +loop.middle: + br i1 %cond.1, label %if.then.1, label %loop.latch + +if.then.1: + %gep.1 = getelementptr inbounds i8, ptr %A, i64 %offset.1 + store i8 2, ptr %gep.1 + br label %loop.latch + +loop.latch: + %i.inc = add nuw nsw i64 %i, 1 + %offset.0.next = add nsw i64 %offset.0, 2 + %offset.1.next = sub nsw i64 %offset.1, 1 + %ec = icmp eq i64 %i.inc, 4611686018427387804 ; 2^62 - 100 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Analysis/DependenceAnalysis/weak-crossing-siv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/weak-crossing-siv-overflow.ll new file mode 100644 index 0000000000000..ba57c7bf5736a --- /dev/null +++ b/llvm/test/Analysis/DependenceAnalysis/weak-crossing-siv-overflow.ll @@ -0,0 +1,125 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -disable-output "-passes=print" 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ALL +; RUN: opt < %s -disable-output "-passes=print" -da-enable-dependence-test=weak-crossing-siv 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-WEAK-CROSSING-SIV + +; max_i = INT64_MAX/3 // 3074457345618258602 +; for (long long i = 0; i <= max_i; i++) { +; A[-3*i + INT64_MAX] = 0; +; if (i) +; A[3*i - 2] = 1; +; } +; +; FIXME: DependenceAnalysis currently detects no dependency between +; `A[-3*i + INT64_MAX]` and `A[3*i - 2]`, but it does exist. For example, +; +; memory access | i == 1 | i == max_i +; ---------------------|------------------|------------------ +; A[-3*i + INT64_MAX] | A[INT64_MAX - 3] | A[1] +; A[3*i - 2] | A[1] | A[INT64_MAX - 3] +; +; The root cause is that the calculation of the differenct between the two +; constants (INT64_MAX and -2) triggers an overflow. + +define void @weakcorssing_delta_ovfl(ptr %A) { +; CHECK-ALL-LABEL: 'weakcorssing_delta_ovfl' +; CHECK-ALL-NEXT: Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 0, ptr %idx.0, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.1, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 1, ptr %idx.1, align 1 --> Dst: store i8 1, ptr %idx.1, align 1 +; CHECK-ALL-NEXT: da analyze - none! 
+; +; CHECK-WEAK-CROSSING-SIV-LABEL: 'weakcorssing_delta_ovfl' +; CHECK-WEAK-CROSSING-SIV-NEXT: Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 0, ptr %idx.0, align 1 +; CHECK-WEAK-CROSSING-SIV-NEXT: da analyze - consistent output [*]! +; CHECK-WEAK-CROSSING-SIV-NEXT: Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.1, align 1 +; CHECK-WEAK-CROSSING-SIV-NEXT: da analyze - none! +; CHECK-WEAK-CROSSING-SIV-NEXT: Src: store i8 1, ptr %idx.1, align 1 --> Dst: store i8 1, ptr %idx.1, align 1 +; CHECK-WEAK-CROSSING-SIV-NEXT: da analyze - consistent output [*]! +; +entry: + br label %loop.header + +loop.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.latch ] + %subscript.0 = phi i64 [ 9223372036854775807, %entry ], [ %subscript.0.next, %loop.latch ] + %subscript.1 = phi i64 [ -2, %entry ], [ %subscript.1.next, %loop.latch ] + %idx.0 = getelementptr inbounds i8, ptr %A, i64 %subscript.0 + store i8 0, ptr %idx.0 + %cond.store = icmp ne i64 %i, 0 + br i1 %cond.store, label %if.store, label %loop.latch + +if.store: + %idx.1 = getelementptr inbounds i8, ptr %A, i64 %subscript.1 + store i8 1, ptr %idx.1 + br label %loop.latch + +loop.latch: + %i.inc = add nuw nsw i64 %i, 1 + %subscript.0.next = add nsw i64 %subscript.0, -3 + %subscript.1.next = add nsw i64 %subscript.1, 3 + %ec = icmp sgt i64 %i.inc, 3074457345618258602 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +; max_i = INT64_MAX/3 // 3074457345618258602 +; for (long long i = 0; i <= max_i; i++) { +; A[-3*i + INT64_MAX] = 0; +; A[3*i + 1] = 1; +; } +; +; FIXME: DependenceAnalysis currently detects no dependency between +; `A[-3*i + INT64_MAX]` and `A[3*i - 2]`, but it does exist. 
For example, +; +; memory access | i == 0 | i == 1 | i == max_i - 1 | i == max_i +; ---------------------|--------|------------------|----------------|------------------ +; A[-3*i + INT64_MAX] | | A[INT64_MAX - 3] | A[1] | +; A[3*i + 1] | A[1] | | | A[INT64_MAX - 3] +; +; The root cause is that the product of the BTC, the coefficient, and 2 +; triggers an overflow. +; +define void @weakcorssing_prod_ovfl(ptr %A) { +; CHECK-ALL-LABEL: 'weakcorssing_prod_ovfl' +; CHECK-ALL-NEXT: Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 0, ptr %idx.0, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.1, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 1, ptr %idx.1, align 1 --> Dst: store i8 1, ptr %idx.1, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; +; CHECK-WEAK-CROSSING-SIV-LABEL: 'weakcorssing_prod_ovfl' +; CHECK-WEAK-CROSSING-SIV-NEXT: Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 0, ptr %idx.0, align 1 +; CHECK-WEAK-CROSSING-SIV-NEXT: da analyze - consistent output [*]! +; CHECK-WEAK-CROSSING-SIV-NEXT: Src: store i8 0, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.1, align 1 +; CHECK-WEAK-CROSSING-SIV-NEXT: da analyze - none! +; CHECK-WEAK-CROSSING-SIV-NEXT: Src: store i8 1, ptr %idx.1, align 1 --> Dst: store i8 1, ptr %idx.1, align 1 +; CHECK-WEAK-CROSSING-SIV-NEXT: da analyze - consistent output [*]! 
+; +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ] + %subscript.0 = phi i64 [ 9223372036854775807, %entry ], [ %subscript.0.next, %loop ] + %subscript.1 = phi i64 [ 1, %entry ], [ %subscript.1.next, %loop ] + %idx.0 = getelementptr inbounds i8, ptr %A, i64 %subscript.0 + %idx.1 = getelementptr inbounds i8, ptr %A, i64 %subscript.1 + store i8 0, ptr %idx.0 + store i8 1, ptr %idx.1 + %i.inc = add nuw nsw i64 %i, 1 + %subscript.0.next = add nsw i64 %subscript.0, -3 + %subscript.1.next = add nsw i64 %subscript.1, 3 + %ec = icmp sgt i64 %i.inc, 3074457345618258602 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Analysis/DependenceAnalysis/weak-zero-siv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/weak-zero-siv-overflow.ll new file mode 100644 index 0000000000000..6317c387858d3 --- /dev/null +++ b/llvm/test/Analysis/DependenceAnalysis/weak-zero-siv-overflow.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -disable-output "-passes=print" 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ALL +; RUN: opt < %s -disable-output "-passes=print" -da-enable-dependence-test=weak-zero-siv 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-WEAK-ZERO-SIV + +; for (i = 0; i < (1LL << 62); i++) { +; if (0 <= 2*i - 2) +; A[2*i - 2] = 1; +; A[2] = 2; +; } +; +; FIXME: DependenceAnalysis currently detects no dependency between the two +; stores, but it does exist. The root cause is that the product of the BTC and +; the coefficient ((1LL << 62) - 1 and 2) overflows in a signed sense. 
+; +define void @weakzero_dst_siv_prod_ovfl(ptr %A) { +; CHECK-ALL-LABEL: 'weakzero_dst_siv_prod_ovfl' +; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-ALL-NEXT: da analyze - consistent output [S]! +; +; CHECK-WEAK-ZERO-SIV-LABEL: 'weakzero_dst_siv_prod_ovfl' +; CHECK-WEAK-ZERO-SIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 +; CHECK-WEAK-ZERO-SIV-NEXT: da analyze - consistent output [*]! +; CHECK-WEAK-ZERO-SIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-WEAK-ZERO-SIV-NEXT: da analyze - none! +; CHECK-WEAK-ZERO-SIV-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-WEAK-ZERO-SIV-NEXT: da analyze - consistent output [S]! +; +entry: + br label %loop.header + +loop.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.latch ] + %offset = phi i64 [ -2, %entry ], [ %offset.next, %loop.latch ] + %ec = icmp eq i64 %i, 4611686018427387904 + br i1 %ec, label %exit, label %loop.body + +loop.body: + %cond = icmp sge i64 %offset, 0 + br i1 %cond, label %if.then, label %loop.latch + +if.then: + %gep.0 = getelementptr inbounds i8, ptr %A, i64 %offset + store i8 1, ptr %gep.0 + br label %loop.latch + +loop.latch: + %gep.1 = getelementptr inbounds i8, ptr %A, i64 2 + store i8 2, ptr %gep.1 + %i.inc = add nuw nsw i64 %i, 1 + %offset.next = add nsw i64 %offset, 2 + br label %loop.header + +exit: + ret void +} + +; for (i = 0; i < n; i++) { +; if (0 <= 2*i - 1) +; A[2*i - 1] = 1; +; A[INT64_MAX] = 2; +; } +; +; FIXME: DependenceAnalysis currently detects no dependency between the two +; stores, but it does exist. 
When `%n` is 2^62, the value of `%offset` will be +; the same as INT64_MAX at the last iteration. +; The root cause is that the calculation of the difference between the two +; constants (INT64_MAX and -1) overflows in a signed sense. +; +define void @weakzero_dst_siv_delta_ovfl(ptr %A, i64 %n) { +; CHECK-ALL-LABEL: 'weakzero_dst_siv_delta_ovfl' +; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-ALL-NEXT: da analyze - consistent output [S]! +; +; CHECK-WEAK-ZERO-SIV-LABEL: 'weakzero_dst_siv_delta_ovfl' +; CHECK-WEAK-ZERO-SIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 +; CHECK-WEAK-ZERO-SIV-NEXT: da analyze - consistent output [*]! +; CHECK-WEAK-ZERO-SIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-WEAK-ZERO-SIV-NEXT: da analyze - none! +; CHECK-WEAK-ZERO-SIV-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-WEAK-ZERO-SIV-NEXT: da analyze - consistent output [S]! 
+; +entry: + %guard = icmp sgt i64 %n, 0 + br i1 %guard, label %loop.header, label %exit + +loop.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.latch ] + %offset = phi i64 [ -2, %entry ], [ %offset.next, %loop.latch ] + %ec = icmp eq i64 %i, %n + br i1 %ec, label %exit, label %loop.body + +loop.body: + %cond = icmp sge i64 %offset, 0 + br i1 %cond, label %if.then, label %loop.latch + +if.then: + %gep.0 = getelementptr inbounds i8, ptr %A, i64 %offset + store i8 1, ptr %gep.0 + br label %loop.latch + +loop.latch: + %gep.1 = getelementptr inbounds i8, ptr %A, i64 9223372036854775807 + store i8 2, ptr %gep.1 + %i.inc = add nuw nsw i64 %i, 1 + %offset.next = add nsw i64 %offset, 2 + br label %loop.header + +exit: + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} From 5b7b69699e667f97868f8b59757b0b7c848b0d62 Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Thu, 30 Oct 2025 19:23:53 +0800 Subject: [PATCH 179/539] [LoongArch][NFC] Pre-commit tests for vector type average (#161076) --- .../LoongArch/lasx/ir-instruction/avg.ll | 307 ++++++++++++++++++ .../LoongArch/lsx/ir-instruction/avg.ll | 307 ++++++++++++++++++ 2 files changed, 614 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll new file mode 100644 index 0000000000000..2a5a8fa05d646 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll @@ -0,0 +1,307 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @xvavg_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_b: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.b $xr0, $xr0, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %add = add <32 x i8> %va, %vb + %shr = ashr <32 x i8> %add, + store <32 x i8> %shr, ptr %res + ret void +} + +define void @xvavg_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.h $xr0, $xr0, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %add = add <16 x i16> %va, %vb + %shr = ashr <16 x i16> %add, + store <16 x i16> %shr, ptr %res + ret void +} + +define void @xvavg_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.w $xr0, $xr0, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %add = add <8 x i32> %va, %vb + %shr = ashr <8 x i32> %add, + store <8 x i32> %shr, ptr %res + ret void +} + +define void @xvavg_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.d $xr0, $xr0, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %add = add <4 x i64> %va, %vb + %shr = ashr <4 x i64> %add, + store <4 x i64> %shr, ptr %res + ret void +} + +define void @xvavg_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_bu: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %add = add <32 x i8> %va, %vb + %shr = lshr <32 x i8> %add, + store <32 x i8> %shr, ptr %res + ret void +} + +define void @xvavg_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.h $xr0, $xr0, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %add = add <16 x i16> %va, %vb + %shr = lshr <16 x i16> %add, + store <16 x i16> %shr, ptr %res + ret void +} + +define void @xvavg_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.w $xr0, $xr0, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %add = add <8 x i32> %va, %vb + %shr = lshr <8 x i32> %add, + store <8 x i32> %shr, ptr %res + ret void +} + +define void @xvavg_du(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.d $xr0, $xr0, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %add = add <4 x i64> %va, %vb + %shr = lshr <4 x i64> %add, + store <4 x i64> %shr, ptr %res + ret void +} + +define void @xvavgr_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 
+; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvaddi.bu $xr0, $xr0, 1 +; CHECK-NEXT: xvsrai.b $xr0, $xr0, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %add = add <32 x i8> %va, %vb + %add1 = add <32 x i8> %add, + %shr = ashr <32 x i8> %add1, + store <32 x i8> %shr, ptr %res + ret void +} + +define void @xvavgr_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvaddi.hu $xr0, $xr0, 1 +; CHECK-NEXT: xvsrai.h $xr0, $xr0, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %add = add <16 x i16> %va, %vb + %add1 = add <16 x i16> %add, + %shr = ashr <16 x i16> %add1, + store <16 x i16> %shr, ptr %res + ret void +} + +define void @xvavgr_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvaddi.wu $xr0, $xr0, 1 +; CHECK-NEXT: xvsrai.w $xr0, $xr0, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %add = add <8 x i32> %va, %vb + %add1 = add <8 x i32> %add, + %shr = ashr <8 x i32> %add1, + store <8 x i32> %shr, ptr %res + ret void +} + +define void @xvavgr_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvaddi.du $xr0, $xr0, 1 +; CHECK-NEXT: xvsrai.d $xr0, $xr0, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %add = add <4 x i64> %va, %vb + 
%add1 = add <4 x i64> %add, + %shr = ashr <4 x i64> %add1, + store <4 x i64> %shr, ptr %res + ret void +} + +define void @xvavgr_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvaddi.bu $xr0, $xr0, 1 +; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %add = add <32 x i8> %va, %vb + %add1 = add <32 x i8> %add, + %shr = lshr <32 x i8> %add1, + store <32 x i8> %shr, ptr %res + ret void +} + +define void @xvavgr_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvaddi.hu $xr0, $xr0, 1 +; CHECK-NEXT: xvsrli.h $xr0, $xr0, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %add = add <16 x i16> %va, %vb + %add1 = add <16 x i16> %add, + %shr = lshr <16 x i16> %add1, + store <16 x i16> %shr, ptr %res + ret void +} + +define void @xvavgr_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvaddi.wu $xr0, $xr0, 1 +; CHECK-NEXT: xvsrli.w $xr0, $xr0, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %add = add <8 x i32> %va, %vb + %add1 = add <8 x i32> %add, + %shr = lshr <8 x i32> %add1, + store <8 x i32> %shr, ptr %res + ret void +} + +define void @xvavgr_du(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: 
xvadd.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvaddi.du $xr0, $xr0, 1 +; CHECK-NEXT: xvsrli.d $xr0, $xr0, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %add = add <4 x i64> %va, %vb + %add1 = add <4 x i64> %add, + %shr = lshr <4 x i64> %add1, + store <4 x i64> %shr, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll new file mode 100644 index 0000000000000..20b8898436cc4 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll @@ -0,0 +1,307 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @vavg_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.b $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %add = add <16 x i8> %va, %vb + %shr = ashr <16 x i8> %add, + store <16 x i8> %shr, ptr %res + ret void +} + +define void @vavg_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.h $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %add = add <8 x i16> %va, %vb + %shr = ashr <8 x i16> %add, + store <8 x i16> %shr, ptr %res + ret void +} + +define void @vavg_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: 
vld $vr1, $a2, 0 +; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.w $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %add = add <4 x i32> %va, %vb + %shr = ashr <4 x i32> %add, + store <4 x i32> %shr, ptr %res + ret void +} + +define void @vavg_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.d $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %add = add <2 x i64> %va, %vb + %shr = ashr <2 x i64> %add, + store <2 x i64> %shr, ptr %res + ret void +} + +define void @vavg_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.b $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %add = add <16 x i8> %va, %vb + %shr = lshr <16 x i8> %add, + store <16 x i8> %shr, ptr %res + ret void +} + +define void @vavg_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.h $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %add = add <8 x i16> %va, %vb + %shr = lshr <8 x i16> %add, + store <8 x i16> %shr, ptr %res + ret void +} + +define void @vavg_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 +; 
CHECK-NEXT: vsrli.w $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %add = add <4 x i32> %va, %vb + %shr = lshr <4 x i32> %add, + store <4 x i32> %shr, ptr %res + ret void +} + +define void @vavg_du(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.d $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %add = add <2 x i64> %va, %vb + %shr = lshr <2 x i64> %add, + store <2 x i64> %shr, ptr %res + ret void +} + +define void @vavgr_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vaddi.bu $vr0, $vr0, 1 +; CHECK-NEXT: vsrai.b $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %add = add <16 x i8> %va, %vb + %add1 = add <16 x i8> %add, + %shr = ashr <16 x i8> %add1, + store <16 x i8> %shr, ptr %res + ret void +} + +define void @vavgr_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vaddi.hu $vr0, $vr0, 1 +; CHECK-NEXT: vsrai.h $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %add = add <8 x i16> %va, %vb + %add1 = add <8 x i16> %add, + %shr = ashr <8 x i16> %add1, + store <8 x i16> %shr, ptr %res + ret void +} + +define void @vavgr_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 
+; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vaddi.wu $vr0, $vr0, 1 +; CHECK-NEXT: vsrai.w $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %add = add <4 x i32> %va, %vb + %add1 = add <4 x i32> %add, + %shr = ashr <4 x i32> %add1, + store <4 x i32> %shr, ptr %res + ret void +} + +define void @vavgr_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vaddi.du $vr0, $vr0, 1 +; CHECK-NEXT: vsrai.d $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %add = add <2 x i64> %va, %vb + %add1 = add <2 x i64> %add, + %shr = ashr <2 x i64> %add1, + store <2 x i64> %shr, ptr %res + ret void +} + +define void @vavgr_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vaddi.bu $vr0, $vr0, 1 +; CHECK-NEXT: vsrli.b $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %add = add <16 x i8> %va, %vb + %add1 = add <16 x i8> %add, + %shr = lshr <16 x i8> %add1, + store <16 x i8> %shr, ptr %res + ret void +} + +define void @vavgr_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vaddi.hu $vr0, $vr0, 1 +; CHECK-NEXT: vsrli.h $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %add = add <8 x i16> %va, %vb + %add1 = add <8 x i16> %add, + %shr 
= lshr <8 x i16> %add1, + store <8 x i16> %shr, ptr %res + ret void +} + +define void @vavgr_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vaddi.wu $vr0, $vr0, 1 +; CHECK-NEXT: vsrli.w $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %add = add <4 x i32> %va, %vb + %add1 = add <4 x i32> %add, + %shr = lshr <4 x i32> %add1, + store <4 x i32> %shr, ptr %res + ret void +} + +define void @vavgr_du(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vaddi.du $vr0, $vr0, 1 +; CHECK-NEXT: vsrli.d $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %add = add <2 x i64> %va, %vb + %add1 = add <2 x i64> %add, + %shr = lshr <2 x i64> %add1, + store <2 x i64> %shr, ptr %res + ret void +} From 23af488a5dfb8d61ea255ef3825f94da84e60e6d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 30 Oct 2025 12:10:00 +0000 Subject: [PATCH 180/539] [X86] Add ldexp test coverage for avx512 targets (#165698) Pulled out of the abandoned patch #69710 to act as a baseline for #165694 --- llvm/test/CodeGen/X86/ldexp-avx512.ll | 467 ++++++++++++++++++++++++++ 1 file changed, 467 insertions(+) create mode 100644 llvm/test/CodeGen/X86/ldexp-avx512.ll diff --git a/llvm/test/CodeGen/X86/ldexp-avx512.ll b/llvm/test/CodeGen/X86/ldexp-avx512.ll new file mode 100644 index 0000000000000..ea93a911a1ad0 --- /dev/null +++ b/llvm/test/CodeGen/X86/ldexp-avx512.ll @@ -0,0 +1,467 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc < %s -mtriple=x86_64-- 
-mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL + +define half @test_half(half %x, i32 %exp) nounwind { +; CHECK-LABEL: test_half: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq +entry: + %r = tail call fast half @llvm.ldexp.f16.i32(half %x, i32 %exp) + ret half %r +} +declare half @llvm.ldexp.f16.i32(half, i32) memory(none) + +define float @test_float(float %x, i32 %exp) nounwind { +; CHECK-LABEL: test_float: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: jmp ldexpf@PLT # TAILCALL +entry: + %r = tail call fast float @ldexpf(float %x, i32 %exp) + ret float %r +} +declare float @ldexpf(float, i32) memory(none) + +define double @test_double(double %x, i32 %exp) nounwind { +; CHECK-LABEL: test_double: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: jmp ldexp@PLT # TAILCALL +entry: + %r = tail call fast double @ldexp(double %x, i32 %exp) + ret double %r +} +declare double @ldexp(double, i32) memory(none) + +define fp128 @testExpl(fp128 %x, i32 %exp) nounwind { +; CHECK-LABEL: testExpl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: jmp ldexpl@PLT # TAILCALL +entry: + %r = tail call fast fp128 @ldexpl(fp128 %x, i32 %exp) + ret fp128 %r +} +declare fp128 @ldexpl(fp128, i32) memory(none) + +define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind { +; CHECK-LABEL: test_ldexp_4xfloat: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovd %xmm1, %edi +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $1, 
%xmm0, %edi +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $2, %xmm0, %edi +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $3, %xmm0, %edi +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: retq + %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %x, <4 x i32> %exp) + ret <4 x float> %r +} +declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>) + +define <2 x double> @test_ldexp_2xdouble(<2 x double> %x, <2 x i32> %exp) nounwind { +; CHECK-LABEL: test_ldexp_2xdouble: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovd %xmm1, %edi +; CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $1, %xmm0, %edi +; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; 
CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: retq + %r = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> %x, <2 x i32> %exp) + ret <2 x double> %r +} +declare <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>) + +define <8 x float> @test_ldexp_8xfloat(<8 x float> %x, <8 x i32> %exp) nounwind { +; CHECK-LABEL: test_ldexp_8xfloat: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $120, %rsp +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 +; CHECK-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovd %xmm1, %edi +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $1, %xmm0, %edi +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $2, %xmm0, %edi +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $3, %xmm0, %edi +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovd %xmm0, %edi +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractps $1, %xmm0, %edi +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractps $2, %xmm0, %edi +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractps $3, %xmm0, %edi +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = 
mem[3,3,3,3] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: addq $120, %rsp +; CHECK-NEXT: retq + %r = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> %x, <8 x i32> %exp) + ret <8 x float> %r +} +declare <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float>, <8 x i32>) + +define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) nounwind { +; CHECK-LABEL: test_ldexp_4xdouble: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $88, %rsp +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vextractps $2, %xmm1, %edi +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $3, %xmm0, %edi +; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovd %xmm0, %edi +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $1, %xmm0, %edi +; 
CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: addq $88, %rsp +; CHECK-NEXT: retq + %r = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> %x, <4 x i32> %exp) + ret <4 x double> %r +} +declare <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>) + +define <16 x float> @test_ldexp_16xfloat(<16 x float> %x, <16 x i32> %exp) nounwind { +; CHECK-LABEL: test_ldexp_16xfloat: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $216, %rsp +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; CHECK-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovd %xmm1, %edi +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $1, %xmm0, %edi +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $2, %xmm0, %edi +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $3, %xmm0, %edi +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovd %xmm0, %edi +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $1, %xmm0, %edi +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $2, %xmm0, %edi +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: 
vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $3, %xmm0, %edi +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovd %xmm0, %edi +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $1, %xmm0, %edi +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $2, %xmm0, %edi +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $3, %xmm0, %edi +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vmovd %xmm0, %edi +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vextractps $1, %xmm0, %edi +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vextractps $2, %xmm0, %edi +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vextractps $3, %xmm0, %edi +; 
CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; CHECK-NEXT: addq $216, %rsp +; CHECK-NEXT: retq + %r = call <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> %x, <16 x i32> %exp) + ret <16 x float> %r +} +declare <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float>, <16 x i32>) + +define <8 x double> @test_ldexp_8xdouble(<8 x double> %x, <8 x i32> %exp) nounwind { +; CHECK-LABEL: test_ldexp_8xdouble: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $184, %rsp +; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1 +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vextractps $2, %xmm1, %edi +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $3, %xmm0, %edi +; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 +; CHECK-NEXT: 
vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovd %xmm0, %edi +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vextractps $1, %xmm0, %edi +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractps $2, %xmm0, %edi +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractps $3, %xmm0, %edi +; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovd %xmm0, %edi +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vzeroupper +; 
CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractps $1, %xmm0, %edi +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; CHECK-NEXT: addq $184, %rsp +; CHECK-NEXT: retq + %r = call <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> %x, <8 x i32> %exp) + ret <8 x double> %r +} +declare <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double>, <8 x i32>) + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX512: {{.*}} +; AVX512VL: {{.*}} From fdac55eeb955672607fbe8ecd03441967a44677b Mon Sep 17 00:00:00 2001 From: Mads Marquart Date: Thu, 30 Oct 2025 13:18:00 +0100 Subject: [PATCH 181/539] [llvm-cxxfilt] update docs to reflect #106233 (#165709) It looks like the documentation for `llvm-cxxfilt`'s `--[no-]strip-underscore` options weren't updated when https://github.com/llvm/llvm-project/pull/106233 was made. CC @Michael137 (I don't have merge rights myself). --- llvm/docs/CommandGuide/llvm-cxxfilt.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/docs/CommandGuide/llvm-cxxfilt.rst b/llvm/docs/CommandGuide/llvm-cxxfilt.rst index 8c61cedd9b70b..8e509cec6ce02 100644 --- a/llvm/docs/CommandGuide/llvm-cxxfilt.rst +++ b/llvm/docs/CommandGuide/llvm-cxxfilt.rst @@ -54,8 +54,7 @@ OPTIONS .. option:: --no-strip-underscore, -n - Do not strip a leading underscore. This is the default for all platforms - except Mach-O based hosts. 
+ Do not strip a leading underscore. This is the default for all platforms. .. option:: --quote @@ -64,7 +63,7 @@ OPTIONS .. option:: --strip-underscore, -_ Strip a single leading underscore, if present, from each input name before - demangling. On by default on Mach-O based platforms. + demangling. .. option:: --types, -t From 43c023f34a6359b30af0c61f1deb9561762432d6 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 30 Oct 2025 12:30:39 +0000 Subject: [PATCH 182/539] [X86] combinePTESTCC - ensure repeated operands are frozen (#165697) As noticed on #165676 - if we're increasing the use of an operand we should freeze it --- llvm/lib/Target/X86/X86ISelLowering.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 624cff24ddf03..0914338672206 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48778,10 +48778,9 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SDValue BC0 = peekThroughBitcasts(Op0); if (BC0.getOpcode() == X86ISD::PCMPEQ && ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) { - SDLoc DL(EFLAGS); CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE); - SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0)); - return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X); + SDValue X = DAG.getBitcast(OpVT, DAG.getFreeze(BC0.getOperand(0))); + return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, X, X); } } } @@ -48837,7 +48836,7 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, MVT FloatSVT = MVT::getFloatingPointVT(EltBits); MVT FloatVT = MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits); - Res = DAG.getBitcast(FloatVT, Res); + Res = DAG.getBitcast(FloatVT, DAG.getFreeze(Res)); return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res); } else if (EltBits == 16) { MVT MovmskVT = BCVT.is128BitVector() ? 
MVT::v16i8 : MVT::v32i8; @@ -48856,8 +48855,10 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, } // TESTZ(X,-1) == TESTZ(X,X) - if (ISD::isBuildVectorAllOnes(Op1.getNode())) + if (ISD::isBuildVectorAllOnes(Op1.getNode())) { + Op0 = DAG.getFreeze(Op0); return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0); + } // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y) // TODO: Add COND_NE handling? From c6e882a1464810cc78bca396b13673e9a27581c8 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 30 Oct 2025 12:35:39 +0000 Subject: [PATCH 183/539] [X86] Narrow BT/BTC/BTR/BTS compare + RMW patterns on very large integers (#165540) This patch allows us to narrow single bit-test/twiddle operations for larger than legal scalar integers to efficiently operate just on the i32 sub-integer block actually affected. The BITOP(X,SHL(1,IDX)) patterns are split, with the IDX used to access the specific i32 block as well as specific bit within that block. BT comparisons are relatively simple, and builds on the truncated shifted loads fold from #165266. BTC/BTR/BTS bit twiddling patterns need to match the entire RMW pattern to safely confirm only one block is affected, but a similar approach is taken and creates codegen that should allow us to further merge with matching BT opcodes in a future patch (see #165291). The resulting codegen is notably more efficient than the heavily micro-coded memory folded variants of BT/BTC/BTR/BTS. There is still some work to improve the bit insert 'init' patterns included in bittest-big-integer.ll but I'm expecting this to be a straightforward future extension. 
Fixes #164225 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 114 +- llvm/test/CodeGen/X86/bittest-big-integer.ll | 7197 +++--------------- 2 files changed, 1036 insertions(+), 6275 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 0914338672206..f5b8b58c18e1e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53481,6 +53481,80 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Look for a RMW operation that only touches one bit of a larger than legal +// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value. +static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; + + // Only handle normal stores and its chain was a matching normal load. + auto *Ld = dyn_cast(St->getChain()); + if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld || + !ISD::isNormalLoad(Ld) || !Ld->isSimple() || + Ld->getBasePtr() != St->getBasePtr() || + Ld->getOffset() != St->getOffset()) + return SDValue(); + + SDValue LoadVal(Ld, 0); + SDValue StoredVal = St->getValue(); + EVT VT = StoredVal.getValueType(); + + // Only narrow larger than legal scalar integers. + if (!VT.isScalarInteger() || + VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32)) + return SDValue(); + + // BTR: X & ~(1 << ShAmt) + // BTS: X | (1 << ShAmt) + // BTC: X ^ (1 << ShAmt) + SDValue ShAmt; + if (!StoredVal.hasOneUse() || + !(sd_match(StoredVal, m_And(m_Specific(LoadVal), + m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || + sd_match(StoredVal, + m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match(StoredVal, + m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))))) + return SDValue(); + + // Ensure the shift amount is in bounds. 
+ KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) + return SDValue(); + + // Split the shift into an alignment shift that moves the active i32 block to + // the bottom bits for truncation and a modulo shift that can act on the i32. + EVT AmtVT = ShAmt.getValueType(); + SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getSignedConstant(-32LL, DL, AmtVT)); + SDValue ModuloAmt = + DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); + + // Compute the byte offset for the i32 block that is changed by the RMW. + // combineTruncate will adjust the load for us in a similar way. + EVT PtrVT = St->getBasePtr().getValueType(); + SDValue PtrBitOfs = DAG.getZExtOrTrunc(AlignAmt, DL, PtrVT); + SDValue PtrByteOfs = DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs, + DAG.getShiftAmountConstant(3, PtrVT, DL)); + SDValue NewPtr = DAG.getMemBasePlusOffset(St->getBasePtr(), PtrByteOfs, DL, + SDNodeFlags::NoUnsignedWrap); + + // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store. 
+ SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + + SDValue Mask = + DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), + DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); + if (StoredVal.getOpcode() == ISD::AND) + Mask = DAG.getNOT(DL, Mask, MVT::i32); + + SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); + return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), + Align(), St->getMemOperand()->getFlags()); +} + static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -53707,6 +53781,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } } + if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget)) + return R; + // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC) // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC) if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && @@ -54661,8 +54738,9 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, // truncation, see if we can convert the shift into a pointer offset instead. // Limit this to normal (non-ext) scalar integer loads. 
if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL && - Src.hasOneUse() && Src.getOperand(0).hasOneUse() && - ISD::isNormalLoad(Src.getOperand(0).getNode())) { + Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) && + (Src.getOperand(0).hasOneUse() || + !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, SrcVT))) { auto *Ld = cast(Src.getOperand(0)); if (Ld->isSimple() && VT.isByteSized() && isPowerOf2_64(VT.getSizeInBits())) { @@ -56460,6 +56538,7 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; const ISD::CondCode CC = cast(N->getOperand(2))->get(); const SDValue LHS = N->getOperand(0); const SDValue RHS = N->getOperand(1); @@ -56518,6 +56597,37 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, if (SDValue AndN = MatchAndCmpEq(RHS, LHS)) return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC); + // If we're performing a bit test on a larger than legal type, attempt + // to (aligned) shift down the value to the bottom 32-bits and then + // perform the bittest on the i32 value. + // ICMP_ZERO(AND(X,SHL(1,IDX))) + // --> ICMP_ZERO(AND(TRUNC(SRL(X,AND(IDX,-32))),SHL(1,AND(IDX,31)))) + if (isNullConstant(RHS) && + OpVT.getScalarSizeInBits() > (Subtarget.is64Bit() ? 64 : 32)) { + SDValue X, ShAmt; + if (sd_match(LHS, m_OneUse(m_And(m_Value(X), + m_Shl(m_One(), m_Value(ShAmt)))))) { + // Only attempt this if the shift amount is known to be in bounds. 
+ KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + if (KnownAmt.getMaxValue().ult(OpVT.getScalarSizeInBits())) { + EVT AmtVT = ShAmt.getValueType(); + SDValue AlignAmt = + DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getSignedConstant(-32LL, DL, AmtVT)); + SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getConstant(31, DL, AmtVT)); + SDValue Mask = DAG.getNode( + ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), + DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); + X = DAG.getNode(ISD::SRL, DL, OpVT, X, AlignAmt); + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + X = DAG.getNode(ISD::AND, DL, MVT::i32, X, Mask); + return DAG.getSetCC(DL, VT, X, DAG.getConstant(0, DL, MVT::i32), + CC); + } + } + } + // cmpeq(trunc(x),C) --> cmpeq(x,C) // cmpne(trunc(x),C) --> cmpne(x,C) // iff x upper bits are zero. diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 19d751d176b6a..cc3dcf32ac0eb 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -203,24 +203,14 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB5_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: .LBB5_2: -; X86-NEXT: andl 4(%eax), %esi -; X86-NEXT: andl (%eax), %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: setne %al -; X86-NEXT: popl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $32, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; 
X86-NEXT: setb %al ; X86-NEXT: retl ; ; X64-LABEL: test_ne_i64: @@ -242,38 +232,20 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB6_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB6_2: -; X86-NEXT: movl (%edx), %ecx -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: andl %esi, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: andl %eax, %ebp -; X86-NEXT: xorl %esi, %edi -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: setne %al -; X86-NEXT: movl %ecx, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: complement_ne_i64: @@ -300,40 +272,20 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %esi -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shll 
%cl, %esi -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB7_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %esi, %edi -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: .LBB7_2: -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: movl 4(%edx), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: notl %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: andl %esi, %ebp -; X86-NEXT: notl %esi -; X86-NEXT: andl %ecx, %edi -; X86-NEXT: andl %eax, %esi -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: sete %al -; X86-NEXT: movl %esi, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: reset_eq_i64: @@ -361,38 +313,20 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB8_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB8_2: -; X86-NEXT: movl (%edx), %ecx -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: andl %esi, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: andl %eax, %ebp -; X86-NEXT: orl %esi, %edi -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: setne %al -; X86-NEXT: movl %ecx, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: 
andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: set_ne_i64: @@ -419,52 +353,47 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %edx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shll %cl, %esi +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: shll %cl, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB9_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl $0, %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl $0, %edx ; X86-NEXT: .LBB9_2: -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: notl %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: notl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: notl %esi +; X86-NEXT: notl %edx ; X86-NEXT: je .LBB9_4 ; X86-NEXT: # %bb.3: -; X86-NEXT: movl %esi, %edi -; X86-NEXT: xorl %esi, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: .LBB9_4: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: andl %ecx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl (%edi), %ecx -; 
X86-NEXT: andl %ecx, %eax -; X86-NEXT: andl %ecx, %ebp -; X86-NEXT: orl %esi, %ebp -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %ebp, (%edi) -; X86-NEXT: movl %ebx, 4(%edi) -; X86-NEXT: sete %al +; X86-NEXT: andl 4(%ebx), %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: andl (%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $32, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl (%ebx,%eax), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setae %al +; X86-NEXT: movl %esi, 4(%ebx) +; X86-NEXT: movl %edx, (%ebx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i64: @@ -516,101 +445,25 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $48, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, (%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %esi -; X86-NEXT: movl 24(%esp,%esi), %edi -; X86-NEXT: movl 28(%esp,%esi), %eax -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl 16(%esp,%esi), %edx -; X86-NEXT: movl 20(%esp,%esi), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: andl 8(%ebx), %edi -; X86-NEXT: andl (%ebx), %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: andl 12(%ebx), %eax -; X86-NEXT: 
andl 4(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $96, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al ; X86-NEXT: retl ; -; SSE-LABEL: test_ne_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %rsi, %rax -; SSE-NEXT: andq 8(%rdi), %rdx -; SSE-NEXT: andq (%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: setne %al -; SSE-NEXT: retq -; -; AVX2-LABEL: test_ne_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: movl $1, %edx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %rdx, %rsi -; AVX2-NEXT: shlxq %rcx, %rdx, %rdx -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rdx, %rsi -; AVX2-NEXT: cmovneq %rax, %rdx -; AVX2-NEXT: andq 8(%rdi), %rsi -; AVX2-NEXT: andq (%rdi), %rdx -; AVX2-NEXT: orq %rsi, %rdx -; AVX2-NEXT: setne %al -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_ne_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %eax -; AVX512-NEXT: xorl %edx, %edx -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shlxq %rcx, %rax, %rax -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rax, %rdx -; AVX512-NEXT: cmovneq %rsi, %rax -; AVX512-NEXT: andq 8(%rdi), %rdx -; AVX512-NEXT: andq (%rdi), %rax -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: setne %al -; AVX512-NEXT: retq +; X64-LABEL: test_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; 
X64-NEXT: andl $96, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -623,124 +476,33 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %edi -; X86-NEXT: movl 52(%esp,%eax), %ebx -; X86-NEXT: shldl %cl, %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl 8(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 12(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) 
# 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: complement_ne_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: xorq %rcx, %rsi -; SSE-NEXT: xorq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: setne %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) -; 
SSE-NEXT: retq -; -; AVX-LABEL: complement_ne_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq %rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, %r8 -; AVX-NEXT: andq %rsi, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: andq %rdx, %r9 -; AVX-NEXT: xorq %rcx, %rsi -; AVX-NEXT: xorq %rax, %rdx -; AVX-NEXT: orq %r8, %r9 -; AVX-NEXT: setne %al -; AVX-NEXT: movq %rdx, (%rdi) -; AVX-NEXT: movq %rsi, 8(%rdi) -; AVX-NEXT: retq +; X64-LABEL: complement_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btcl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -755,124 +517,33 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %edx 
-; X86-NEXT: movl 60(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %esi -; X86-NEXT: movl 52(%esp,%eax), %edi -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl 8(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl (%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl 4(%ebx), %ebx -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl %ebx, %ecx -; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl 8(%ebp), %edi -; X86-NEXT: movl %edx, 8(%edi) -; X86-NEXT: movl %eax, 12(%edi) -; X86-NEXT: movl %esi, (%edi) -; X86-NEXT: movl %ecx, 4(%edi) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi 
; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: reset_eq_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 -; SSE-NEXT: notq %rsi -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: notq %rdx -; SSE-NEXT: andq %rcx, %rsi -; SSE-NEXT: andq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: sete %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) -; SSE-NEXT: retq -; -; AVX-LABEL: reset_eq_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq %rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: andnq %rcx, %rsi, %r8 -; AVX-NEXT: andq %rsi, %rcx -; AVX-NEXT: andnq %rax, %rdx, %rsi -; AVX-NEXT: andq %rdx, %rax -; AVX-NEXT: orq %rcx, %rax -; AVX-NEXT: sete %al -; AVX-NEXT: movq %rsi, (%rdi) -; AVX-NEXT: movq %r8, 8(%rdi) -; AVX-NEXT: retq +; X64-LABEL: reset_eq_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setae %al +; X64-NEXT: btrl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -888,124 +559,33 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { ; 
X86-LABEL: set_ne_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %edi -; X86-NEXT: movl 52(%esp,%eax), %ebx -; X86-NEXT: shldl %cl, %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl 8(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 12(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: set_ne_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: orq %rcx, %rsi -; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: setne %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) -; SSE-NEXT: retq -; -; AVX-LABEL: set_ne_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq %rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; 
AVX-NEXT: movq %rcx, %r8 -; AVX-NEXT: andq %rsi, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: andq %rdx, %r9 -; AVX-NEXT: orq %rcx, %rsi -; AVX-NEXT: orq %rax, %rdx -; AVX-NEXT: orq %r8, %r9 -; AVX-NEXT: setne %al -; AVX-NEXT: movq %rdx, (%rdi) -; AVX-NEXT: movq %rsi, 8(%rdi) -; AVX-NEXT: retq +; X64-LABEL: set_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btsl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -1026,9 +606,9 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $128, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movzbl 16(%ebp), %eax +; X86-NEXT: subl $96, %esp +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movzbl 16(%ebp), %ebx ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1037,25 +617,29 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrb $3, %dl -; X86-NEXT: andb $12, %dl -; X86-NEXT: negb %dl -; X86-NEXT: movsbl %dl, %esi -; X86-NEXT: movl 64(%esp,%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esp,%esi), %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %edi +; X86-NEXT: movl 72(%esp,%edi), %edx +; X86-NEXT: movl 76(%esp,%edi), %esi +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 64(%esp,%edi), 
%ebx +; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: movl 68(%esp,%edi), %ebx +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: shldl %cl, %ebx, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 72(%esp,%esi), %ebx +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: notl %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 76(%esp,%esi), %edi -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shll %cl, %edx ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1063,72 +647,53 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %edi, %esi +; X86-NEXT: movl 40(%esp,%eax), %edi +; X86-NEXT: movl 44(%esp,%eax), %esi +; 
X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shldl %cl, %edi, %esi ; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl 12(%ecx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl 100(%esp,%ecx), %edi -; X86-NEXT: movl 104(%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 108(%esp,%ebx), %ebx -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 12(%ecx), %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 96(%esp,%ebx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: orl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 
4-byte Reload +; X86-NEXT: movl 36(%esp,%esi), %esi +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: andl 8(%edx), %eax +; X86-NEXT: orl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: notl %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl 32(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl 8(%ebp), %edi +; X86-NEXT: andl 4(%edi), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: orl %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %eax, (%ecx) -; X86-NEXT: movl %edx, 4(%ecx) -; X86-NEXT: sete %al +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %eax +; X86-NEXT: andl (%edi), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $96, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl (%edi,%eax), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 12(%edi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 8(%edi) +; X86-NEXT: movl %ebx, 4(%edi) +; X86-NEXT: movl %edx, (%edi) +; X86-NEXT: setae %al ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -1151,86 +716,84 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; SSE-NEXT: testb $64, %cl ; 
SSE-NEXT: cmovneq %rsi, %r8 ; SSE-NEXT: cmovneq %r9, %rsi +; SSE-NEXT: notq %r8 ; SSE-NEXT: cmovneq %rax, %rdx ; SSE-NEXT: cmovneq %r9, %rax -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: movq 8(%rdi), %r9 -; SSE-NEXT: movq %r9, %r10 -; SSE-NEXT: andq %r8, %r10 -; SSE-NEXT: notq %r8 -; SSE-NEXT: movq %rcx, %r11 -; SSE-NEXT: andq %rsi, %r11 ; SSE-NEXT: notq %rsi -; SSE-NEXT: andq %r9, %r8 +; SSE-NEXT: andq 8(%rdi), %r8 ; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq %rcx, %rsi +; SSE-NEXT: andq (%rdi), %rsi ; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: orq %r10, %r11 -; SSE-NEXT: sete %al -; SSE-NEXT: movq %rsi, (%rdi) +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: andl $96, %eax +; SSE-NEXT: shrl $3, %eax +; SSE-NEXT: movl (%rdi,%rax), %eax +; SSE-NEXT: btl %ecx, %eax +; SSE-NEXT: setae %al ; SSE-NEXT: movq %r8, 8(%rdi) +; SSE-NEXT: movq %rsi, (%rdi) ; SSE-NEXT: retq ; ; AVX2-LABEL: init_eq_i128: ; AVX2: # %bb.0: ; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %esi -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: shldq %cl, %rsi, %rax -; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: movl $1, %eax +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shldq %cl, %rax, %rsi ; AVX2-NEXT: movl %edx, %edx +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: shldq %cl, %rdx, %r8 ; AVX2-NEXT: xorl %r9d, %r9d -; AVX2-NEXT: shldq %cl, %rdx, %r9 -; AVX2-NEXT: shlxq %rcx, %rsi, %rsi +; AVX2-NEXT: shlxq %rcx, %rax, %rax ; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rsi, %rax -; AVX2-NEXT: cmovneq %r8, %rsi -; AVX2-NEXT: shlxq %rcx, %rdx, %rcx -; AVX2-NEXT: cmovneq %rcx, %r9 -; AVX2-NEXT: cmovneq %r8, %rcx -; AVX2-NEXT: movq (%rdi), %rdx -; AVX2-NEXT: movq 8(%rdi), %r8 -; AVX2-NEXT: andnq %r8, %rax, %r10 -; AVX2-NEXT: andq %rax, %r8 -; AVX2-NEXT: andnq %rdx, %rsi, %r11 -; AVX2-NEXT: andq %rsi, %rdx -; AVX2-NEXT: orq %r9, %r10 -; AVX2-NEXT: orq %rcx, %r11 -; AVX2-NEXT: orq %r8, %rdx -; AVX2-NEXT: sete %al -; AVX2-NEXT: movq %r11, (%rdi) -; AVX2-NEXT: movq %r10, 8(%rdi) +; AVX2-NEXT: cmovneq %rax, %rsi +; AVX2-NEXT: 
cmovneq %r9, %rax +; AVX2-NEXT: shlxq %rcx, %rdx, %rdx +; AVX2-NEXT: cmovneq %rdx, %r8 +; AVX2-NEXT: cmovneq %r9, %rdx +; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi +; AVX2-NEXT: orq %r8, %rsi +; AVX2-NEXT: andnq (%rdi), %rax, %r8 +; AVX2-NEXT: orq %rdx, %r8 +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $96, %eax +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: movl (%rdi,%rax), %eax +; AVX2-NEXT: btl %ecx, %eax +; AVX2-NEXT: setae %al +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: movq %r8, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: init_eq_i128: ; AVX512: # %bb.0: ; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movl $1, %esi +; AVX512-NEXT: movl $1, %eax +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: shldq %cl, %rax, %rsi ; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: shldq %cl, %rsi, %r8 -; AVX512-NEXT: shlxq %rcx, %rsi, %rsi +; AVX512-NEXT: shlxq %rcx, %rax, %rax ; AVX512-NEXT: movl %edx, %edx ; AVX512-NEXT: xorl %r9d, %r9d ; AVX512-NEXT: shldq %cl, %rdx, %r9 ; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rsi, %r8 ; AVX512-NEXT: cmovneq %rax, %rsi -; AVX512-NEXT: shlxq %rcx, %rdx, %rcx -; AVX512-NEXT: cmovneq %rcx, %r9 -; AVX512-NEXT: cmovneq %rax, %rcx -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rdx -; AVX512-NEXT: andnq %rdx, %r8, %r10 -; AVX512-NEXT: andq %r8, %rdx -; AVX512-NEXT: andnq %rax, %rsi, %r8 -; AVX512-NEXT: andq %rsi, %rax -; AVX512-NEXT: orq %r9, %r10 -; AVX512-NEXT: orq %rcx, %r8 -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: sete %al +; AVX512-NEXT: cmovneq %r8, %rax +; AVX512-NEXT: shlxq %rcx, %rdx, %rdx +; AVX512-NEXT: cmovneq %rdx, %r9 +; AVX512-NEXT: cmovneq %r8, %rdx +; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi +; AVX512-NEXT: orq %r9, %rsi +; AVX512-NEXT: andnq (%rdi), %rax, %r8 +; AVX512-NEXT: orq %rdx, %r8 +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: andl $96, %eax +; AVX512-NEXT: shrl $3, %eax +; AVX512-NEXT: movl (%rdi,%rax), %eax +; AVX512-NEXT: btl %ecx, %eax +; AVX512-NEXT: setae 
%al +; AVX512-NEXT: movq %rsi, 8(%rdi) ; AVX512-NEXT: movq %r8, (%rdi) -; AVX512-NEXT: movq %r10, 8(%rdi) ; AVX512-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 @@ -1252,20 +815,175 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i512: ; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $60, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: test_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: andl $60, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %cmp = icmp ne i512 %test, 0 + ret i1 %cmp +} + +define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: complement_ne_i512: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: complement_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btcl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 511 + 
%ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res = xor i512 %ld, %bit + %cmp = icmp ne i512 %test, 0 + store i512 %res, ptr %word + ret i1 %cmp +} + +define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: reset_eq_i512: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: reset_eq_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setae %al +; X64-NEXT: btrl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %mask = xor i512 %bit, -1 + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res = and i512 %ld, %mask + %cmp = icmp eq i512 %test, 0 + store i512 %res, ptr %word + ret i1 %cmp +} + +define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: set_ne_i512: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: set_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx 
+; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btsl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res = or i512 %ld, %bit + %cmp = icmp ne i512 %test, 0 + store i512 %res, ptr %word + ret i1 %cmp +} + +define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { +; X86-LABEL: init_eq_i512: +; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $224, %esp +; X86-NEXT: subl $352, %esp # imm = 0x160 ; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $60, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl %edx, %eax ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1298,325 +1016,88 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; X86-NEXT: movl 60(%edx), %eax -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edx), %edi +; X86-NEXT: movl 56(%eax), %esi +; X86-NEXT: movl 60(%eax), %ebx +; X86-NEXT: movl 52(%eax), %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%eax), %edi +; X86-NEXT: movl 44(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: 
movl 16(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 4(%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl 16(%ebp), %eax +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: andl $31, %ecx ; X86-NEXT: shldl %cl, %esi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 52(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: andl 40(%ebx), %eax -; X86-NEXT: andl 8(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 56(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 24(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: andl 44(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 12(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 60(%edi), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 28(%edi), %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%edx), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: negl %edx -; X86-NEXT: movl 192(%esp,%edx), %edx +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: andl 32(%ebx), %ecx -; X86-NEXT: andl (%ebx), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: andl 16(%ebx), %edi -; X86-NEXT: andl 48(%ebx), %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 36(%ebx), %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 20(%ebx), %ecx -; X86-NEXT: andl 52(%ebx), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl %edx, %eax -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: test_ne_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) 
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rbx -; SSE-NEXT: movq -48(%rsp,%rbx), %rdx -; SSE-NEXT: movq -40(%rsp,%rbx), %r14 -; SSE-NEXT: movq %r14, %rax -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq -16(%rsp,%rbx), %r11 -; SSE-NEXT: movq -8(%rsp,%rbx), %r10 -; SSE-NEXT: shldq %cl, %r11, %r10 -; SSE-NEXT: movq -32(%rsp,%rbx), %r9 -; SSE-NEXT: movq -24(%rsp,%rbx), %r15 -; SSE-NEXT: movq %r15, %r8 -; SSE-NEXT: shldq %cl, %r9, %r8 -; SSE-NEXT: movq -56(%rsp,%rbx), %rsi -; SSE-NEXT: shldq %cl, %rsi, %rdx -; SSE-NEXT: shldq %cl, %r15, %r11 -; SSE-NEXT: shldq %cl, %r14, %r9 -; SSE-NEXT: movq -64(%rsp,%rbx), %rbx -; SSE-NEXT: shldq %cl, %rbx, %rsi -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rbx -; SSE-NEXT: andq 32(%rdi), %r9 -; SSE-NEXT: andq 48(%rdi), %r11 -; SSE-NEXT: andq 16(%rdi), %rdx -; SSE-NEXT: orq %r11, %rdx -; SSE-NEXT: andq 40(%rdi), %r8 -; SSE-NEXT: andq 56(%rdi), %r10 -; SSE-NEXT: andq 24(%rdi), %rax -; SSE-NEXT: orq %r10, %rax -; SSE-NEXT: andq (%rdi), %rbx -; SSE-NEXT: orq %r9, %rbx -; SSE-NEXT: orq %rdx, %rbx -; SSE-NEXT: andq 8(%rdi), %rsi -; SSE-NEXT: orq %r8, %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: orq %rbx, %rsi -; SSE-NEXT: setne %al -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: retq -; -; AVX2-LABEL: test_ne_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; 
AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rsi -; AVX2-NEXT: movq -48(%rsp,%rsi), %rdx -; AVX2-NEXT: movq -40(%rsp,%rsi), %rbx -; AVX2-NEXT: movq %rbx, %rax -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq -16(%rsp,%rsi), %r11 -; AVX2-NEXT: movq -8(%rsp,%rsi), %r10 -; AVX2-NEXT: shldq %cl, %r11, %r10 -; AVX2-NEXT: movq -32(%rsp,%rsi), %r9 -; AVX2-NEXT: movq -24(%rsp,%rsi), %r14 -; AVX2-NEXT: movq %r14, %r8 -; AVX2-NEXT: shldq %cl, %r9, %r8 -; AVX2-NEXT: movq -64(%rsp,%rsi), %r15 -; AVX2-NEXT: movq -56(%rsp,%rsi), %rsi -; AVX2-NEXT: shldq %cl, %rsi, %rdx -; AVX2-NEXT: shldq %cl, %r14, %r11 -; AVX2-NEXT: shldq %cl, %rbx, %r9 -; AVX2-NEXT: shldq %cl, %r15, %rsi -; AVX2-NEXT: shlxq %rcx, %r15, %rcx -; AVX2-NEXT: andq 32(%rdi), %r9 -; AVX2-NEXT: andq 48(%rdi), %r11 -; AVX2-NEXT: andq 16(%rdi), %rdx -; AVX2-NEXT: andq 40(%rdi), %r8 -; AVX2-NEXT: andq 56(%rdi), %r10 -; AVX2-NEXT: andq 24(%rdi), %rax -; AVX2-NEXT: orq %r11, %rdx -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: andq (%rdi), %rcx -; AVX2-NEXT: orq %r9, %rcx -; AVX2-NEXT: orq %rdx, %rcx -; AVX2-NEXT: andq 8(%rdi), %rsi -; AVX2-NEXT: orq %r8, %rsi -; AVX2-NEXT: orq %rax, %rsi -; AVX2-NEXT: orq %rcx, %rsi -; AVX2-NEXT: setne %al -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; 
AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq -48(%rsp,%rbx), %rdx -; AVX512-NEXT: movq -40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %rax -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq -16(%rsp,%rbx), %r11 -; AVX512-NEXT: movq -8(%rsp,%rbx), %r10 -; AVX512-NEXT: shldq %cl, %r11, %r10 -; AVX512-NEXT: movq -32(%rsp,%rbx), %r9 -; AVX512-NEXT: movq -24(%rsp,%rbx), %r15 -; AVX512-NEXT: movq %r15, %r8 -; AVX512-NEXT: shldq %cl, %r9, %r8 -; AVX512-NEXT: movq -56(%rsp,%rbx), %rsi -; AVX512-NEXT: shldq %cl, %rsi, %rdx -; AVX512-NEXT: shldq %cl, %r15, %r11 -; AVX512-NEXT: shldq %cl, %r14, %r9 -; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx -; AVX512-NEXT: shldq %cl, %rbx, %rsi -; AVX512-NEXT: shlxq %rcx, %rbx, %rcx -; AVX512-NEXT: andq 32(%rdi), %r9 -; AVX512-NEXT: andq 48(%rdi), %r11 -; AVX512-NEXT: andq 16(%rdi), %rdx -; AVX512-NEXT: andq 40(%rdi), %r8 -; AVX512-NEXT: andq 56(%rdi), %r10 -; AVX512-NEXT: andq 24(%rdi), %rax -; AVX512-NEXT: orq %r11, %rdx -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: andq (%rdi), %rcx -; AVX512-NEXT: orq %r9, %rcx -; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: andq 8(%rdi), %rsi -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rax, %rsi -; AVX512-NEXT: orq %rcx, %rsi -; AVX512-NEXT: setne %al -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %cmp = icmp ne i512 %test, 0 - ret i1 %cmp -} - -define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: complement_ne_i512: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $272, %esp # imm = 0x110 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; 
X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx +; X86-NEXT: shldl %cl, %esi, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1632,7 +1113,6 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1649,4366 +1129,274 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edx), %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edx), %eax -; X86-NEXT: 
movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edx), %ebx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 52(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl 56(%eax), %esi +; X86-NEXT: movl 60(%eax), %edi ; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: andl 60(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: movl 52(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 56(%edx), %ebx +; X86-NEXT: orl %esi, %ebx ; 
X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl 56(%edx), %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 48(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 52(%edx), %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: movl 24(%edx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 12(%eax), %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 44(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 48(%edx), %ebx +; X86-NEXT: orl %esi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 40(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 44(%edx), 
%ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 36(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 40(%edx), %ebx ; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl 60(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 28(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: movl 32(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 36(%edx), %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: negl %eax -; X86-NEXT: movl 240(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte 
Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl 32(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %eax -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 16(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: movl 48(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 36(%esi), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl %esi, %edi -; X86-NEXT: movl 52(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: 
movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: xorl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl 8(%ebp), %edx 
-; X86-NEXT: movl %ebx, 60(%edx) -; X86-NEXT: movl %edi, 56(%edx) -; X86-NEXT: movl %ecx, 52(%edx) -; X86-NEXT: movl %esi, 44(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 36(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 32(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 24(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 8(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 4(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 48(%edx) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: complement_ne_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $56, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps 
%xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rbx -; SSE-NEXT: movq (%rsp,%rbx), %rsi -; SSE-NEXT: movq 8(%rsp,%rbx), %r14 -; SSE-NEXT: movq %r14, %rax -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 32(%rsp,%rbx), %r8 -; SSE-NEXT: movq 40(%rsp,%rbx), %rbp -; SSE-NEXT: shldq %cl, %r8, %rbp -; SSE-NEXT: movq 16(%rsp,%rbx), %r9 -; SSE-NEXT: movq 24(%rsp,%rbx), %r15 -; SSE-NEXT: movq %r15, %r10 -; SSE-NEXT: shldq %cl, %r9, %r10 -; SSE-NEXT: movq -8(%rsp,%rbx), %r11 -; SSE-NEXT: shldq %cl, %r11, %rsi -; SSE-NEXT: shldq %cl, %r15, %r8 -; SSE-NEXT: shldq %cl, %r14, %r9 -; SSE-NEXT: movq -16(%rsp,%rbx), %rbx -; SSE-NEXT: shldq %cl, %rbx, %r11 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rbx -; SSE-NEXT: movq 24(%rdi), %r15 -; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 16(%rdi), %r12 -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %r8, %r13 -; SSE-NEXT: andq %rsi, %r12 -; SSE-NEXT: orq %r13, %r12 -; SSE-NEXT: movq %rcx, %r13 -; SSE-NEXT: andq %rbp, %r13 -; SSE-NEXT: andq %rax, %r15 -; SSE-NEXT: orq %r13, %r15 -; SSE-NEXT: movq 32(%rdi), %r14 -; SSE-NEXT: movq %r14, %rcx -; SSE-NEXT: andq %r9, %rcx -; SSE-NEXT: movq (%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rbx, %r13 -; SSE-NEXT: orq %rcx, %r13 -; SSE-NEXT: orq %r12, %r13 -; SSE-NEXT: movq 40(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r12 -; SSE-NEXT: andq %r10, %r12 -; SSE-NEXT: movq 8(%rdi), %rdx -; 
SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: andq %r11, %rax -; SSE-NEXT: orq %r12, %rax -; SSE-NEXT: orq %r15, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT: xorq %rcx, %r10 -; SSE-NEXT: xorq %r14, %r9 -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; SSE-NEXT: xorq %rdx, %r11 -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; SSE-NEXT: orq %r13, %rax -; SSE-NEXT: movq %r8, 48(%rdi) -; SSE-NEXT: movq %rbp, 56(%rdi) -; SSE-NEXT: movq %r9, 32(%rdi) -; SSE-NEXT: movq %r10, 40(%rdi) -; SSE-NEXT: movq %rsi, 16(%rdi) -; SSE-NEXT: movq %r15, 24(%rdi) -; SSE-NEXT: movq %rbx, (%rdi) -; SSE-NEXT: movq %r11, 8(%rdi) -; SSE-NEXT: setne %al -; SSE-NEXT: addq $56, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq -; -; AVX2-LABEL: complement_ne_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $72, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, (%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rbx -; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX2-NEXT: movq %rbp, %rax -; AVX2-NEXT: shldq %cl, %rsi, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; 
AVX2-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX2-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX2-NEXT: shldq %cl, %r8, %r13 -; AVX2-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX2-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX2-NEXT: movq %r14, %r10 -; AVX2-NEXT: shldq %cl, %r9, %r10 -; AVX2-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX2-NEXT: shldq %cl, %r11, %rsi -; AVX2-NEXT: shldq %cl, %r14, %r8 -; AVX2-NEXT: movq 16(%rdi), %r12 -; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r8, %r14 -; AVX2-NEXT: andq %rsi, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq 56(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r13, %r15 -; AVX2-NEXT: movq 24(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %rax, %r14 -; AVX2-NEXT: orq %r15, %r14 -; AVX2-NEXT: shldq %cl, %rbp, %r9 -; AVX2-NEXT: movq (%rsp,%rbx), %rdx -; AVX2-NEXT: movq 32(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r9, %r15 -; AVX2-NEXT: shlxq %rcx, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq (%rdi), %rbx -; AVX2-NEXT: movq %rbx, %rbp -; AVX2-NEXT: andq %rax, %rbp -; AVX2-NEXT: orq %r15, %rbp -; AVX2-NEXT: orq %r12, %rbp -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %rdx, %r11 -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: andq %r10, %rcx -; AVX2-NEXT: movq 8(%rdi), %r15 -; AVX2-NEXT: movq %r15, %r12 -; AVX2-NEXT: andq %r11, %r12 -; AVX2-NEXT: orq %rcx, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX2-NEXT: xorq %rax, %r10 -; AVX2-NEXT: xorq 
{{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT: xorq %r15, %r11 -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: movq %r8, 48(%rdi) -; AVX2-NEXT: movq %r13, 56(%rdi) -; AVX2-NEXT: movq %r9, 32(%rdi) -; AVX2-NEXT: movq %r10, 40(%rdi) -; AVX2-NEXT: movq %rsi, 16(%rdi) -; AVX2-NEXT: movq %rcx, 24(%rdi) -; AVX2-NEXT: movq %rbx, (%rdi) -; AVX2-NEXT: movq %r11, 8(%rdi) -; AVX2-NEXT: setne %al -; AVX2-NEXT: addq $72, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: complement_ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $72, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, (%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX512-NEXT: movq %rbp, %rax -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX512-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX512-NEXT: shldq %cl, %r8, %r13 -; AVX512-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX512-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, 
%r10 -; AVX512-NEXT: shldq %cl, %r9, %r10 -; AVX512-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX512-NEXT: shldq %cl, %r11, %rsi -; AVX512-NEXT: shldq %cl, %r14, %r8 -; AVX512-NEXT: movq 16(%rdi), %r12 -; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r8, %r14 -; AVX512-NEXT: andq %rsi, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq 56(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r13, %r15 -; AVX512-NEXT: movq 24(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %rax, %r14 -; AVX512-NEXT: orq %r15, %r14 -; AVX512-NEXT: shldq %cl, %rbp, %r9 -; AVX512-NEXT: movq (%rsp,%rbx), %rdx -; AVX512-NEXT: movq 32(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r9, %r15 -; AVX512-NEXT: shlxq %rcx, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq (%rdi), %rbx -; AVX512-NEXT: movq %rbx, %rbp -; AVX512-NEXT: andq %rax, %rbp -; AVX512-NEXT: orq %r15, %rbp -; AVX512-NEXT: orq %r12, %rbp -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rdx, %r11 -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andq %r10, %rcx -; AVX512-NEXT: movq 8(%rdi), %r15 -; AVX512-NEXT: movq %r15, %r12 -; AVX512-NEXT: andq %r11, %r12 -; AVX512-NEXT: orq %rcx, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT: xorq %rax, %r10 -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; 
AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT: xorq %r15, %r11 -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: movq %r8, 48(%rdi) -; AVX512-NEXT: movq %r13, 56(%rdi) -; AVX512-NEXT: movq %r9, 32(%rdi) -; AVX512-NEXT: movq %r10, 40(%rdi) -; AVX512-NEXT: movq %rsi, 16(%rdi) -; AVX512-NEXT: movq %rcx, 24(%rdi) -; AVX512-NEXT: movq %rbx, (%rdi) -; AVX512-NEXT: movq %r11, 8(%rdi) -; AVX512-NEXT: setne %al -; AVX512-NEXT: addq $72, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res = xor i512 %ld, %bit - %cmp = icmp ne i512 %test, 0 - store i512 %res, ptr %word - ret i1 %cmp -} - -define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: reset_eq_i512: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $288, %esp # imm = 0x120 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-NEXT: subl %eax, %edi -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 4(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edi), %eax -; X86-NEXT: andl $31, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: movl 12(%edi), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edi), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edi), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edi), %esi -; X86-NEXT: movl %esi, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl 52(%edi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %esi, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: movl 28(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 32(%edx), %ebx +; X86-NEXT: orl %esi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl 56(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl 
%ecx, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl 44(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edi), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%edi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: negl %eax -; X86-NEXT: movl 256(%esp,%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %esi, %edi -; X86-NEXT: # kill: def $cl killed 
$cl killed $ecx -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: movl 32(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %edi -; X86-NEXT: orl %edx, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 24(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 28(%edx), %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: movl 52(%ebx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: notl %ebx -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; 
X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 60(%eax) -; X86-NEXT: movl %esi, 56(%eax) -; X86-NEXT: movl %ecx, 52(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 44(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 40(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 36(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 32(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 28(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 24(%eax) -; X86-NEXT: movl %ebx, 20(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 16(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 48(%eax) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: reset_eq_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $56, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; 
SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rdx -; SSE-NEXT: movq (%rsp,%rdx), %r9 -; SSE-NEXT: movq 8(%rsp,%rdx), %r8 -; SSE-NEXT: movq %r8, %rsi -; SSE-NEXT: shldq %cl, %r9, %rsi -; SSE-NEXT: movq -8(%rsp,%rdx), %rax -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: movq 16(%rsp,%rdx), %r14 -; SSE-NEXT: movq 24(%rsp,%rdx), %r10 -; SSE-NEXT: movq %r10, %rbx -; SSE-NEXT: shldq %cl, %r14, %rbx -; SSE-NEXT: shldq %cl, %r8, %r14 -; SSE-NEXT: movq 32(%rsp,%rdx), %r13 -; SSE-NEXT: movq 40(%rsp,%rdx), %r12 -; SSE-NEXT: shldq %cl, %r13, %r12 -; SSE-NEXT: shldq %cl, %r10, %r13 -; SSE-NEXT: movq -16(%rsp,%rdx), %rdx -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq %r12, %rbp -; SSE-NEXT: movq %r9, %r15 -; SSE-NEXT: movq %rsi, %r11 -; SSE-NEXT: movq 16(%rdi), %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r13 -; SSE-NEXT: andq %r8, %r9 -; SSE-NEXT: orq %r13, %r9 -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r12 -; SSE-NEXT: movq 24(%rdi), %r10 -; SSE-NEXT: andq %r10, %rsi -; SSE-NEXT: orq %r12, %rsi -; SSE-NEXT: movq %r14, %r13 -; SSE-NEXT: movq 32(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r14 -; SSE-NEXT: movq %rdx, %r12 -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, 
%rdx -; SSE-NEXT: orq %r14, %rdx -; SSE-NEXT: orq %r9, %rdx -; SSE-NEXT: movq %rbx, %r14 -; SSE-NEXT: movq 40(%rdi), %rcx -; SSE-NEXT: andq %rcx, %rbx -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: movq 8(%rdi), %r8 -; SSE-NEXT: andq %r8, %rax -; SSE-NEXT: orq %rbx, %rax -; SSE-NEXT: orq %rsi, %rax -; SSE-NEXT: notq %r11 -; SSE-NEXT: andq %r10, %r11 -; SSE-NEXT: notq %r15 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: notq %r14 -; SSE-NEXT: andq %rcx, %r14 -; SSE-NEXT: notq %r13 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; SSE-NEXT: notq %rbp -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE-NEXT: notq %rcx -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; SSE-NEXT: notq %r9 -; SSE-NEXT: andq %r8, %r9 -; SSE-NEXT: notq %r12 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rcx, 48(%rdi) -; SSE-NEXT: movq %rbp, 56(%rdi) -; SSE-NEXT: movq %r13, 32(%rdi) -; SSE-NEXT: movq %r14, 40(%rdi) -; SSE-NEXT: movq %r15, 16(%rdi) -; SSE-NEXT: movq %r11, 24(%rdi) -; SSE-NEXT: movq %r12, (%rdi) -; SSE-NEXT: movq %r9, 8(%rdi) -; SSE-NEXT: sete %al -; SSE-NEXT: addq $56, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq -; -; AVX2-LABEL: reset_eq_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: pushq %rax -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: 
movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rdx -; AVX2-NEXT: movq -48(%rsp,%rdx), %r8 -; AVX2-NEXT: movq -40(%rsp,%rdx), %rbx -; AVX2-NEXT: movq %rbx, %rax -; AVX2-NEXT: shldq %cl, %r8, %rax -; AVX2-NEXT: movq -16(%rsp,%rdx), %r10 -; AVX2-NEXT: movq -8(%rsp,%rdx), %rsi -; AVX2-NEXT: shldq %cl, %r10, %rsi -; AVX2-NEXT: movq -32(%rsp,%rdx), %r11 -; AVX2-NEXT: movq -24(%rsp,%rdx), %r14 -; AVX2-NEXT: movq %r14, %r9 -; AVX2-NEXT: shldq %cl, %r11, %r9 -; AVX2-NEXT: movq -64(%rsp,%rdx), %r15 -; AVX2-NEXT: movq -56(%rsp,%rdx), %rdx -; AVX2-NEXT: shldq %cl, %rdx, %r8 -; AVX2-NEXT: shldq %cl, %r14, %r10 -; AVX2-NEXT: shldq %cl, %rbx, %r11 -; AVX2-NEXT: shldq %cl, %r15, %rdx -; AVX2-NEXT: shlxq %rcx, %r15, %rcx -; AVX2-NEXT: movq 24(%rdi), %rbx -; AVX2-NEXT: movq 56(%rdi), %r14 -; AVX2-NEXT: movq 16(%rdi), %r15 -; AVX2-NEXT: movq 48(%rdi), %r13 -; AVX2-NEXT: movq 32(%rdi), %rbp -; AVX2-NEXT: andnq %rbp, %r11, %r12 -; AVX2-NEXT: andq %r11, %rbp -; AVX2-NEXT: andnq %r13, %r10, %r11 -; AVX2-NEXT: andq %r10, %r13 -; AVX2-NEXT: andnq %r15, %r8, %r10 -; AVX2-NEXT: andq %r8, %r15 -; AVX2-NEXT: movq 40(%rdi), %r8 -; AVX2-NEXT: orq %r13, %r15 -; AVX2-NEXT: andnq %r8, %r9, %r13 -; AVX2-NEXT: andq %r9, %r8 -; AVX2-NEXT: andnq %r14, %rsi, %r9 -; AVX2-NEXT: andq %rsi, %r14 -; AVX2-NEXT: andnq %rbx, %rax, %rsi -; AVX2-NEXT: andq %rax, %rbx -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: orq %r14, %rbx -; AVX2-NEXT: andnq %rax, %rcx, %r14 -; AVX2-NEXT: andq %rcx, %rax -; AVX2-NEXT: orq %rbp, %rax -; AVX2-NEXT: movq 8(%rdi), %rcx -; AVX2-NEXT: orq %r15, %rax -; AVX2-NEXT: andnq %rcx, %rdx, %r15 -; AVX2-NEXT: andq %rdx, %rcx -; AVX2-NEXT: orq %r8, %rcx -; AVX2-NEXT: orq %rbx, %rcx -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: movq %r11, 48(%rdi) -; AVX2-NEXT: movq %r9, 56(%rdi) -; AVX2-NEXT: movq %r12, 32(%rdi) -; AVX2-NEXT: movq %r13, 40(%rdi) -; AVX2-NEXT: movq %r10, 16(%rdi) -; 
AVX2-NEXT: movq %rsi, 24(%rdi) -; AVX2-NEXT: movq %r14, (%rdi) -; AVX2-NEXT: movq %r15, 8(%rdi) -; AVX2-NEXT: sete %al -; AVX2-NEXT: addq $8, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: reset_eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: pushq %rax -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq -48(%rsp,%rbx), %r8 -; AVX512-NEXT: movq -40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %rax -; AVX512-NEXT: shldq %cl, %r8, %rax -; AVX512-NEXT: movq -16(%rsp,%rbx), %r10 -; AVX512-NEXT: movq -8(%rsp,%rbx), %rsi -; AVX512-NEXT: shldq %cl, %r10, %rsi -; AVX512-NEXT: movq -32(%rsp,%rbx), %r11 -; AVX512-NEXT: movq -24(%rsp,%rbx), %r15 -; AVX512-NEXT: movq %r15, %r9 -; AVX512-NEXT: shldq %cl, %r11, %r9 -; AVX512-NEXT: movq -56(%rsp,%rbx), %rdx -; AVX512-NEXT: shldq %cl, %rdx, %r8 -; AVX512-NEXT: shldq %cl, %r15, %r10 -; AVX512-NEXT: shldq %cl, %r14, %r11 -; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx -; AVX512-NEXT: shldq %cl, %rbx, %rdx -; AVX512-NEXT: shlxq %rcx, %rbx, %rcx -; AVX512-NEXT: movq 24(%rdi), %rbx -; AVX512-NEXT: movq 56(%rdi), %r14 -; AVX512-NEXT: movq 16(%rdi), %r15 -; AVX512-NEXT: movq 48(%rdi), %r13 -; AVX512-NEXT: movq 32(%rdi), %rbp -; AVX512-NEXT: andnq %rbp, %r11, %r12 -; AVX512-NEXT: andq %r11, 
%rbp -; AVX512-NEXT: andnq %r13, %r10, %r11 -; AVX512-NEXT: andq %r10, %r13 -; AVX512-NEXT: andnq %r15, %r8, %r10 -; AVX512-NEXT: andq %r8, %r15 -; AVX512-NEXT: movq 40(%rdi), %r8 -; AVX512-NEXT: orq %r13, %r15 -; AVX512-NEXT: andnq %r8, %r9, %r13 -; AVX512-NEXT: andq %r9, %r8 -; AVX512-NEXT: andnq %r14, %rsi, %r9 -; AVX512-NEXT: andq %rsi, %r14 -; AVX512-NEXT: andnq %rbx, %rax, %rsi -; AVX512-NEXT: andq %rax, %rbx -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: orq %r14, %rbx -; AVX512-NEXT: andnq %rax, %rcx, %r14 -; AVX512-NEXT: andq %rcx, %rax -; AVX512-NEXT: orq %rbp, %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: orq %r15, %rax -; AVX512-NEXT: andnq %rcx, %rdx, %r15 -; AVX512-NEXT: andq %rdx, %rcx -; AVX512-NEXT: orq %r8, %rcx -; AVX512-NEXT: orq %rbx, %rcx -; AVX512-NEXT: orq %rax, %rcx -; AVX512-NEXT: movq %r11, 48(%rdi) -; AVX512-NEXT: movq %r9, 56(%rdi) -; AVX512-NEXT: movq %r12, 32(%rdi) -; AVX512-NEXT: movq %r13, 40(%rdi) -; AVX512-NEXT: movq %r10, 16(%rdi) -; AVX512-NEXT: movq %rsi, 24(%rdi) -; AVX512-NEXT: movq %r14, (%rdi) -; AVX512-NEXT: movq %r15, 8(%rdi) -; AVX512-NEXT: sete %al -; AVX512-NEXT: addq $8, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %mask = xor i512 %bit, -1 - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res = and i512 %ld, %mask - %cmp = icmp eq i512 %test, 0 - store i512 %res, ptr %word - ret i1 %cmp -} - -define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: set_ne_i512: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $272, %esp # imm = 0x110 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: 
movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edx), %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: 
movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edx), %ebx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%eax), %edi ; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 52(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl 
%esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl 56(%edx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: movl 24(%edx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 12(%eax), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: andl 24(%edx), %ebx ; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl 60(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 28(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx ; X86-NEXT: movl %ebx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: negl %eax -; X86-NEXT: movl 240(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl 32(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %eax -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 16(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: movl 48(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 36(%esi), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl %esi, %edi -; X86-NEXT: movl 52(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte 
Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl %ebx, 60(%edx) -; X86-NEXT: movl %edi, 56(%edx) -; X86-NEXT: movl %ecx, 52(%edx) -; X86-NEXT: movl %esi, 44(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 36(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 32(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 24(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 8(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 4(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 48(%edx) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), 
%esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: set_ne_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $56, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rbx -; SSE-NEXT: movq (%rsp,%rbx), %rsi -; SSE-NEXT: movq 8(%rsp,%rbx), %r14 -; SSE-NEXT: movq %r14, %rax -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 32(%rsp,%rbx), %r8 -; SSE-NEXT: movq 40(%rsp,%rbx), %rbp -; SSE-NEXT: shldq %cl, %r8, %rbp -; SSE-NEXT: movq 16(%rsp,%rbx), %r9 -; SSE-NEXT: movq 24(%rsp,%rbx), %r15 -; SSE-NEXT: movq %r15, %r10 -; SSE-NEXT: shldq %cl, %r9, %r10 -; SSE-NEXT: movq -8(%rsp,%rbx), %r11 -; SSE-NEXT: shldq %cl, %r11, %rsi -; SSE-NEXT: shldq %cl, %r15, %r8 -; SSE-NEXT: shldq %cl, %r14, %r9 -; SSE-NEXT: movq -16(%rsp,%rbx), %rbx -; SSE-NEXT: shldq %cl, %rbx, %r11 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rbx -; SSE-NEXT: movq 24(%rdi), %r15 -; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 16(%rdi), %r12 -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %r13 -; 
SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %r8, %r13 -; SSE-NEXT: andq %rsi, %r12 -; SSE-NEXT: orq %r13, %r12 -; SSE-NEXT: movq %rcx, %r13 -; SSE-NEXT: andq %rbp, %r13 -; SSE-NEXT: andq %rax, %r15 -; SSE-NEXT: orq %r13, %r15 -; SSE-NEXT: movq 32(%rdi), %r14 -; SSE-NEXT: movq %r14, %rcx -; SSE-NEXT: andq %r9, %rcx -; SSE-NEXT: movq (%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rbx, %r13 -; SSE-NEXT: orq %rcx, %r13 -; SSE-NEXT: orq %r12, %r13 -; SSE-NEXT: movq 40(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r12 -; SSE-NEXT: andq %r10, %r12 -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: andq %r11, %rax -; SSE-NEXT: orq %r12, %rax -; SSE-NEXT: orq %r15, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT: orq %rcx, %r10 -; SSE-NEXT: orq %r14, %r9 -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; SSE-NEXT: orq %rdx, %r11 -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; SSE-NEXT: orq %r13, %rax -; SSE-NEXT: movq %r8, 48(%rdi) -; SSE-NEXT: movq %rbp, 56(%rdi) -; SSE-NEXT: movq %r9, 32(%rdi) -; SSE-NEXT: movq %r10, 40(%rdi) -; SSE-NEXT: movq %rsi, 16(%rdi) -; SSE-NEXT: movq %r15, 24(%rdi) -; SSE-NEXT: movq %rbx, (%rdi) -; SSE-NEXT: movq %r11, 8(%rdi) -; SSE-NEXT: setne %al -; SSE-NEXT: addq $56, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq -; -; AVX2-LABEL: set_ne_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $72, %rsp -; 
AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, (%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rbx -; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX2-NEXT: movq %rbp, %rax -; AVX2-NEXT: shldq %cl, %rsi, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX2-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX2-NEXT: shldq %cl, %r8, %r13 -; AVX2-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX2-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX2-NEXT: movq %r14, %r10 -; AVX2-NEXT: shldq %cl, %r9, %r10 -; AVX2-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX2-NEXT: shldq %cl, %r11, %rsi -; AVX2-NEXT: shldq %cl, %r14, %r8 -; AVX2-NEXT: movq 16(%rdi), %r12 -; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r8, %r14 -; AVX2-NEXT: andq %rsi, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq 56(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r13, %r15 -; AVX2-NEXT: movq 24(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %rax, %r14 -; AVX2-NEXT: orq %r15, %r14 -; AVX2-NEXT: shldq %cl, %rbp, %r9 -; AVX2-NEXT: movq (%rsp,%rbx), %rdx -; AVX2-NEXT: movq 32(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r9, %r15 -; AVX2-NEXT: shlxq %rcx, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq (%rdi), %rbx -; AVX2-NEXT: movq %rbx, %rbp -; AVX2-NEXT: andq %rax, %rbp -; AVX2-NEXT: orq %r15, 
%rbp -; AVX2-NEXT: orq %r12, %rbp -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %rdx, %r11 -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: andq %r10, %rcx -; AVX2-NEXT: movq 8(%rdi), %r15 -; AVX2-NEXT: movq %r15, %r12 -; AVX2-NEXT: andq %r11, %r12 -; AVX2-NEXT: orq %rcx, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX2-NEXT: orq %rax, %r10 -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT: orq %r15, %r11 -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: movq %r8, 48(%rdi) -; AVX2-NEXT: movq %r13, 56(%rdi) -; AVX2-NEXT: movq %r9, 32(%rdi) -; AVX2-NEXT: movq %r10, 40(%rdi) -; AVX2-NEXT: movq %rsi, 16(%rdi) -; AVX2-NEXT: movq %rcx, 24(%rdi) -; AVX2-NEXT: movq %rbx, (%rdi) -; AVX2-NEXT: movq %r11, 8(%rdi) -; AVX2-NEXT: setne %al -; AVX2-NEXT: addq $72, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: set_ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $72, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, (%rsp) -; AVX512-NEXT: 
movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX512-NEXT: movq %rbp, %rax -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX512-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX512-NEXT: shldq %cl, %r8, %r13 -; AVX512-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX512-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %r10 -; AVX512-NEXT: shldq %cl, %r9, %r10 -; AVX512-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX512-NEXT: shldq %cl, %r11, %rsi -; AVX512-NEXT: shldq %cl, %r14, %r8 -; AVX512-NEXT: movq 16(%rdi), %r12 -; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r8, %r14 -; AVX512-NEXT: andq %rsi, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq 56(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r13, %r15 -; AVX512-NEXT: movq 24(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %rax, %r14 -; AVX512-NEXT: orq %r15, %r14 -; AVX512-NEXT: shldq %cl, %rbp, %r9 -; AVX512-NEXT: movq (%rsp,%rbx), %rdx -; AVX512-NEXT: movq 32(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r9, %r15 -; AVX512-NEXT: shlxq %rcx, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq (%rdi), %rbx -; AVX512-NEXT: movq %rbx, %rbp -; AVX512-NEXT: andq %rax, %rbp -; AVX512-NEXT: orq %r15, %rbp -; AVX512-NEXT: orq %r12, %rbp -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rdx, %r11 -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq 
%rax, %rcx -; AVX512-NEXT: andq %r10, %rcx -; AVX512-NEXT: movq 8(%rdi), %r15 -; AVX512-NEXT: movq %r15, %r12 -; AVX512-NEXT: andq %r11, %r12 -; AVX512-NEXT: orq %rcx, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT: orq %rax, %r10 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT: orq %r15, %r11 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: movq %r8, 48(%rdi) -; AVX512-NEXT: movq %r13, 56(%rdi) -; AVX512-NEXT: movq %r9, 32(%rdi) -; AVX512-NEXT: movq %r10, 40(%rdi) -; AVX512-NEXT: movq %rsi, 16(%rdi) -; AVX512-NEXT: movq %rcx, 24(%rdi) -; AVX512-NEXT: movq %rbx, (%rdi) -; AVX512-NEXT: movq %r11, 8(%rdi) -; AVX512-NEXT: setne %al -; AVX512-NEXT: addq $72, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res = or i512 %ld, %bit - %cmp = icmp ne i512 %test, 0 - store i512 %res, ptr %word - ret i1 %cmp -} - -define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { -; X86-LABEL: init_eq_i512: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $432, %esp # imm = 0x1B0 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %edx 
-; X86-NEXT: shrl $3, %edx -; X86-NEXT: andl $60, %edx -; X86-NEXT: leal {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl %edx, %esi -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%esi), %eax -; X86-NEXT: movl 48(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-NEXT: movl 32(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%esi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl 16(%ebp), %ebx -; X86-NEXT: movzbl %bl, %esi -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: leal {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%ebx), %esi -; X86-NEXT: movl 
%esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%ebx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl 56(%edi), %ebx -; X86-NEXT: movl 60(%edi), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 52(%edi), %eax -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 48(%edi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl 40(%edi), %ebx -; X86-NEXT: movl 44(%edi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 36(%edi), %eax -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 32(%edi), %ebx -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 28(%edi), %eax -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 24(%edi), %ebx -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 20(%edi), %eax -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 16(%edi), %ebx -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 12(%edi), %eax -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 8(%edi), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 4(%edi), %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edx -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl (%edi), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 8(%ebp), %eax -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 60(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 56(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 52(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 44(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 40(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 36(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 32(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 28(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 24(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 20(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 16(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %esi, 48(%eax) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: init_eq_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $216, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, 
{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %r10 -; SSE-NEXT: movq 184(%rsp,%r10), %r11 -; SSE-NEXT: movq 192(%rsp,%r10), %rsi -; SSE-NEXT: movq %rsi, %r13 -; SSE-NEXT: shldq %cl, %r11, %r13 -; SSE-NEXT: movq 200(%rsp,%r10), %r15 -; SSE-NEXT: shldq %cl, %rsi, %r15 -; SSE-NEXT: movq 168(%rsp,%r10), %rbx -; SSE-NEXT: movq 176(%rsp,%r10), %rsi -; SSE-NEXT: movq %rsi, %r14 -; SSE-NEXT: shldq %cl, %rbx, %r14 -; SSE-NEXT: shldq %cl, %rsi, %r11 -; SSE-NEXT: movq 152(%rsp,%r10), %rax -; SSE-NEXT: movq 160(%rsp,%r10), %r8 -; SSE-NEXT: movq %r8, %r12 -; SSE-NEXT: shldq %cl, %rax, %r12 -; SSE-NEXT: shldq %cl, %r8, %rbx -; SSE-NEXT: movq 144(%rsp,%r10), %r9 -; SSE-NEXT: movq %r9, %r8 -; SSE-NEXT: shlq %cl, %r8 -; SSE-NEXT: shldq %cl, %r9, %rax -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movl %edx, %edx -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, (%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq 16(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %rsi -; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rsi, %r13 -; SSE-NEXT: andq %rdx, %r12 -; SSE-NEXT: orq %r13, %r12 -; SSE-NEXT: movq %r15, %rsi -; SSE-NEXT: movq 56(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: 
andq %rdx, %r15 -; SSE-NEXT: movq %rbx, %r13 -; SSE-NEXT: movq 24(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %rbx -; SSE-NEXT: orq %r15, %rbx -; SSE-NEXT: movq %r14, %rbp -; SSE-NEXT: movq 32(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %r14 -; SSE-NEXT: movq %r8, %r15 -; SSE-NEXT: movq (%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %r8 -; SSE-NEXT: orq %r14, %r8 -; SSE-NEXT: orq %r12, %r8 -; SSE-NEXT: movq %r11, %r12 -; SSE-NEXT: movq 40(%rdi), %r9 -; SSE-NEXT: andq %r9, %r11 -; SSE-NEXT: movq %rax, %r14 -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %rax -; SSE-NEXT: orq %r11, %rax -; SSE-NEXT: orq %rbx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: notq %rax -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: movq 56(%rsp,%r10), %r11 -; SSE-NEXT: movq 64(%rsp,%r10), %rax -; SSE-NEXT: movq %rax, %rbx -; SSE-NEXT: shldq %cl, %r11, %rbx -; SSE-NEXT: orq %rbx, %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: notq %rsi -; SSE-NEXT: movq 72(%rsp,%r10), %rbx -; SSE-NEXT: shldq %cl, %rax, %rbx -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT: orq %rbx, %rsi -; SSE-NEXT: notq %rbp -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: movq 40(%rsp,%r10), %rax -; SSE-NEXT: movq 48(%rsp,%r10), %rdx -; SSE-NEXT: movq %rdx, %rbx -; SSE-NEXT: shldq %cl, %rax, %rbx -; SSE-NEXT: orq %rbx, %rbp -; SSE-NEXT: notq %r12 -; SSE-NEXT: andq %r9, %r12 -; SSE-NEXT: shldq %cl, %rdx, %r11 -; SSE-NEXT: movq 24(%rsp,%r10), %r9 -; SSE-NEXT: movq 32(%rsp,%r10), %rdx -; SSE-NEXT: movq %rdx, %rbx -; 
SSE-NEXT: shldq %cl, %r9, %rbx -; SSE-NEXT: orq %r11, %r12 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: notq %r11 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: orq %rbx, %r11 -; SSE-NEXT: notq %r13 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; SSE-NEXT: orq %rax, %r13 -; SSE-NEXT: notq %r15 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: movq 16(%rsp,%r10), %rax -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: orq %rdx, %r15 -; SSE-NEXT: notq %r14 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: orq %r9, %r14 -; SSE-NEXT: orq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: movq %rax, 48(%rdi) -; SSE-NEXT: movq %rsi, 56(%rdi) -; SSE-NEXT: movq %rbp, 32(%rdi) -; SSE-NEXT: movq %r12, 40(%rdi) -; SSE-NEXT: movq %r11, 16(%rdi) -; SSE-NEXT: movq %r13, 24(%rdi) -; SSE-NEXT: movq %r15, (%rdi) -; SSE-NEXT: movq %r14, 8(%rdi) -; SSE-NEXT: sete %al -; SSE-NEXT: addq $216, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq -; -; AVX2-LABEL: init_eq_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $200, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, %r8d -; AVX2-NEXT: andl $63, %r8d -; 
AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rsi -; AVX2-NEXT: movq 144(%rsp,%rsi), %r11 -; AVX2-NEXT: movq 152(%rsp,%rsi), %r12 -; AVX2-NEXT: movq %r12, %r10 -; AVX2-NEXT: movl %r8d, %ecx -; AVX2-NEXT: shldq %cl, %r11, %r10 -; AVX2-NEXT: movq 176(%rsp,%rsi), %r14 -; AVX2-NEXT: movq 184(%rsp,%rsi), %r9 -; AVX2-NEXT: shldq %cl, %r14, %r9 -; AVX2-NEXT: movq 160(%rsp,%rsi), %r15 -; AVX2-NEXT: movq 168(%rsp,%rsi), %r13 -; AVX2-NEXT: movq %r13, %rbx -; AVX2-NEXT: shldq %cl, %r15, %rbx -; AVX2-NEXT: movq 128(%rsp,%rsi), %rbp -; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 136(%rsp,%rsi), %rax -; AVX2-NEXT: shldq %cl, %rax, %r11 -; AVX2-NEXT: shldq %cl, %r13, %r14 -; AVX2-NEXT: shldq %cl, %r12, %r15 -; AVX2-NEXT: shldq %cl, %rbp, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl %edx, %edx -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rdx, (%rsp) -; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 16(%rdi), %r12 -; AVX2-NEXT: movq 48(%rdi), %rbp -; AVX2-NEXT: movq 32(%rdi), %r13 -; AVX2-NEXT: andnq %r13, %r15, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r15, %r13 -; AVX2-NEXT: andnq %rbp, %r14, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r14, %rbp -; AVX2-NEXT: andnq %r12, %r11, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r11, %r12 -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: andnq %rax, %rbx, %rcx -; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq %rax, %rbp -; AVX2-NEXT: andq %rbx, %rbp -; AVX2-NEXT: movq 56(%rdi), %rcx -; AVX2-NEXT: 
andnq %rcx, %r9, %rbx -; AVX2-NEXT: andq %r9, %rcx -; AVX2-NEXT: movq 24(%rdi), %rax -; AVX2-NEXT: andnq %rax, %r10, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r10, %rax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: shlxq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT: movq (%rdi), %r10 -; AVX2-NEXT: andnq %r10, %rcx, %r15 -; AVX2-NEXT: andq %rcx, %r10 -; AVX2-NEXT: movq 40(%rsp,%rsi), %rdx -; AVX2-NEXT: movq 48(%rsp,%rsi), %r11 -; AVX2-NEXT: movq %r11, %r9 -; AVX2-NEXT: movl %r8d, %ecx -; AVX2-NEXT: shldq %cl, %rdx, %r9 -; AVX2-NEXT: orq %r13, %r10 -; AVX2-NEXT: orq %r12, %r10 -; AVX2-NEXT: movq 8(%rdi), %r13 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: andnq %r13, %rcx, %r12 -; AVX2-NEXT: andq %rcx, %r13 -; AVX2-NEXT: orq %rbp, %r13 -; AVX2-NEXT: orq %rax, %r13 -; AVX2-NEXT: movq 56(%rsp,%rsi), %rax -; AVX2-NEXT: movl %r8d, %ecx -; AVX2-NEXT: shldq %cl, %r11, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: orq %r9, %r14 -; AVX2-NEXT: orq %rax, %rbx -; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 24(%rsp,%rsi), %rax -; AVX2-NEXT: movq 32(%rsp,%rsi), %r9 -; AVX2-NEXT: movq %r9, %r11 -; AVX2-NEXT: shldq %cl, %rax, %r11 -; AVX2-NEXT: shldq %cl, %r9, %rdx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX2-NEXT: orq %r11, %rbp -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: orq %rdx, %rbx -; AVX2-NEXT: movq 8(%rsp,%rsi), %rdx -; AVX2-NEXT: movq 16(%rsp,%rsi), %r9 -; AVX2-NEXT: movq %r9, %r11 -; AVX2-NEXT: shldq %cl, %rdx, %r11 -; AVX2-NEXT: shldq %cl, %r9, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: orq %r11, %r9 -; AVX2-NEXT: movq (%rsp,%rsi), %rsi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: orq %rax, %r11 -; AVX2-NEXT: shlxq %r8, %rsi, %rax -; AVX2-NEXT: shldq %cl, 
%rsi, %rdx -; AVX2-NEXT: orq %rax, %r15 -; AVX2-NEXT: orq %rdx, %r12 -; AVX2-NEXT: orq %r10, %r13 -; AVX2-NEXT: movq %r14, 48(%rdi) -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: movq %rax, 56(%rdi) -; AVX2-NEXT: movq %rbp, 32(%rdi) -; AVX2-NEXT: movq %rbx, 40(%rdi) -; AVX2-NEXT: movq %r9, 16(%rdi) -; AVX2-NEXT: movq %r11, 24(%rdi) -; AVX2-NEXT: movq %r15, (%rdi) -; AVX2-NEXT: movq %r12, 8(%rdi) -; AVX2-NEXT: sete %al -; AVX2-NEXT: addq $200, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $184, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rsi -; AVX512-NEXT: movq 128(%rsp,%rsi), %r10 -; AVX512-NEXT: movq 136(%rsp,%rsi), %r12 -; AVX512-NEXT: movq %r12, %rax -; AVX512-NEXT: shldq %cl, %r10, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 160(%rsp,%rsi), %r14 -; AVX512-NEXT: movq 168(%rsp,%rsi), %rax -; AVX512-NEXT: shldq %cl, %r14, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 144(%rsp,%rsi), %r15 -; AVX512-NEXT: movq 152(%rsp,%rsi), %r11 -; AVX512-NEXT: movq %r11, %rbx -; AVX512-NEXT: shldq %cl, %r15, %rbx -; AVX512-NEXT: movq 120(%rsp,%rsi), %rax -; 
AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rax, %r10 -; AVX512-NEXT: shldq %cl, %r11, %r14 -; AVX512-NEXT: movq %rdi, %r9 -; AVX512-NEXT: movq 112(%rsp,%rsi), %r11 -; AVX512-NEXT: shldq %cl, %r12, %r15 -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 16(%rdi), %r12 -; AVX512-NEXT: movq 48(%rdi), %r13 -; AVX512-NEXT: movq 32(%rdi), %rbp -; AVX512-NEXT: andnq %rbp, %r15, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r15, %rbp -; AVX512-NEXT: andnq %r13, %r14, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r14, %r13 -; AVX512-NEXT: andnq %r12, %r10, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r10, %r12 -; AVX512-NEXT: movq 40(%rdi), %r8 -; AVX512-NEXT: orq %r13, %r12 -; AVX512-NEXT: andnq %r8, %rbx, %rdi -; AVX512-NEXT: andq %rbx, %r8 -; AVX512-NEXT: movq 56(%r9), %r13 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: andnq %r13, %rdx, %r10 -; AVX512-NEXT: andq %rdx, %r13 -; AVX512-NEXT: movq 24(%r9), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: andnq %rax, %rdx, %r15 -; AVX512-NEXT: andq %rdx, %rax -; AVX512-NEXT: orq %r13, %rax -; AVX512-NEXT: shlxq %rcx, %r11, %r13 -; AVX512-NEXT: movq (%r9), %rdx -; AVX512-NEXT: andnq %rdx, %r13, %r14 -; AVX512-NEXT: andq %r13, %rdx -; AVX512-NEXT: orq %rbp, %rdx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r11, %rbp -; AVX512-NEXT: orq %r12, %rdx -; AVX512-NEXT: movq 8(%r9), %r13 -; 
AVX512-NEXT: andnq %r13, %rbp, %rbx -; AVX512-NEXT: andq %rbp, %r13 -; AVX512-NEXT: orq %r8, %r13 -; AVX512-NEXT: movq 24(%rsp,%rsi), %r8 -; AVX512-NEXT: orq %rax, %r13 -; AVX512-NEXT: movq 32(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, %r12 -; AVX512-NEXT: shldq %cl, %r8, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: orq %r12, %r11 -; AVX512-NEXT: movq 40(%rsp,%rsi), %r12 -; AVX512-NEXT: shldq %cl, %rax, %r12 -; AVX512-NEXT: orq %r12, %r10 -; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 8(%rsp,%rsi), %rax -; AVX512-NEXT: movq 16(%rsp,%rsi), %r12 -; AVX512-NEXT: movq %r12, %rbp -; AVX512-NEXT: shldq %cl, %rax, %rbp -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: orq %rbp, %r10 -; AVX512-NEXT: shldq %cl, %r12, %r8 -; AVX512-NEXT: orq %r8, %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq -8(%rsp,%rsi), %r8 -; AVX512-NEXT: movq (%rsp,%rsi), %r12 -; AVX512-NEXT: movq %r12, %rbp -; AVX512-NEXT: shldq %cl, %r8, %rbp -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT: orq %rbp, %rdi -; AVX512-NEXT: movq -16(%rsp,%rsi), %rsi -; AVX512-NEXT: shldq %cl, %r12, %rax -; AVX512-NEXT: orq %rax, %r15 -; AVX512-NEXT: shlxq %rcx, %rsi, %rax -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rsi, %r8 -; AVX512-NEXT: orq %rax, %r14 -; AVX512-NEXT: orq %r8, %rbx -; AVX512-NEXT: orq %rdx, %r13 -; AVX512-NEXT: movq %r11, 48(%r9) -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: movq %rax, 56(%r9) -; AVX512-NEXT: movq %r10, 32(%r9) -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: movq %rax, 40(%r9) -; AVX512-NEXT: movq %rdi, 16(%r9) -; AVX512-NEXT: movq %r15, 24(%r9) -; AVX512-NEXT: movq %r14, (%r9) -; AVX512-NEXT: movq %rbx, 8(%r9) -; AVX512-NEXT: sete %al -; AVX512-NEXT: addq $184, %rsp -; 
AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %mask = xor i512 %bit, -1 - %val0 = zext i1 %value to i512 - %val = shl nuw i512 %val0, %ofs - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res0 = and i512 %ld, %mask - %res = or i512 %res0, %val - %cmp = icmp eq i512 %test, 0 - store i512 %res, ptr %word - ret i1 %cmp -} - -; i4096 - -define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { -; X86-LABEL: test_ne_i4096: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $1792, %esp # imm = 0x700 -; X86-NEXT: movl 12(%ebp), %ebx -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: shrl $3, %ecx -; X86-NEXT: andl $508, %ecx # imm = 0x1FC -; X86-NEXT: leal {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl %ecx, %esi -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; 
X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: 
movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 248(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 252(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ebx -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 504(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 508(%esi), %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 120(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 124(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, 
%edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 376(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 380(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 184(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 188(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 440(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 444(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 312(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 316(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 216(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 220(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 472(%esi), %edi -; X86-NEXT: movl 476(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, 
%eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 88(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 92(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 344(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 348(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 152(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 156(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 408(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 412(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 280(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 284(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 232(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 236(%esi), %eax -; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 488(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 492(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 104(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 108(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 360(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 364(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 168(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 172(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 424(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 428(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 296(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; X86-NEXT: movl 300(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 200(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 204(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 456(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 460(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 72(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 76(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 328(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 332(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 136(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 140(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 392(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 396(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), 
%edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 264(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 268(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 240(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 244(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 496(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 500(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 112(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 116(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 368(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 372(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 176(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 180(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 432(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 436(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 304(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 308(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 208(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 212(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 464(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 468(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 80(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 84(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 336(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 340(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 144(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 148(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 400(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 404(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 272(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 276(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 224(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 228(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 480(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 484(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 96(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 100(%esi), 
%eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 352(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 356(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 160(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 164(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 416(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 420(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 288(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 292(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 192(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 196(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 448(%esi), %edx -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 452(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 64(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 320(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 324(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 128(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 132(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %edx -; X86-NEXT: movl 256(%esi), %edi -; X86-NEXT: movl 260(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 388(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 4(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, 
%edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shrdl $1, %eax, %edi -; X86-NEXT: shrl %eax -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: notb %cl -; X86-NEXT: shrdl %cl, %eax, %edi -; X86-NEXT: shrl %cl, %ebx -; X86-NEXT: movb $32, %cl -; X86-NEXT: testb %cl, %cl -; X86-NEXT: movl (%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: jne .LBB20_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: .LBB20_2: -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shll %cl, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 320(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 64(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 448(%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 192(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 288(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 
32(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 416(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 160(%eax), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 352(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 96(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 480(%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 224(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: orl %edi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 272(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 16(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 400(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 144(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 336(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 80(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 464(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 208(%eax), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 304(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%edx # 4-byte Reload -; X86-NEXT: andl 48(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 432(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 176(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 368(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 112(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 496(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: andl 240(%eax), %ebx -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 264(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 8(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 392(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 136(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 328(%ebx), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 72(%ebx), %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 456(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 200(%ebx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi 
# 4-byte Reload -; X86-NEXT: andl 296(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 40(%ebx), %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 424(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 168(%ebx), %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 360(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 104(%ebx), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 488(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 232(%ebx), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 280(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 24(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 408(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 152(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 344(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 88(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 472(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 216(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; 
X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 312(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 56(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 440(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 184(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 376(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 120(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 504(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 248(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 324(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 68(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 452(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 196(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 292(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 36(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
4-byte Reload -; X86-NEXT: andl 420(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 164(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 356(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 100(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 484(%ebx), %eax +; X86-NEXT: notl %ebx +; X86-NEXT: movl 16(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 20(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 12(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 16(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 8(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 12(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 4(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 8(%edx), %ebx +; X86-NEXT: orl %esi, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 228(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 276(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 20(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 404(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 148(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 340(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 84(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 468(%ebx), %eax +; X86-NEXT: notl %esi +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: andl 4(%edx), %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: movl %esi, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 212(%ebx), %esi +; X86-NEXT: notl %esi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: andl (%edx), %esi ; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 308(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 52(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 436(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 180(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 372(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 116(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 500(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 244(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, 
%edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 268(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 12(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 396(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 140(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 332(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 76(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl (%edx,%eax), %eax +; X86-NEXT: btl %ecx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 460(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 204(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl %eax, 60(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 300(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 44(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %eax, 56(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 428(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 172(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl %eax, 52(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 364(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 108(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %eax, 48(%edx) ; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 492(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 236(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, 44(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 284(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 28(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %eax, 40(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 412(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 156(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi +; X86-NEXT: movl %eax, 36(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 348(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 92(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 476(%ebx), %ecx +; X86-NEXT: movl %eax, 32(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 220(%ebx), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 316(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 60(%ebx), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 444(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 188(%ebx), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 
4-byte Reload -; X86-NEXT: andl 380(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 124(%ebx), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 508(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: andl 252(%esi), %ebx -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: negl %ecx -; X86-NEXT: movl 1648(%esp,%ecx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: movl %eax, 28(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: andl 128(%edx), %ecx -; X86-NEXT: andl 384(%edx), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: andl (%edx), %eax -; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl %eax, 24(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 256(%edx), %eax -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %eax, 20(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 260(%edx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 4(%edx), %ecx -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %eax, 16(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 132(%edx), %eax -; X86-NEXT: andl 388(%edx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl %ebx, %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: test_ne_i4096: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $1576, %rsp # imm = 0x628 -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl %esi, %eax -; SSE-NEXT: andl $4032, %eax # imm = 0xFC0 -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) 
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps 
%xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: negl %eax -; SSE-NEXT: movslq %eax, %rsi -; SSE-NEXT: movq 1296(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1304(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1552(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1560(%rsp,%rsi), %rax -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1168(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1176(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1424(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1432(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1232(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1240(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1488(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1496(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq 
%cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1104(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1112(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1360(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; SSE-NEXT: movq 1368(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1264(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1272(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1520(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1528(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1136(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1144(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1392(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1400(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1200(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill -; SSE-NEXT: movq 1208(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1456(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1464(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1072(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1080(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1328(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1336(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1280(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1288(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1536(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1544(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1152(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1160(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; 
SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1408(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1416(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1216(%rsp,%rsi), %r11 -; SSE-NEXT: movq 1224(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %r11, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1472(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1480(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1088(%rsp,%rsi), %r9 -; SSE-NEXT: movq 1096(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %r9, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1344(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1352(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax +; X86-NEXT: movl %eax, 12(%edx) +; X86-NEXT: movl %ebx, 8(%edx) +; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %esi, (%edx) +; X86-NEXT: setae %al +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: init_eq_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $184, %rsp +; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: 
xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: movl %esi, %eax +; SSE-NEXT: shrl $3, %eax ; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1248(%rsp,%rsi), %rax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: andl $56, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: movslq %eax, %r12 +; SSE-NEXT: movq 160(%rsp,%r12), %rax +; SSE-NEXT: movq 168(%rsp,%r12), %r10 +; SSE-NEXT: shldq %cl, %rax, %r10 +; SSE-NEXT: movq 152(%rsp,%r12), %rsi +; SSE-NEXT: shldq %cl, %rsi, %rax ; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1256(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 144(%rsp,%r12), %r11 +; SSE-NEXT: shldq %cl, %r11, %rsi +; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 136(%rsp,%r12), %rbx +; SSE-NEXT: shldq %cl, %rbx, %r11 +; SSE-NEXT: movq 128(%rsp,%r12), %r14 +; SSE-NEXT: shldq %cl, %r14, %rbx +; SSE-NEXT: movq 120(%rsp,%r12), %r15 +; SSE-NEXT: shldq %cl, %r15, %r14 +; SSE-NEXT: movq 112(%rsp,%r12), %r13 +; SSE-NEXT: shldq %cl, %r13, %r15 +; SSE-NEXT: shlq %cl, %r13 +; SSE-NEXT: movl %edx, %eax +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) +; 
SSE-NEXT: movq 32(%rsp,%r12), %rax +; SSE-NEXT: movq 40(%rsp,%r12), %rdx ; SSE-NEXT: shldq %cl, %rax, %rdx ; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1504(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1512(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 24(%rsp,%r12), %rdx ; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1120(%rsp,%rsi), %rax -; SSE-NEXT: movq 1128(%rsp,%rsi), %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rax, %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1376(%rsp,%rsi), %r13 -; SSE-NEXT: movq 1384(%rsp,%rsi), %rbx -; SSE-NEXT: movq %rbx, %r8 -; SSE-NEXT: shldq %cl, %r13, %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1184(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1192(%rsp,%rsi), %r15 -; SSE-NEXT: movq %r15, %r14 -; SSE-NEXT: shldq %cl, %rdx, %r14 -; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1440(%rsp,%rsi), %r10 -; SSE-NEXT: movq 1448(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, %r14 -; SSE-NEXT: shldq %cl, %r10, %r14 -; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1312(%rsp,%rsi), %r14 -; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1320(%rsp,%rsi), %rbp -; SSE-NEXT: movq %rbp, %r12 -; SSE-NEXT: shldq %cl, %r14, %r12 -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; 
SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, (%rsp) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq 1064(%rsp,%rsi), %rbx -; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE-NEXT: shldq %cl, %rbp, %r14 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rdx, %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %r9 -; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %rbp -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %r13 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r12, %r15 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r12, %r10 -; SSE-NEXT: andq 384(%rdi), %r10 -; SSE-NEXT: andq 128(%rdi), %r15 -; SSE-NEXT: andq 320(%rdi), %r13 -; SSE-NEXT: andq 64(%rdi), %rax -; SSE-NEXT: orq %r10, %r15 -; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: orq %r13, %rax -; SSE-NEXT: andq 448(%rdi), %r9 -; SSE-NEXT: andq 192(%rdi), %rbp -; SSE-NEXT: orq %r9, %rbp 
-; SSE-NEXT: orq %rax, %rbp -; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq 288(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 32(%rdi), %r9 -; SSE-NEXT: andq 416(%rdi), %rdx -; SSE-NEXT: andq 160(%rdi), %r11 -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: orq %rdx, %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 352(%rdi), %rdx -; SSE-NEXT: orq %r9, %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 96(%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 480(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 224(%rdi), %r8 -; SSE-NEXT: orq %rax, %r8 -; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq 272(%rdi), %r14 -; SSE-NEXT: orq %r11, %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 16(%rdi), %rax -; SSE-NEXT: orq %r14, %rax -; SSE-NEXT: movq %rax, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 400(%rdi), %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 144(%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: orq %r8, %rax -; SSE-NEXT: movq %rax, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 336(%rdi), %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 80(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 464(%rdi), %rdx -; SSE-NEXT: orq %r9, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: andq 208(%rdi), %r11 -; SSE-NEXT: orq %rdx, %r11 -; SSE-NEXT: orq %rax, %r11 -; SSE-NEXT: orq %r8, %r11 -; SSE-NEXT: movq (%rsp), %rdx # 8-byte Reload -; SSE-NEXT: 
andq 304(%rdi), %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 48(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 432(%rdi), %r9 -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 176(%rdi), %r8 -; SSE-NEXT: orq %r9, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 368(%rdi), %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 112(%rdi), %rax -; SSE-NEXT: orq %r10, %r8 -; SSE-NEXT: movq %r8, %r10 -; SSE-NEXT: orq %r9, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 496(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE-NEXT: andq 240(%rdi), %rbp -; SSE-NEXT: orq %r8, %rbp -; SSE-NEXT: orq %rax, %rbp -; SSE-NEXT: orq %r10, %rbp -; SSE-NEXT: orq %r11, %rbp -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 392(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT: andq 136(%rdi), %r12 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 328(%rdi), %rdx -; SSE-NEXT: orq %rax, %r12 +; SSE-NEXT: movq 16(%rsp,%r12), %rsi +; SSE-NEXT: shldq %cl, %rsi, %rdx +; SSE-NEXT: movq 8(%rsp,%r12), %r8 +; SSE-NEXT: shldq %cl, %r8, %rsi +; SSE-NEXT: movq (%rsp,%r12), %rbp +; SSE-NEXT: shldq %cl, %rbp, %r8 +; SSE-NEXT: movq -8(%rsp,%r12), %r9 +; SSE-NEXT: shldq %cl, %r9, %rbp +; SSE-NEXT: notq %r10 +; SSE-NEXT: andq 56(%rdi), %r10 +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload +; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE-NEXT: notq %r10 +; SSE-NEXT: andq 48(%rdi), %r10 +; SSE-NEXT: orq %rax, %r10 ; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; 
SSE-NEXT: andq 72(%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: andq 40(%rdi), %rax ; SSE-NEXT: orq %rdx, %rax ; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 456(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; SSE-NEXT: andq 200(%rdi), %r13 +; SSE-NEXT: notq %r11 +; SSE-NEXT: andq 32(%rdi), %r11 +; SSE-NEXT: orq %rsi, %r11 +; SSE-NEXT: notq %rbx +; SSE-NEXT: andq 24(%rdi), %rbx +; SSE-NEXT: orq %r8, %rbx +; SSE-NEXT: notq %r14 +; SSE-NEXT: andq 16(%rdi), %r14 +; SSE-NEXT: orq %rbp, %r14 +; SSE-NEXT: notq %r15 +; SSE-NEXT: movq -16(%rsp,%r12), %rax +; SSE-NEXT: shldq %cl, %rax, %r9 +; SSE-NEXT: andq 8(%rdi), %r15 +; SSE-NEXT: orq %r9, %r15 +; SSE-NEXT: notq %r13 +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shlq %cl, %rax +; SSE-NEXT: andq (%rdi), %r13 ; SSE-NEXT: orq %rax, %r13 -; SSE-NEXT: orq %rdx, %r13 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 296(%rdi), %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 40(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 424(%rdi), %r8 -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 168(%rdi), %rdx -; SSE-NEXT: orq %r8, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 360(%rdi), %r8 ; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 104(%rdi), %rax -; SSE-NEXT: orq %r9, %rdx -; SSE-NEXT: orq %r8, %rax -; SSE-NEXT: movq %rax, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 488(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: andq 232(%rdi), %r15 -; SSE-NEXT: orq %rax, %r15 -; SSE-NEXT: orq %r8, %r15 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte 
Reload -; SSE-NEXT: andq 280(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 24(%rdi), %rax -; SSE-NEXT: orq %rdx, %r15 -; SSE-NEXT: orq %r8, %rax -; SSE-NEXT: movq %rax, %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 408(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 152(%rdi), %rax -; SSE-NEXT: orq %r8, %rax -; SSE-NEXT: orq %r10, %rax -; SSE-NEXT: movq %rax, %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: andq 344(%rdi), %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 88(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 472(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE-NEXT: andq 216(%rdi), %r14 -; SSE-NEXT: orq %r11, %r8 -; SSE-NEXT: orq %rax, %r14 -; SSE-NEXT: orq %r8, %r14 -; SSE-NEXT: orq %r10, %r14 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: andq 312(%rdi), %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE-NEXT: andq 56(%rdi), %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 440(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 184(%rdi), %r9 -; SSE-NEXT: orq %r11, %r10 -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: orq %r10, %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE-NEXT: andq 376(%rdi), %r10 +; SSE-NEXT: andl $60, %eax +; SSE-NEXT: movl (%rdi,%rax), %eax +; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; SSE-NEXT: btl %ecx, %eax ; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 120(%rdi), %rax -; 
SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: andq 504(%rdi), %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 248(%rdi), %r8 -; SSE-NEXT: orq %r10, %rax -; SSE-NEXT: movq %rax, %r10 -; SSE-NEXT: orq %r11, %r8 -; SSE-NEXT: movq 1056(%rsp,%rsi), %rax -; SSE-NEXT: shldq %cl, %rax, %rbx -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: orq %r10, %r8 -; SSE-NEXT: orq %r9, %r8 -; SSE-NEXT: andq 256(%rdi), %rdx -; SSE-NEXT: orq %r14, %r8 -; SSE-NEXT: andq (%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT: orq %rbp, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE-NEXT: andq 264(%rdi), %rcx -; SSE-NEXT: andq 8(%rdi), %rbx -; SSE-NEXT: orq %rcx, %rbx -; SSE-NEXT: orq %r12, %rbx -; SSE-NEXT: orq %r13, %rbx -; SSE-NEXT: orq %r15, %rbx -; SSE-NEXT: orq %r8, %rbx -; SSE-NEXT: orq %rax, %rbx -; SSE-NEXT: setne %al -; SSE-NEXT: addq $1576, %rsp # imm = 0x628 +; SSE-NEXT: movq %rax, 56(%rdi) +; SSE-NEXT: movq %r10, 48(%rdi) +; SSE-NEXT: movq %rdx, 40(%rdi) +; SSE-NEXT: movq %r11, 32(%rdi) +; SSE-NEXT: movq %rbx, 24(%rdi) +; SSE-NEXT: movq %r14, 16(%rdi) +; SSE-NEXT: movq %r15, 8(%rdi) +; SSE-NEXT: movq %r13, (%rdi) +; SSE-NEXT: setae %al +; SSE-NEXT: addq $184, %rsp ; SSE-NEXT: popq %rbx ; SSE-NEXT: popq %r12 ; SSE-NEXT: popq %r13 @@ -6017,7 +1405,7 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { ; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; -; AVX2-LABEL: test_ne_i4096: +; AVX2-LABEL: init_eq_i512: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbp ; AVX2-NEXT: pushq %r15 @@ -6025,490 +1413,103 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { ; AVX2-NEXT: pushq %r13 ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx 
-; AVX2-NEXT: subq $1560, %rsp # imm = 0x618 -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: subq $168, %rsp ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: andl $4032, %eax # imm = 0xFC0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %esi, %ecx ; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: movl 
%esi, %r11d +; AVX2-NEXT: shrl $3, %r11d +; AVX2-NEXT: movl %r11d, %eax +; AVX2-NEXT: andl $56, %eax ; AVX2-NEXT: negl %eax -; AVX2-NEXT: movslq %eax, %rsi -; AVX2-NEXT: movq 1280(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1288(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1536(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1544(%rsp,%rsi), %rax -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1152(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1160(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1408(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1416(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1216(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; AVX2-NEXT: movq 1224(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1472(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1480(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1088(%rsp,%rsi), %rdx -; 
AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1096(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1344(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1352(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1248(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1256(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1504(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1512(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1120(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1128(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1376(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1384(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1184(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1192(%rsp,%rsi), %rax -; AVX2-NEXT: movq 
%rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1440(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1448(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1056(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1064(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1312(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1320(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1264(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1272(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1520(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1528(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1136(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1144(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1392(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1400(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1200(%rsp,%rsi), %r11 -; AVX2-NEXT: movq 1208(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %r11, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1456(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1464(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1072(%rsp,%rsi), %r12 -; AVX2-NEXT: movq 1080(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %r12, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1328(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1336(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1232(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1240(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1488(%rsp,%rsi), %rbp -; AVX2-NEXT: movq 1496(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rbp, %rax -; AVX2-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1104(%rsp,%rsi), %rax -; AVX2-NEXT: movq 1112(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movslq %eax, %r10 +; AVX2-NEXT: movq 104(%rsp,%r10), %r15 +; AVX2-NEXT: movq 112(%rsp,%r10), %rax +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shldq %cl, %r15, %rsi +; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 120(%rsp,%r10), %rsi +; AVX2-NEXT: movq %rsi, %r8 +; AVX2-NEXT: shldq %cl, %rax, %r8 +; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 128(%rsp,%r10), %rax +; AVX2-NEXT: movq %rax, %rbx +; AVX2-NEXT: shldq %cl, %rsi, %rbx +; AVX2-NEXT: movq 136(%rsp,%r10), %rsi +; AVX2-NEXT: movq %rsi, %r14 +; AVX2-NEXT: shldq %cl, %rax, %r14 +; AVX2-NEXT: movq 144(%rsp,%r10), %rax +; AVX2-NEXT: movq %rax, %r12 +; AVX2-NEXT: shldq %cl, %rsi, %r12 +; AVX2-NEXT: movq 96(%rsp,%r10), %rsi +; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 152(%rsp,%r10), %r13 +; AVX2-NEXT: shldq %cl, %rax, %r13 +; AVX2-NEXT: shldq %cl, %rsi, %r15 +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movq 16(%rsp,%r10), %rbp +; AVX2-NEXT: movq 24(%rsp,%r10), %r9 +; AVX2-NEXT: shldq %cl, %rbp, %r9 +; AVX2-NEXT: movq 8(%rsp,%r10), %rdx +; AVX2-NEXT: shldq %cl, %rdx, %rbp +; AVX2-NEXT: movq (%rsp,%r10), %rax ; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1360(%rsp,%rsi), %r10 -; AVX2-NEXT: movq 1368(%rsp,%rsi), %r8 -; AVX2-NEXT: movq %r8, %rdx -; AVX2-NEXT: shldq %cl, %r10, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill -; AVX2-NEXT: movq 1168(%rsp,%rsi), %r9 -; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1176(%rsp,%rsi), %rbx -; AVX2-NEXT: movq %rbx, %rdx -; AVX2-NEXT: shldq %cl, %r9, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1424(%rsp,%rsi), %r9 -; AVX2-NEXT: movq 1432(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, %r14 -; AVX2-NEXT: shldq %cl, %r9, %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1296(%rsp,%rsi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1304(%rsp,%rsi), %r14 -; AVX2-NEXT: movq %r14, %r13 -; AVX2-NEXT: shldq %cl, %r15, %r13 -; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, (%rsp) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq 
%cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq 1048(%rsp,%rsi), %rdx -; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %rbx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r13 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %rbp -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, %r9 -; AVX2-NEXT: andq 384(%rdi), %r9 -; AVX2-NEXT: andq 128(%rdi), %r14 -; AVX2-NEXT: andq 320(%rdi), %r10 -; AVX2-NEXT: orq %r9, %r14 -; AVX2-NEXT: movq %r14, %r15 -; AVX2-NEXT: andq 64(%rdi), %rax -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: andq 448(%rdi), %rbp -; AVX2-NEXT: andq 192(%rdi), %r13 -; AVX2-NEXT: orq %rbp, %r13 -; AVX2-NEXT: orq %rax, %r13 -; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq 288(%rdi), %r8 -; AVX2-NEXT: andq 32(%rdi), %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 416(%rdi), %rax -; AVX2-NEXT: orq %r8, %r12 -; AVX2-NEXT: andq 160(%rdi), %r11 -; AVX2-NEXT: orq %rax, %r11 -; AVX2-NEXT: andq 352(%rdi), %rbx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 96(%rdi), %rax -; AVX2-NEXT: orq %r12, %r11 -; AVX2-NEXT: orq %rbx, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 480(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT: andq 224(%rdi), %r13 -; AVX2-NEXT: orq %r10, %r13 -; AVX2-NEXT: orq %rax, %r13 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 272(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 16(%rdi), %rax -; 
AVX2-NEXT: orq %r11, %r13 -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: andq 400(%rdi), %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 144(%rdi), %rax -; AVX2-NEXT: orq %r9, %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 336(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 80(%rdi), %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 464(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: andq 208(%rdi), %r11 -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: orq %r8, %r11 -; AVX2-NEXT: orq %rax, %r11 -; AVX2-NEXT: orq %r9, %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: andq 304(%rdi), %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 48(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 432(%rdi), %r10 -; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload -; AVX2-NEXT: andq 176(%rdi), %rax -; AVX2-NEXT: orq %r9, %r8 -; AVX2-NEXT: movq %r8, %r9 -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 368(%rdi), %r8 -; AVX2-NEXT: orq %r9, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 112(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 496(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: andq 240(%rdi), %r9 -; AVX2-NEXT: orq %r8, %r9 -; AVX2-NEXT: orq %rax, %r9 -; AVX2-NEXT: orq %r10, %r9 -; AVX2-NEXT: orq %r11, %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 
8-byte Reload -; AVX2-NEXT: andq 392(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX2-NEXT: andq 136(%rdi), %rbp -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 328(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 72(%rdi), %rax -; AVX2-NEXT: orq %r10, %rbp -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 456(%rdi), %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; AVX2-NEXT: andq 200(%rdi), %r12 -; AVX2-NEXT: orq %rax, %r12 -; AVX2-NEXT: orq %r8, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 296(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 40(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: andq 424(%rdi), %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 168(%rdi), %rax -; AVX2-NEXT: orq %r10, %r8 -; AVX2-NEXT: movq %r8, %r10 -; AVX2-NEXT: orq %r11, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 360(%rdi), %r8 -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 104(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 488(%rdi), %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: andq 232(%rdi), %r14 -; AVX2-NEXT: orq %rax, %r14 -; AVX2-NEXT: orq %r8, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 280(%rdi), %r8 -; AVX2-NEXT: orq %r10, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 24(%rdi), %rax -; AVX2-NEXT: 
orq %r8, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 408(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 152(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: andq 344(%rdi), %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 88(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 472(%rdi), %rax -; AVX2-NEXT: orq %r11, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: andq 216(%rdi), %rbx -; AVX2-NEXT: orq %rax, %rbx -; AVX2-NEXT: orq %r8, %rbx -; AVX2-NEXT: orq %r10, %rbx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 312(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 56(%rdi), %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 440(%rdi), %r10 -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 184(%rdi), %r8 -; AVX2-NEXT: orq %r10, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 376(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 120(%rdi), %rax -; AVX2-NEXT: orq %r11, %r8 -; AVX2-NEXT: movq %r8, %r11 -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 504(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 248(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r8, %r10 -; AVX2-NEXT: orq %r11, %rax -; AVX2-NEXT: movq 1040(%rsp,%rsi), %rsi -; AVX2-NEXT: orq %rbx, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: shlxq %rcx, %rsi, %rax -; AVX2-NEXT: andq 256(%rdi), %r10 -; AVX2-NEXT: andq (%rdi), %rax -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: orq %r15, %rax -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; AVX2-NEXT: orq %r13, %rax +; AVX2-NEXT: movq -8(%rsp,%r10), %r8 +; AVX2-NEXT: shldq %cl, %r8, %rax +; AVX2-NEXT: movq -16(%rsp,%r10), %rsi +; AVX2-NEXT: shldq %cl, %rsi, %r8 +; AVX2-NEXT: andnq 56(%rdi), %r13, %r13 +; AVX2-NEXT: orq %r9, %r13 +; AVX2-NEXT: movq -24(%rsp,%r10), %r9 +; AVX2-NEXT: shldq %cl, %r9, %rsi +; AVX2-NEXT: andnq 48(%rdi), %r12, %r12 +; AVX2-NEXT: andnq 40(%rdi), %r14, %r14 +; AVX2-NEXT: orq %rbp, %r12 +; AVX2-NEXT: orq %rdx, %r14 +; AVX2-NEXT: andnq 32(%rdi), %rbx, %rdx +; AVX2-NEXT: orq %rax, %rdx +; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX2-NEXT: movq -32(%rsp,%r10), %r10 +; AVX2-NEXT: shlxq %rcx, %r10, %rbx ; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %rsi, %rdx +; AVX2-NEXT: shldq %cl, %r10, %r9 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: andq 264(%rdi), %rcx -; AVX2-NEXT: andq 8(%rdi), %rdx -; AVX2-NEXT: orq %r9, %rax -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: orq %rbp, %rdx -; AVX2-NEXT: orq %r12, %rdx -; AVX2-NEXT: orq %r14, %rdx -; AVX2-NEXT: orq %r8, %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: setne %al -; AVX2-NEXT: addq $1560, %rsp # imm = 0x618 +; AVX2-NEXT: andnq 24(%rdi), %rcx, %rcx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: andnq 16(%rdi), %r10, %r10 +; AVX2-NEXT: orq %r8, %rcx +; AVX2-NEXT: orq %rsi, %r10 +; AVX2-NEXT: andnq 8(%rdi), %r15, %rsi +; AVX2-NEXT: orq %r9, %rsi +; AVX2-NEXT: andnq (%rdi), %rax, %rax +; AVX2-NEXT: orq %rbx, %rax 
+; AVX2-NEXT: andl $60, %r11d +; AVX2-NEXT: movl (%rdi,%r11), %r8d +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload +; AVX2-NEXT: btl %r9d, %r8d +; AVX2-NEXT: movq %r13, 56(%rdi) +; AVX2-NEXT: movq %r12, 48(%rdi) +; AVX2-NEXT: movq %r14, 40(%rdi) +; AVX2-NEXT: movq %rdx, 32(%rdi) +; AVX2-NEXT: movq %rcx, 24(%rdi) +; AVX2-NEXT: movq %r10, 16(%rdi) +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: movq %rax, (%rdi) +; AVX2-NEXT: setae %al +; AVX2-NEXT: addq $168, %rsp ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %r12 ; AVX2-NEXT: popq %r13 @@ -6518,7 +1519,7 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_ne_i4096: +; AVX512-LABEL: init_eq_i512: ; AVX512: # %bb.0: ; AVX512-NEXT: pushq %rbp ; AVX512-NEXT: pushq %r15 @@ -6526,489 +1527,100 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { ; AVX512-NEXT: pushq %r13 ; AVX512-NEXT: pushq %r12 ; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $1560, %rsp # imm = 0x618 -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: andl $4032, %eax # imm = 0xFC0 +; AVX512-NEXT: subq $152, %rsp ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: 
vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0] +; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %esi, %ecx ; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %eax +; AVX512-NEXT: movl %esi, %r8d +; AVX512-NEXT: shrl $3, %r8d +; AVX512-NEXT: movl %r8d, %eax +; AVX512-NEXT: andl $56, %eax ; AVX512-NEXT: negl %eax -; AVX512-NEXT: movslq %eax, %rsi -; AVX512-NEXT: movq 1280(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1288(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1536(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1544(%rsp,%rsi), %rax -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1152(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1160(%rsp,%rsi), %rax -; AVX512-NEXT: movq 
%rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1408(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1416(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1216(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; AVX512-NEXT: movq 1224(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1472(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1480(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1088(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1096(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1344(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1352(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1248(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1256(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq 
%cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1504(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1512(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1120(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1128(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1376(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1384(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1184(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1192(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1440(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1448(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1056(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1064(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1312(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1320(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1264(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1272(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1520(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1528(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1136(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1144(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1392(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1400(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1200(%rsp,%rsi), %r10 -; AVX512-NEXT: movq 1208(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %r10, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1456(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1464(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1072(%rsp,%rsi), %r14 -; AVX512-NEXT: movq 1080(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %r14, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1328(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1336(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1232(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1240(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1488(%rsp,%rsi), %r12 -; AVX512-NEXT: movq 1496(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %r12, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1104(%rsp,%rsi), %rax -; AVX512-NEXT: movq 1112(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1360(%rsp,%rsi), %r11 -; AVX512-NEXT: movq 1368(%rsp,%rsi), %rbx -; AVX512-NEXT: movq %rbx, %rdx -; AVX512-NEXT: shldq %cl, %r11, %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1168(%rsp,%rsi), %r9 -; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill -; AVX512-NEXT: movq 1176(%rsp,%rsi), %r8 -; AVX512-NEXT: movq %r8, %rdx -; AVX512-NEXT: shldq %cl, %r9, %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1424(%rsp,%rsi), %r9 -; AVX512-NEXT: movq 1432(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, %r15 -; AVX512-NEXT: shldq %cl, %r9, %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1296(%rsp,%rsi), %rbp -; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1304(%rsp,%rsi), %r15 -; AVX512-NEXT: movq %r15, %r13 -; AVX512-NEXT: shldq %cl, %rbp, %r13 -; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, (%rsp) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq 1048(%rsp,%rsi), %rdx -; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %rbx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: movslq %eax, %r9 +; AVX512-NEXT: movq 88(%rsp,%r9), %r10 +; AVX512-NEXT: movq 96(%rsp,%r9), %rax +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: shldq %cl, %r10, %rsi +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 104(%rsp,%r9), %rsi +; AVX512-NEXT: movq %rsi, %r11 +; AVX512-NEXT: shldq %cl, %rax, %r11 +; AVX512-NEXT: movq 112(%rsp,%r9), %rax +; AVX512-NEXT: movq %rax, %rbx +; AVX512-NEXT: shldq %cl, %rsi, %rbx +; AVX512-NEXT: movq 120(%rsp,%r9), %rsi +; AVX512-NEXT: movq %rsi, %r14 +; 
AVX512-NEXT: shldq %cl, %rax, %r14 +; AVX512-NEXT: movq 128(%rsp,%r9), %rax +; AVX512-NEXT: movq %rax, %r12 +; AVX512-NEXT: shldq %cl, %rsi, %r12 +; AVX512-NEXT: movq 136(%rsp,%r9), %r13 +; AVX512-NEXT: shldq %cl, %rax, %r13 +; AVX512-NEXT: movq 80(%rsp,%r9), %r15 ; AVX512-NEXT: shldq %cl, %r15, %r10 +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movq (%rsp,%r9), %rbp +; AVX512-NEXT: movq 8(%rsp,%r9), %rsi +; AVX512-NEXT: shldq %cl, %rbp, %rsi +; AVX512-NEXT: movq -8(%rsp,%r9), %rdx +; AVX512-NEXT: shldq %cl, %rdx, %rbp +; AVX512-NEXT: movq -16(%rsp,%r9), %rax +; AVX512-NEXT: shldq %cl, %rax, %rdx +; AVX512-NEXT: andnq 56(%rdi), %r13, %r13 +; AVX512-NEXT: andnq 48(%rdi), %r12, %r12 +; AVX512-NEXT: orq %rsi, %r13 +; AVX512-NEXT: orq %rbp, %r12 +; AVX512-NEXT: andnq 40(%rdi), %r14, %r14 +; AVX512-NEXT: orq %rdx, %r14 +; AVX512-NEXT: movq -24(%rsp,%r9), %rsi +; AVX512-NEXT: shldq %cl, %rsi, %rax +; AVX512-NEXT: andnq 32(%rdi), %rbx, %rdx +; AVX512-NEXT: orq %rax, %rdx +; AVX512-NEXT: movq -32(%rsp,%r9), %rax +; AVX512-NEXT: shldq %cl, %rax, %rsi +; AVX512-NEXT: shlxq %rcx, %r15, %rbx +; AVX512-NEXT: andnq 24(%rdi), %r11, %r11 +; AVX512-NEXT: orq %rsi, %r11 +; AVX512-NEXT: movq -48(%rsp,%r9), %rsi +; AVX512-NEXT: movq -40(%rsp,%r9), %r9 +; AVX512-NEXT: shldq %cl, %r9, %rax ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r14 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte 
Reload -; AVX512-NEXT: shldq %cl, %r15, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r13 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbp, %r15 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbp, %r9 -; AVX512-NEXT: andq 384(%rdi), %r9 -; AVX512-NEXT: andq 128(%rdi), %r15 -; AVX512-NEXT: orq %r9, %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq 320(%rdi), %r11 -; AVX512-NEXT: andq 64(%rdi), %rax -; AVX512-NEXT: orq %r11, %rax -; AVX512-NEXT: andq 448(%rdi), %r12 -; AVX512-NEXT: andq 192(%rdi), %r13 -; AVX512-NEXT: orq %r12, %r13 -; AVX512-NEXT: orq %rax, %r13 -; AVX512-NEXT: andq 288(%rdi), %r8 -; AVX512-NEXT: andq 32(%rdi), %r14 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 416(%rdi), %rax -; AVX512-NEXT: orq %r8, %r14 -; AVX512-NEXT: andq 160(%rdi), %r10 -; AVX512-NEXT: orq %rax, %r10 -; AVX512-NEXT: andq 352(%rdi), %rbx -; AVX512-NEXT: orq %r14, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 96(%rdi), %rax -; AVX512-NEXT: orq %rbx, %rax -; AVX512-NEXT: movq %rax, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 480(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: andq 224(%rdi), %r15 +; AVX512-NEXT: andnq 16(%rdi), %r15, %r15 ; 
AVX512-NEXT: orq %rax, %r15 -; AVX512-NEXT: orq %r8, %r15 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 272(%rdi), %r8 -; AVX512-NEXT: orq %r10, %r15 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 16(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: andq 400(%rdi), %r9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 144(%rdi), %rax -; AVX512-NEXT: orq %r9, %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: andq 336(%rdi), %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 80(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 464(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: andq 208(%rdi), %r11 -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: orq %r8, %r11 -; AVX512-NEXT: orq %rax, %r11 -; AVX512-NEXT: orq %r9, %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: andq 304(%rdi), %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 48(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: andq 432(%rdi), %r9 -; AVX512-NEXT: movq (%rsp), %r8 # 8-byte Reload -; AVX512-NEXT: andq 176(%rdi), %r8 -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: orq %r9, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: andq 368(%rdi), %r9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 112(%rdi), %rax -; AVX512-NEXT: orq %r10, %r8 -; AVX512-NEXT: movq %r8, %r10 -; AVX512-NEXT: orq %r9, %rax -; AVX512-NEXT: 
movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 496(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: andq 240(%rdi), %r9 -; AVX512-NEXT: orq %r8, %r9 -; AVX512-NEXT: orq %rax, %r9 -; AVX512-NEXT: orq %r10, %r9 -; AVX512-NEXT: orq %r11, %r9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: andq 392(%rdi), %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: andq 136(%rdi), %rbp -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 328(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 72(%rdi), %rax -; AVX512-NEXT: orq %r10, %rbp -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 456(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; AVX512-NEXT: andq 200(%rdi), %r12 -; AVX512-NEXT: orq %rax, %r12 -; AVX512-NEXT: orq %r8, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 296(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 40(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 424(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 168(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 360(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 104(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; 
AVX512-NEXT: andq 488(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX512-NEXT: andq 232(%rdi), %r14 -; AVX512-NEXT: orq %rax, %r14 -; AVX512-NEXT: orq %r8, %r14 -; AVX512-NEXT: orq %r10, %r14 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 280(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 24(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 408(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 152(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: andq 344(%rdi), %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 88(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 472(%rdi), %rax -; AVX512-NEXT: orq %r11, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: andq 216(%rdi), %rbx -; AVX512-NEXT: orq %rax, %rbx -; AVX512-NEXT: orq %r8, %rbx -; AVX512-NEXT: orq %r10, %rbx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: andq 312(%rdi), %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 56(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 440(%rdi), %r8 -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 184(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 376(%rdi), %r8 -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: 
movq %rax, %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 120(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 504(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 248(%rdi), %r8 -; AVX512-NEXT: orq %rax, %r8 -; AVX512-NEXT: orq %r10, %r8 -; AVX512-NEXT: orq %r11, %r8 -; AVX512-NEXT: movq 1040(%rsp,%rsi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rsi, %r10 -; AVX512-NEXT: orq %rbx, %r8 -; AVX512-NEXT: shlxq %rcx, %rax, %rsi -; AVX512-NEXT: andq 256(%rdi), %r10 -; AVX512-NEXT: andq (%rdi), %rsi -; AVX512-NEXT: orq %r10, %rsi -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT: orq %r13, %rsi -; AVX512-NEXT: orq %r15, %rsi +; AVX512-NEXT: shlxq %rcx, %rsi, %rax ; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: orq %r9, %rsi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 264(%rdi), %rax -; AVX512-NEXT: andq 8(%rdi), %rdx -; AVX512-NEXT: orq %rax, %rdx -; AVX512-NEXT: orq %rbp, %rdx -; AVX512-NEXT: orq %r12, %rdx -; AVX512-NEXT: orq %r14, %rdx -; AVX512-NEXT: orq %r8, %rdx -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: setne %al -; AVX512-NEXT: addq $1560, %rsp # imm = 0x618 +; AVX512-NEXT: shldq %cl, %rsi, %r9 +; AVX512-NEXT: andnq 8(%rdi), %r10, %rcx +; AVX512-NEXT: orq %r9, %rcx +; AVX512-NEXT: andnq (%rdi), %rbx, %rsi +; AVX512-NEXT: orq %rax, %rsi +; AVX512-NEXT: andl $60, %r8d +; AVX512-NEXT: movl (%rdi,%r8), %eax +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload +; AVX512-NEXT: btl %r8d, %eax +; AVX512-NEXT: movq %r13, 56(%rdi) +; AVX512-NEXT: movq %r12, 48(%rdi) +; 
AVX512-NEXT: movq %r14, 40(%rdi) +; AVX512-NEXT: movq %rdx, 32(%rdi) +; AVX512-NEXT: movq %r11, 24(%rdi) +; AVX512-NEXT: movq %r15, 16(%rdi) +; AVX512-NEXT: movq %rcx, 8(%rdi) +; AVX512-NEXT: movq %rsi, (%rdi) +; AVX512-NEXT: setae %al +; AVX512-NEXT: addq $152, %rsp ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r12 ; AVX512-NEXT: popq %r13 @@ -7017,6 +1629,45 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %mask = xor i512 %bit, -1 + %val0 = zext i1 %value to i512 + %val = shl nuw i512 %val0, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res0 = and i512 %ld, %mask + %res = or i512 %res0, %val + %cmp = icmp eq i512 %test, 0 + store i512 %res, ptr %word + ret i1 %cmp +} + +; i4096 + +define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { +; X86-LABEL: test_ne_i4096: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $4064, %edx # imm = 0xFE0 +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: test_ne_i4096: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $4064, %eax # imm = 0xFE0 +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq %rem = and i32 %position, 4095 %ofs = zext nneg i32 %rem to i4096 %bit = shl nuw i4096 1, %ofs From 570e24292423eb2367ad761ad7e5ed89d20e9ff1 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Thu, 30 Oct 2025 13:42:32 +0100 Subject: [PATCH 184/539] Reapply "[HIP][Clang] Remove __AMDGCN_WAVEFRONT_SIZE macros" (#164217) This reverts commit 78bf682cb9033cf6a5bbc733e062c7b7d825fdaf. 
Original PR: #157463 Revert PR: #158566 The relevant buildbots have been updated to a ROCm version that does not use the macros anymore to avoid the failures. Implements SWDEV-522062. --- clang/docs/AMDGPUSupport.rst | 4 - clang/docs/HIPSupport.rst | 3 +- clang/lib/Basic/Targets/AMDGPU.cpp | 6 - .../CodeGenHIP/maybe_undef-attr-verify.hip | 2 +- .../CodeGenOpenCL/builtins-amdgcn-wave32.cl | 6 +- .../CodeGenOpenCL/builtins-amdgcn-wave64.cl | 4 - clang/test/Driver/amdgpu-macros.cl | 16 --- clang/test/Driver/hip-macros.hip | 23 ---- ...wavefront-size-deprecation-diagnostics.hip | 115 ------------------ .../Preprocessor/predefined-arch-macros.c | 2 - 10 files changed, 3 insertions(+), 178 deletions(-) delete mode 100644 clang/test/Driver/hip-wavefront-size-deprecation-diagnostics.hip diff --git a/clang/docs/AMDGPUSupport.rst b/clang/docs/AMDGPUSupport.rst index 3eada5f900613..18e3de8abe92a 100644 --- a/clang/docs/AMDGPUSupport.rst +++ b/clang/docs/AMDGPUSupport.rst @@ -49,10 +49,6 @@ Predefined Macros - Defined as 1 if the CU mode is enabled and 0 if the WGP mode is enabled. * - ``__AMDGCN_UNSAFE_FP_ATOMICS__`` - Defined if unsafe floating-point atomics are allowed. - * - ``__AMDGCN_WAVEFRONT_SIZE__`` - - Defines the wavefront size. Allowed values are 32 and 64 (deprecated). - * - ``__AMDGCN_WAVEFRONT_SIZE`` - - Alias to ``__AMDGCN_WAVEFRONT_SIZE__`` (deprecated). * - ``__HAS_FMAF__`` - Defined if FMAF instruction is available (deprecated). * - ``__HAS_LDEXPF__`` diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst index ec2af2a6f569d..ab9ea110e6d54 100644 --- a/clang/docs/HIPSupport.rst +++ b/clang/docs/HIPSupport.rst @@ -180,8 +180,7 @@ Predefined Macros - Alias to ``__HIP_API_PER_THREAD_DEFAULT_STREAM__``. Deprecated. Note that some architecture specific AMDGPU macros will have default values when -used from the HIP host compilation. Other :doc:`AMDGPU macros ` -like ``__AMDGCN_WAVEFRONT_SIZE__`` (deprecated) will default to 64 for example. 
+used from the HIP host compilation. Compilation Modes ================= diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index d4de704689e72..d4d696b8456b6 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -356,12 +356,6 @@ void AMDGPUTargetInfo::getTargetDefines(const LangOptions &Opts, if (hasFastFMA()) Builder.defineMacro("FP_FAST_FMA"); - Builder.defineMacro("__AMDGCN_WAVEFRONT_SIZE__", Twine(WavefrontSize), - "compile-time-constant access to the wavefront size will " - "be removed in a future release"); - Builder.defineMacro("__AMDGCN_WAVEFRONT_SIZE", Twine(WavefrontSize), - "compile-time-constant access to the wavefront size will " - "be removed in a future release"); Builder.defineMacro("__AMDGCN_CUMODE__", Twine(CUMode)); } diff --git a/clang/test/CodeGenHIP/maybe_undef-attr-verify.hip b/clang/test/CodeGenHIP/maybe_undef-attr-verify.hip index 571fba148f5cc..6dc57c4fcc5fc 100644 --- a/clang/test/CodeGenHIP/maybe_undef-attr-verify.hip +++ b/clang/test/CodeGenHIP/maybe_undef-attr-verify.hip @@ -20,7 +20,7 @@ #define __maybe_undef __attribute__((maybe_undef)) #define WARP_SIZE 64 -static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE__; +static constexpr int warpSize = WARP_SIZE; __device__ static inline unsigned int __lane_id() { return __builtin_amdgcn_mbcnt_hi( diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl index d390418523694..31fd0e7bceaf5 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl @@ -1,5 +1,5 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -D__AMDGCN_WAVEFRONT_SIZE=32 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck -enable-var-scope %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-feature +wavefrontsize32 -emit-llvm -o - %s | 
FileCheck -enable-var-scope %s // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck -enable-var-scope %s // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck -enable-var-scope %s // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck -enable-var-scope %s @@ -48,7 +48,3 @@ void test_read_exec_lo(global uint* out) { void test_read_exec_hi(global uint* out) { *out = __builtin_amdgcn_read_exec_hi(); } - -#if __AMDGCN_WAVEFRONT_SIZE != 32 -#error Wrong wavesize detected -#endif diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl index d851ec7e6734f..758b5aa532d73 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl @@ -50,7 +50,3 @@ void test_read_exec_lo(global ulong* out) { void test_read_exec_hi(global ulong* out) { *out = __builtin_amdgcn_read_exec_hi(); } - -#if defined(__AMDGCN_WAVEFRONT_SIZE__) && __AMDGCN_WAVEFRONT_SIZE__ != 64 -#error Wrong wavesize detected -#endif diff --git a/clang/test/Driver/amdgpu-macros.cl b/clang/test/Driver/amdgpu-macros.cl index 9fda2f3657430..6d049e7a9bc39 100644 --- a/clang/test/Driver/amdgpu-macros.cl +++ b/clang/test/Driver/amdgpu-macros.cl @@ -154,26 +154,10 @@ // ARCH-GCN-DAG: #define __[[CPU]]__ 1 // ARCH-GCN-DAG: #define __[[FAMILY]]__ 1 // ARCH-GCN-DAG: #define __amdgcn_processor__ "[[CPU]]" -// ARCH-GCN-DAG: #define __AMDGCN_WAVEFRONT_SIZE [[WAVEFRONT_SIZE]] // ARCH-GCN-DAG: #define __GCC_DESTRUCTIVE_SIZE 128 // ARCH-GCN-DAG: #define __GCC_CONSTRUCTIVE_SIZE 128 // UNSAFEFPATOMIC-DAG: #define __AMDGCN_UNSAFE_FP_ATOMICS__ 1 -// RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 -mwavefrontsize64 \ -// RUN: %s 2>&1 | FileCheck --check-prefix=WAVE64 %s -// 
RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 -mwavefrontsize64 \ -// RUN: %s 2>&1 | FileCheck --check-prefix=WAVE64 %s -// RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 -mwavefrontsize64 \ -// RUN: -mno-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=WAVE64 %s -// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 -mwavefrontsize64 \ -// RUN: -mno-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=WAVE32 %s -// RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 -mno-wavefrontsize64 \ -// RUN: -mwavefrontsize64 %s 2>&1 | FileCheck --check-prefix=WAVE64 %s -// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 -mno-wavefrontsize64 \ -// RUN: -mwavefrontsize64 %s 2>&1 | FileCheck --check-prefix=WAVE64 %s -// WAVE64-DAG: #define __AMDGCN_WAVEFRONT_SIZE 64 -// WAVE32-DAG: #define __AMDGCN_WAVEFRONT_SIZE 32 - // RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 \ // RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s // RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 -mcumode \ diff --git a/clang/test/Driver/hip-macros.hip b/clang/test/Driver/hip-macros.hip index 516e01a6c4743..4c460d50bf39a 100644 --- a/clang/test/Driver/hip-macros.hip +++ b/clang/test/Driver/hip-macros.hip @@ -1,27 +1,4 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang -E -dM --offload-arch=gfx906 -mwavefrontsize64 \ -// RUN: --cuda-device-only -nogpuinc -nogpulib \ -// RUN: %s 2>&1 | FileCheck --check-prefixes=WAVE64 %s -// RUN: %clang -E -dM --offload-arch=gfx1010 -mwavefrontsize64 \ -// RUN: --cuda-device-only -nogpuinc -nogpulib \ -// RUN: %s 2>&1 | FileCheck --check-prefixes=WAVE64 %s -// RUN: %clang -E -dM --offload-arch=gfx906 -mwavefrontsize64 \ -// RUN: --cuda-device-only -nogpuinc -nogpulib \ -// RUN: -mno-wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=WAVE64 %s -// RUN: %clang -E -dM --offload-arch=gfx1010 -mwavefrontsize64 \ -// RUN: --cuda-device-only -nogpuinc -nogpulib \ -// RUN: -mno-wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=WAVE32 %s -// RUN: %clang -E -dM 
--offload-arch=gfx906 -mno-wavefrontsize64 \ -// RUN: --cuda-device-only -nogpuinc -nogpulib \ -// RUN: -mwavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=WAVE64 %s -// RUN: %clang -E -dM --offload-arch=gfx1010 -mno-wavefrontsize64 \ -// RUN: --cuda-device-only -nogpuinc -nogpulib \ -// RUN: -mwavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=WAVE64 %s -// WAVE64-DAG: #define __AMDGCN_WAVEFRONT_SIZE__ 64 -// WAVE32-DAG: #define __AMDGCN_WAVEFRONT_SIZE__ 32 -// WAVE64-DAG: #define __AMDGCN_WAVEFRONT_SIZE 64 -// WAVE32-DAG: #define __AMDGCN_WAVEFRONT_SIZE 32 - // RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib \ // RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s // RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \ diff --git a/clang/test/Driver/hip-wavefront-size-deprecation-diagnostics.hip b/clang/test/Driver/hip-wavefront-size-deprecation-diagnostics.hip deleted file mode 100644 index 8a60f5a150048..0000000000000 --- a/clang/test/Driver/hip-wavefront-size-deprecation-diagnostics.hip +++ /dev/null @@ -1,115 +0,0 @@ -// REQUIRES: amdgpu-registered-target -// RUN: %clang -xhip --offload-arch=gfx1030 --offload-host-only -pedantic -nogpuinc -nogpulib -nobuiltininc -fsyntax-only -Xclang -verify %s -// RUN: %clang -xhip --offload-arch=gfx1030 --offload-device-only -pedantic -nogpuinc -nogpulib -nobuiltininc -fsyntax-only -Xclang -verify %s - -// Test that deprecation warnings for the wavefront size macro are emitted properly. 
- -#define WRAPPED __AMDGCN_WAVEFRONT_SIZE__ - -#define DOUBLE_WRAPPED (WRAPPED) - -template struct my_enable_if {}; - -template struct my_enable_if { - typedef T type; -}; - -__attribute__((host, device)) void use(int, const char*); - -template __attribute__((host, device)) int templatify(int x) { - return x + N; -} - -__attribute__((device)) const int GlobalConst = __AMDGCN_WAVEFRONT_SIZE__; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} -constexpr int GlobalConstExpr = __AMDGCN_WAVEFRONT_SIZE__; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - -#if defined(__HIP_DEVICE_COMPILE__) && (__AMDGCN_WAVEFRONT_SIZE__ == 64) // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} -int foo(void); -#endif - -__attribute__((device)) int device_var = __AMDGCN_WAVEFRONT_SIZE__; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - -__attribute__((device)) -void device_fun() { - use(__AMDGCN_WAVEFRONT_SIZE, "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE' has been marked as deprecated}} - use(__AMDGCN_WAVEFRONT_SIZE__, "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - use(WRAPPED, "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - use(DOUBLE_WRAPPED, "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - use(templatify<__AMDGCN_WAVEFRONT_SIZE__>(42), "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - use(GlobalConst, "device function"); - use(GlobalConstExpr, "device function"); -} - -__attribute__((global)) -void global_fun() { - // no warnings expected - use(__AMDGCN_WAVEFRONT_SIZE, "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE' has been marked as 
deprecated}} - use(__AMDGCN_WAVEFRONT_SIZE__, "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - use(WRAPPED, "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - use(DOUBLE_WRAPPED, "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - use(templatify<__AMDGCN_WAVEFRONT_SIZE__>(42), "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} -} - -int host_var = __AMDGCN_WAVEFRONT_SIZE__; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} -int host_var_alt = __AMDGCN_WAVEFRONT_SIZE; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE' has been marked as deprecated}} -int host_var_wrapped = WRAPPED; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} -int host_var_double_wrapped = DOUBLE_WRAPPED; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - -__attribute__((host)) -void host_fun() { - use(__AMDGCN_WAVEFRONT_SIZE, "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE' has been marked as deprecated}} - use(__AMDGCN_WAVEFRONT_SIZE__, "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - use(WRAPPED, "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - use(DOUBLE_WRAPPED, "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - use(templatify<__AMDGCN_WAVEFRONT_SIZE__>(42), "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - use(GlobalConst, "host function"); - use(GlobalConstExpr, "host function"); -} - -__attribute((host, device)) -void host_device_fun() { - use(__AMDGCN_WAVEFRONT_SIZE__, "host device 
function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - use(WRAPPED, "host device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - use(DOUBLE_WRAPPED, "host device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - use(templatify<__AMDGCN_WAVEFRONT_SIZE__>(42), "host device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} -} - -template // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} -class FunSelector { -public: - template - __attribute__((device)) - auto fun(void) - -> typename my_enable_if<(FunWarpSize <= __AMDGCN_WAVEFRONT_SIZE__), void>::type // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - { - use(1, "yay!"); - } - - template - __attribute__((device)) - auto fun(void) - -> typename my_enable_if<(FunWarpSize > __AMDGCN_WAVEFRONT_SIZE__), void>::type // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - { - use(0, "nay!"); - } -}; - -__attribute__((device)) -void device_fun_selector_user() { - FunSelector<> f; - f.fun<>(); - f.fun<1>(); - f.fun<1000>(); - - my_enable_if<(1 <= __AMDGCN_WAVEFRONT_SIZE__), int>::type x = 42; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} -} - -__attribute__((device)) my_enable_if<(1 <= __AMDGCN_WAVEFRONT_SIZE__), int>::type DeviceFunTemplateRet(void) { // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - return 42; -} - -__attribute__((device)) int DeviceFunTemplateArg(my_enable_if<(1 <= __AMDGCN_WAVEFRONT_SIZE__), int>::type x) { // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} - return x; -} - -// expected-note@* 0+ {{macro marked 'deprecated' here}} diff --git 
a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index a3c3697c3a0b9..cdb46326c2838 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -4418,7 +4418,6 @@ // CHECK_AMDGCN_NONE-NOT: #define __HAS_FMAF__ // CHECK_AMDGCN_NONE-NOT: #define __HAS_FP64__ // CHECK_AMDGCN_NONE-NOT: #define __HAS_LDEXPF__ -// CHECK_AMDGCN_NONE-NOT: #define __AMDGCN_WAVEFRONT_SIZE__ // Begin r600 tests ---------------- @@ -4439,7 +4438,6 @@ // RUN: %clang -x hip -E -dM %s -o - 2>&1 --offload-host-only -nogpulib \ // RUN: -nogpuinc --offload-arch=gfx803 -target x86_64-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_HIP_HOST -// CHECK_HIP_HOST: #define __AMDGCN_WAVEFRONT_SIZE__ 64 // CHECK_HIP_HOST: #define __AMDGPU__ 1 // CHECK_HIP_HOST: #define __AMD__ 1 From db9fbfcff446f738b01c292e956febe42a608cbe Mon Sep 17 00:00:00 2001 From: Shawn K Date: Thu, 30 Oct 2025 05:58:25 -0700 Subject: [PATCH 185/539] [CIR] Upstream handling for __builtin_prefetch (Typo Fix) (#165209) Not sure if this warrants a PR, but I realized there was a typo in a test filename from my previous PR #164387. 
--- .../test/CIR/CodeGen/{builtin_prefetech.c => builtin_prefetch.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename clang/test/CIR/CodeGen/{builtin_prefetech.c => builtin_prefetch.c} (100%) diff --git a/clang/test/CIR/CodeGen/builtin_prefetech.c b/clang/test/CIR/CodeGen/builtin_prefetch.c similarity index 100% rename from clang/test/CIR/CodeGen/builtin_prefetech.c rename to clang/test/CIR/CodeGen/builtin_prefetch.c From a31c2701e5cda252de6dee23ec4d92b431ae2997 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 30 Oct 2025 13:17:12 +0000 Subject: [PATCH 186/539] [X86] combinePTESTCC - fold PTESTZ(X,SIGNMASK) -> VTESTPD/PSZ(X,X) on AVX targets (#165676) If the PTEST is just using the ZF result and one of the operands is a i32/i64 sign mask we can use the TESTPD/PS instructions instead and avoid the use of an extra constant. Fixes some codegen identified in #156233 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 20 +++++ llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 84 ++++--------------- 2 files changed, 38 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f5b8b58c18e1e..49beadae63f03 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48860,6 +48860,26 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0); } + // Attempt to convert PTESTZ(X,SIGNMASK) -> VTESTPD/PSZ(X,X) on AVX targets. 
+ if (EFLAGS.getOpcode() == X86ISD::PTEST && Subtarget.hasAVX()) { + KnownBits KnownOp1 = DAG.computeKnownBits(Op1); + assert(KnownOp1.getBitWidth() == 64 && + "Illegal PTEST vector element width"); + if (KnownOp1.isConstant()) { + const APInt &Mask = KnownOp1.getConstant(); + if (Mask.isSignMask()) { + MVT FpVT = MVT::getVectorVT(MVT::f64, OpVT.getSizeInBits() / 64); + Op0 = DAG.getBitcast(FpVT, DAG.getFreeze(Op0)); + return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Op0, Op0); + } + if (Mask.isSplat(32) && Mask.trunc(32).isSignMask()) { + MVT FpVT = MVT::getVectorVT(MVT::f32, OpVT.getSizeInBits() / 32); + Op0 = DAG.getBitcast(FpVT, DAG.getFreeze(Op0)); + return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Op0, Op0); + } + } + } + // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y) // TODO: Add COND_NE handling? if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) { diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index 9816fa7c83560..044327d94c0ef 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -875,28 +875,12 @@ define i1 @mask_v8i32(<8 x i32> %a0) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: mask_v8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: mask_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: mask_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; 
AVX-LABEL: mask_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: vtestps %ymm0, %ymm0 +; AVX-NEXT: sete %al +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0) %2 = and i32 %1, 2147483648 %3 = icmp eq i32 %2, 0 @@ -965,28 +949,12 @@ define i1 @signtest_v8i32(<8 x i32> %a0) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: signtest_v8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: signtest_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: signtest_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: signtest_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: vtestps %ymm0, %ymm0 +; AVX-NEXT: sete %al +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0) %2 = icmp sgt i32 %1, -1 ret i1 %2 @@ -1010,28 +978,12 @@ define i1 @signtest_v4i64(<4 x i64> %a0) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: signtest_v4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: signtest_v4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: signtest_v4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} 
ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: signtest_v4i64: +; AVX: # %bb.0: +; AVX-NEXT: vtestpd %ymm0, %ymm0 +; AVX-NEXT: sete %al +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0) %2 = icmp sgt i64 %1, -1 ret i1 %2 From 88fe87e08ec67d5e00cdf67a2b1214be6c1d70cd Mon Sep 17 00:00:00 2001 From: Robert Imschweiler Date: Thu, 30 Oct 2025 14:22:42 +0100 Subject: [PATCH 187/539] [AMDGPU][FixIrreducible][UnifyLoopExits] Support callbr with inline-asm (#149308) First batch of changes to add support for inline-asm callbr for the AMDGPU backend. --- llvm/include/llvm/ADT/GenericCycleImpl.h | 11 + llvm/include/llvm/ADT/GenericCycleInfo.h | 1 + llvm/include/llvm/Support/GenericLoopInfo.h | 11 + .../llvm/Support/GenericLoopInfoImpl.h | 32 +- .../llvm/Transforms/Utils/BasicBlockUtils.h | 29 + .../llvm/Transforms/Utils/ControlFlowUtils.h | 6 +- llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 73 ++ .../lib/Transforms/Utils/ControlFlowUtils.cpp | 5 +- llvm/lib/Transforms/Utils/FixIrreducible.cpp | 126 ++- llvm/lib/Transforms/Utils/UnifyLoopExits.cpp | 77 +- .../Transforms/FixIrreducible/bug45623.ll | 109 +++ llvm/test/Transforms/FixIrreducible/callbr.ll | 869 ++++++++++++++++++ llvm/test/Transforms/FixIrreducible/nested.ll | 676 ++++++++++++++ .../Transforms/FixIrreducible/unreachable.ll | 23 + llvm/test/Transforms/UnifyLoopExits/basic.ll | 131 ++- .../UnifyLoopExits/integer_guards.ll | 410 +++++++++ llvm/test/Transforms/UnifyLoopExits/nested.ll | 142 +++ .../Transforms/UnifyLoopExits/restore-ssa.ll | 236 +++++ .../Transforms/UnifyLoopExits/undef-phis.ll | 68 ++ 19 files changed, 2985 insertions(+), 50 deletions(-) create mode 100644 llvm/test/Transforms/FixIrreducible/callbr.ll diff --git a/llvm/include/llvm/ADT/GenericCycleImpl.h 
b/llvm/include/llvm/ADT/GenericCycleImpl.h index 40390789e2deb..00f85ca819f3f 100644 --- a/llvm/include/llvm/ADT/GenericCycleImpl.h +++ b/llvm/include/llvm/ADT/GenericCycleImpl.h @@ -561,6 +561,17 @@ auto GenericCycleInfo::getSmallestCommonCycle(CycleT *A, return A; } +/// \brief Find the innermost cycle containing both given blocks. +/// +/// \returns the innermost cycle containing both \p A and \p B +/// or nullptr if there is no such cycle. +template +auto GenericCycleInfo::getSmallestCommonCycle(BlockT *A, + BlockT *B) const + -> CycleT * { + return getSmallestCommonCycle(getCycle(A), getCycle(B)); +} + /// \brief get the depth for the cycle which containing a given block. /// /// \returns the depth for the innermost cycle containing \p Block or 0 if it is diff --git a/llvm/include/llvm/ADT/GenericCycleInfo.h b/llvm/include/llvm/ADT/GenericCycleInfo.h index b8b6e3e9967a4..c31bab3c178ca 100644 --- a/llvm/include/llvm/ADT/GenericCycleInfo.h +++ b/llvm/include/llvm/ADT/GenericCycleInfo.h @@ -298,6 +298,7 @@ template class GenericCycleInfo { CycleT *getCycle(const BlockT *Block) const; CycleT *getSmallestCommonCycle(CycleT *A, CycleT *B) const; + CycleT *getSmallestCommonCycle(BlockT *A, BlockT *B) const; unsigned getCycleDepth(const BlockT *Block) const; CycleT *getTopLevelParentCycle(BlockT *Block); diff --git a/llvm/include/llvm/Support/GenericLoopInfo.h b/llvm/include/llvm/Support/GenericLoopInfo.h index 2775a8734dd47..b6bb360d9868f 100644 --- a/llvm/include/llvm/Support/GenericLoopInfo.h +++ b/llvm/include/llvm/Support/GenericLoopInfo.h @@ -615,6 +615,17 @@ template class LoopInfoBase { return L ? L->getLoopDepth() : 0; } + /// \brief Find the innermost loop containing both given loops. + /// + /// \returns the innermost loop containing both \p A and \p B + /// or nullptr if there is no such loop. + LoopT *getSmallestCommonLoop(LoopT *A, LoopT *B) const; + /// \brief Find the innermost loop containing both given blocks. 
+ /// + /// \returns the innermost loop containing both \p A and \p B + /// or nullptr if there is no such loop. + LoopT *getSmallestCommonLoop(BlockT *A, BlockT *B) const; + // True if the block is a loop header node bool isLoopHeader(const BlockT *BB) const { const LoopT *L = getLoopFor(BB); diff --git a/llvm/include/llvm/Support/GenericLoopInfoImpl.h b/llvm/include/llvm/Support/GenericLoopInfoImpl.h index 6fc508b0e0cca..541678001a8ff 100644 --- a/llvm/include/llvm/Support/GenericLoopInfoImpl.h +++ b/llvm/include/llvm/Support/GenericLoopInfoImpl.h @@ -355,7 +355,7 @@ void LoopBase::verifyLoop() const { if (BB == getHeader()) { assert(!OutsideLoopPreds.empty() && "Loop is unreachable!"); } else if (!OutsideLoopPreds.empty()) { - // A non-header loop shouldn't be reachable from outside the loop, + // A non-header loop block shouldn't be reachable from outside the loop, // though it is permitted if the predecessor is not itself actually // reachable. BlockT *EntryBB = &BB->getParent()->front(); @@ -645,6 +645,36 @@ LoopInfoBase::getLoopsInReverseSiblingPreorder() const { return PreOrderLoops; } +template +LoopT *LoopInfoBase::getSmallestCommonLoop(LoopT *A, + LoopT *B) const { + if (!A || !B) + return nullptr; + + // If lops A and B have different depth replace them with parent loop + // until they have the same depth. + while (A->getLoopDepth() > B->getLoopDepth()) + A = A->getParentLoop(); + while (B->getLoopDepth() > A->getLoopDepth()) + B = B->getParentLoop(); + + // Loops A and B are at same depth but may be disjoint, replace them with + // parent loops until we find loop that contains both or we run out of + // parent loops. 
+ while (A != B) { + A = A->getParentLoop(); + B = B->getParentLoop(); + } + + return A; +} + +template +LoopT *LoopInfoBase::getSmallestCommonLoop(BlockT *A, + BlockT *B) const { + return getSmallestCommonLoop(getLoopFor(A), getLoopFor(B)); +} + // Debugging template void LoopInfoBase::print(raw_ostream &OS) const { diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h index e677cbf2d8968..49885b7f06a15 100644 --- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h +++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h @@ -19,6 +19,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SetVector.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CycleInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Printable.h" @@ -262,6 +263,34 @@ LLVM_ABI BasicBlock *SplitEdge(BasicBlock *From, BasicBlock *To, MemorySSAUpdater *MSSAU = nullptr, const Twine &BBName = ""); +/// \brief Create a new intermediate target block for a callbr edge. +/// +/// Create a new basic block between a callbr instruction and one of its +/// successors. The new block replaces the original successor in the callbr +/// instruction and unconditionally branches to the original successor. This +/// is useful for normalizing control flow, e.g., when transforming +/// irreducible loops. 
+/// +/// \param CallBrBlock block containing the callbr instruction +/// \param Succ original successor block +/// \param SuccIdx index of the original successor in the callbr +/// instruction +/// \param DTU optional \p DomTreeUpdater for updating the +/// dominator tree +/// \param CI optional \p CycleInfo for updating cycle membership +/// \param LI optional \p LoopInfo for updating loop membership +/// \param UpdatedLI optional output flag indicating if \p LoopInfo has +/// been updated +/// +/// \returns newly created intermediate target block +/// +/// \note This function updates PHI nodes, dominator tree, loop info, and +/// cycle info as needed. +LLVM_ABI BasicBlock * +SplitCallBrEdge(BasicBlock *CallBrBlock, BasicBlock *Succ, unsigned SuccIdx, + DomTreeUpdater *DTU = nullptr, CycleInfo *CI = nullptr, + LoopInfo *LI = nullptr, bool *UpdatedLI = nullptr); + /// Sets the unwind edge of an instruction to a particular successor. LLVM_ABI void setUnwindEdgeTo(Instruction *TI, BasicBlock *Succ); diff --git a/llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h b/llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h index 810fef29f4010..17cde82b084d8 100644 --- a/llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h +++ b/llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h @@ -15,10 +15,13 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/IR/CycleInfo.h" namespace llvm { class BasicBlock; +class CallBrInst; +class LoopInfo; class DomTreeUpdater; /// Given a set of branch descriptors [BB, Succ0, Succ1], create a "hub" such @@ -104,7 +107,8 @@ struct ControlFlowHub { : BB(BB), Succ0(Succ0), Succ1(Succ1) {} }; - void addBranch(BasicBlock *BB, BasicBlock *Succ0, BasicBlock *Succ1) { + void addBranch(BasicBlock *BB, BasicBlock *Succ0, + BasicBlock *Succ1 = nullptr) { assert(BB); assert(Succ0 || Succ1); Branches.emplace_back(BB, Succ0, Succ1); diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp 
b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 9829d4d50098c..11db0ec487328 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -674,6 +674,79 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT, return SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU, BBName); } +/// Helper function to update the cycle or loop information after inserting a +/// new block between a callbr instruction and one of its target blocks. Adds +/// the new block to the innermost cycle or loop that the callbr instruction and +/// the original target block share. +/// \p LCI cycle or loop information to update +/// \p CallBrBlock block containing the callbr instruction +/// \p CallBrTarget new target block of the callbr instruction +/// \p Succ original target block of the callbr instruction +template +static bool updateCycleLoopInfo(TI *LCI, BasicBlock *CallBrBlock, + BasicBlock *CallBrTarget, BasicBlock *Succ) { + static_assert(std::is_same_v || std::is_same_v, + "type must be CycleInfo or LoopInfo"); + if (!LCI) + return false; + + T *LC; + if constexpr (std::is_same_v) + LC = LCI->getSmallestCommonCycle(CallBrBlock, Succ); + else + LC = LCI->getSmallestCommonLoop(CallBrBlock, Succ); + if (!LC) + return false; + + if constexpr (std::is_same_v) + LCI->addBlockToCycle(CallBrTarget, LC); + else + LC->addBasicBlockToLoop(CallBrTarget, *LCI); + + return true; +} + +BasicBlock *llvm::SplitCallBrEdge(BasicBlock *CallBrBlock, BasicBlock *Succ, + unsigned SuccIdx, DomTreeUpdater *DTU, + CycleInfo *CI, LoopInfo *LI, + bool *UpdatedLI) { + CallBrInst *CallBr = dyn_cast(CallBrBlock->getTerminator()); + assert(CallBr && "expected callbr terminator"); + assert(SuccIdx < CallBr->getNumSuccessors() && + Succ == CallBr->getSuccessor(SuccIdx) && "invalid successor index"); + + // Create a new block between callbr and the specified successor. 
+ // splitBlockBefore cannot be re-used here since it cannot split if the split + // point is a PHI node (because BasicBlock::splitBasicBlockBefore cannot + // handle that). But we don't need to rewire every part of a potential PHI + // node. We only care about the edge between CallBrBlock and the original + // successor. + BasicBlock *CallBrTarget = + BasicBlock::Create(CallBrBlock->getContext(), + CallBrBlock->getName() + ".target." + Succ->getName(), + CallBrBlock->getParent()); + // Rewire control flow from the new target block to the original successor. + Succ->replacePhiUsesWith(CallBrBlock, CallBrTarget); + // Rewire control flow from callbr to the new target block. + CallBr->setSuccessor(SuccIdx, CallBrTarget); + // Jump from the new target block to the original successor. + BranchInst::Create(Succ, CallBrTarget); + + bool Updated = + updateCycleLoopInfo(LI, CallBrBlock, CallBrTarget, Succ); + if (UpdatedLI) + *UpdatedLI = Updated; + updateCycleLoopInfo(CI, CallBrBlock, CallBrTarget, Succ); + if (DTU) { + DTU->applyUpdates({{DominatorTree::Insert, CallBrBlock, CallBrTarget}}); + if (DTU->getDomTree().dominates(CallBrBlock, Succ)) + DTU->applyUpdates({{DominatorTree::Delete, CallBrBlock, Succ}, + {DominatorTree::Insert, CallBrTarget, Succ}}); + } + + return CallBrTarget; +} + void llvm::setUnwindEdgeTo(Instruction *TI, BasicBlock *Succ) { if (auto *II = dyn_cast(TI)) II->setUnwindDest(Succ); diff --git a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp index 0046a00af4338..287a177371c80 100644 --- a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp +++ b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/Utils/ControlFlowUtils.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/ValueHandle.h" @@ -281,7 +282,9 @@ std::pair 
ControlFlowHub::finalize( for (auto [BB, Succ0, Succ1] : Branches) { #ifndef NDEBUG - assert(Incoming.insert(BB).second && "Duplicate entry for incoming block."); + assert( + (Incoming.insert(BB).second || isa(BB->getTerminator())) && + "Duplicate entry for incoming block."); #endif if (Succ0) Outgoing.insert(Succ0); diff --git a/llvm/lib/Transforms/Utils/FixIrreducible.cpp b/llvm/lib/Transforms/Utils/FixIrreducible.cpp index 45e1d12c2bfff..804af22daa5af 100644 --- a/llvm/lib/Transforms/Utils/FixIrreducible.cpp +++ b/llvm/lib/Transforms/Utils/FixIrreducible.cpp @@ -79,6 +79,53 @@ // Limitation: The pass cannot handle switch statements and indirect // branches. Both must be lowered to plain branches first. // +// CallBr support: CallBr is handled as a more general branch instruction which +// can have multiple successors. The pass redirects the edges to intermediate +// target blocks that unconditionally branch to the original callbr target +// blocks. This allows the control flow hub to know to which of the original +// target blocks to jump to. +// Example input CFG: +// Entry (callbr) +// / \ +// v v +// H ----> B +// ^ /| +// `----' | +// v +// Exit +// +// becomes: +// Entry (callbr) +// / \ +// v v +// target.H target.B +// | | +// v v +// H ----> B +// ^ /| +// `----' | +// v +// Exit +// +// Note +// OUTPUT CFG: Converted to a natural loop with a new header N. +// +// Entry (callbr) +// / \ +// v v +// target.H target.B +// \ / +// \ / +// v v +// N <---. 
+// / \ \ +// / \ | +// v v / +// H --> B --' +// | +// v +// Exit +// //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/FixIrreducible.h" @@ -231,6 +278,7 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT, return false; LLVM_DEBUG(dbgs() << "Processing cycle:\n" << CI.print(&C) << "\n";); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); ControlFlowHub CHub; SetVector Predecessors; @@ -242,18 +290,32 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT, } for (BasicBlock *P : Predecessors) { - auto *Branch = cast(P->getTerminator()); - // Exactly one of the two successors is the header. - BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? Header : nullptr; - BasicBlock *Succ1 = Succ0 ? nullptr : Header; - if (!Succ0) - assert(Branch->getSuccessor(1) == Header); - assert(Succ0 || Succ1); - CHub.addBranch(P, Succ0, Succ1); - - LLVM_DEBUG(dbgs() << "Added internal branch: " << P->getName() << " -> " - << (Succ0 ? Succ0->getName() : "") << " " - << (Succ1 ? Succ1->getName() : "") << "\n"); + if (BranchInst *Branch = dyn_cast(P->getTerminator())) { + // Exactly one of the two successors is the header. + BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? Header : nullptr; + BasicBlock *Succ1 = Succ0 ? nullptr : Header; + assert(Succ0 || Branch->getSuccessor(1) == Header); + assert(Succ0 || Succ1); + CHub.addBranch(P, Succ0, Succ1); + + LLVM_DEBUG(dbgs() << "Added internal branch: " << printBasicBlock(P) + << " -> " << printBasicBlock(Succ0) + << (Succ0 && Succ1 ? 
" " : "") << printBasicBlock(Succ1) + << '\n'); + } else if (CallBrInst *CallBr = dyn_cast(P->getTerminator())) { + for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I) { + BasicBlock *Succ = CallBr->getSuccessor(I); + if (Succ != Header) + continue; + BasicBlock *NewSucc = SplitCallBrEdge(P, Succ, I, &DTU, &CI, LI); + CHub.addBranch(NewSucc, Succ); + LLVM_DEBUG(dbgs() << "Added internal branch: " + << printBasicBlock(NewSucc) << " -> " + << printBasicBlock(Succ) << '\n'); + } + } else { + llvm_unreachable("unsupported block terminator"); + } } // Redirect external incoming edges. This includes the edges on the header. @@ -266,17 +328,32 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT, } for (BasicBlock *P : Predecessors) { - auto *Branch = cast(P->getTerminator()); - BasicBlock *Succ0 = Branch->getSuccessor(0); - Succ0 = C.contains(Succ0) ? Succ0 : nullptr; - BasicBlock *Succ1 = - Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); - Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr; - CHub.addBranch(P, Succ0, Succ1); - - LLVM_DEBUG(dbgs() << "Added external branch: " << P->getName() << " -> " - << (Succ0 ? Succ0->getName() : "") << " " - << (Succ1 ? Succ1->getName() : "") << "\n"); + if (BranchInst *Branch = dyn_cast(P->getTerminator()); Branch) { + BasicBlock *Succ0 = Branch->getSuccessor(0); + Succ0 = C.contains(Succ0) ? Succ0 : nullptr; + BasicBlock *Succ1 = + Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); + Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr; + CHub.addBranch(P, Succ0, Succ1); + + LLVM_DEBUG(dbgs() << "Added external branch: " << printBasicBlock(P) + << " -> " << printBasicBlock(Succ0) + << (Succ0 && Succ1 ? 
" " : "") << printBasicBlock(Succ1) + << '\n'); + } else if (CallBrInst *CallBr = dyn_cast(P->getTerminator())) { + for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I) { + BasicBlock *Succ = CallBr->getSuccessor(I); + if (!C.contains(Succ)) + continue; + BasicBlock *NewSucc = SplitCallBrEdge(P, Succ, I, &DTU, &CI, LI); + CHub.addBranch(NewSucc, Succ); + LLVM_DEBUG(dbgs() << "Added external branch: " + << printBasicBlock(NewSucc) << " -> " + << printBasicBlock(Succ) << '\n'); + } + } else { + llvm_unreachable("unsupported block terminator"); + } } // Redirect all the backedges through a "hub" consisting of a series @@ -292,7 +369,6 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT, SetVector Entries; Entries.insert(C.entry_rbegin(), C.entry_rend()); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); CHub.finalize(&DTU, GuardBlocks, "irr"); #if defined(EXPENSIVE_CHECKS) assert(DT.verify(DominatorTree::VerificationLevel::Full)); @@ -325,8 +401,6 @@ static bool FixIrreducibleImpl(Function &F, CycleInfo &CI, DominatorTree &DT, LLVM_DEBUG(dbgs() << "===== Fix irreducible control-flow in function: " << F.getName() << "\n"); - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - bool Changed = false; for (Cycle *TopCycle : CI.toplevel_cycles()) { for (Cycle *C : depth_first(TopCycle)) { diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index 9f338dbc78cff..94c5c1709f43e 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -12,7 +12,11 @@ // // Limitation: This assumes that all terminators in the CFG are direct branches // (the "br" instruction). The presence of any other control flow -// such as indirectbr, switch or callbr will cause an assert. +// such as indirectbr or switch will cause an assert. 
+// The callbr terminator is supported by creating intermediate +// target blocks that unconditionally branch to the original target +// blocks. These intermediate target blocks can then be redirected +// through the ControlFlowHub as usual. // //===----------------------------------------------------------------------===// @@ -150,25 +154,55 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { SmallVector ExitingBlocks; L->getExitingBlocks(ExitingBlocks); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + SmallVector CallBrTargetBlocksToFix; // Redirect exiting edges through a control flow hub. ControlFlowHub CHub; - for (auto *BB : ExitingBlocks) { - auto *Branch = cast(BB->getTerminator()); - BasicBlock *Succ0 = Branch->getSuccessor(0); - Succ0 = L->contains(Succ0) ? nullptr : Succ0; - - BasicBlock *Succ1 = - Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); - Succ1 = L->contains(Succ1) ? nullptr : Succ1; - CHub.addBranch(BB, Succ0, Succ1); - - LLVM_DEBUG(dbgs() << "Added exiting branch: " << BB->getName() << " -> {" - << (Succ0 ? Succ0->getName() : "") << ", " - << (Succ1 ? Succ1->getName() : "") << "}\n"); + + for (unsigned I = 0; I < ExitingBlocks.size(); ++I) { + BasicBlock *BB = ExitingBlocks[I]; + if (BranchInst *Branch = dyn_cast(BB->getTerminator())) { + BasicBlock *Succ0 = Branch->getSuccessor(0); + Succ0 = L->contains(Succ0) ? nullptr : Succ0; + + BasicBlock *Succ1 = + Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); + Succ1 = L->contains(Succ1) ? nullptr : Succ1; + CHub.addBranch(BB, Succ0, Succ1); + + LLVM_DEBUG(dbgs() << "Added extiting branch: " << printBasicBlock(BB) + << " -> " << printBasicBlock(Succ0) + << (Succ0 && Succ1 ? 
" " : "") << printBasicBlock(Succ1) + << '\n'); + } else if (CallBrInst *CallBr = dyn_cast(BB->getTerminator())) { + for (unsigned J = 0; J < CallBr->getNumSuccessors(); ++J) { + BasicBlock *Succ = CallBr->getSuccessor(J); + if (L->contains(Succ)) + continue; + bool UpdatedLI = false; + BasicBlock *NewSucc = + SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI); + // Even if CallBr and Succ do not have a common parent loop, we need to + // add the new target block to the parent loop of the current loop. + if (!UpdatedLI) + CallBrTargetBlocksToFix.push_back(NewSucc); + // ExitingBlocks is later used to restore SSA, so we need to make sure + // that the blocks used for phi nodes in the guard blocks match the + // predecessors of the guard blocks, which, in the case of callbr, are + // the new intermediate target blocks instead of the callbr blocks + // themselves. + ExitingBlocks[I] = NewSucc; + CHub.addBranch(NewSucc, Succ); + LLVM_DEBUG(dbgs() << "Added exiting branch: " + << printBasicBlock(NewSucc) << " -> " + << printBasicBlock(Succ) << '\n'); + } + } else { + llvm_unreachable("unsupported block terminator"); + } } SmallVector GuardBlocks; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); BasicBlock *LoopExitBlock; bool ChangedCFG; std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize( @@ -187,10 +221,19 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { // The guard blocks were created outside the loop, so they need to become // members of the parent loop. - if (auto ParentLoop = L->getParentLoop()) { + // Same goes for the callbr target blocks. Although we try to add them to the + // smallest common parent loop of the callbr block and the corresponding + // original target block, there might not have been such a loop, in which case + // the newly created callbr target blocks are not part of any loop. For nested + // loops, this might result in them leading to a loop with multiple entry + // points. 
+ if (auto *ParentLoop = L->getParentLoop()) { for (auto *G : GuardBlocks) { ParentLoop->addBasicBlockToLoop(G, LI); } + for (auto *C : CallBrTargetBlocksToFix) { + ParentLoop->addBasicBlockToLoop(C, LI); + } ParentLoop->verifyLoop(); } @@ -218,8 +261,6 @@ bool UnifyLoopExitsLegacyPass::runOnFunction(Function &F) { auto &LI = getAnalysis().getLoopInfo(); auto &DT = getAnalysis().getDomTree(); - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - return runImpl(LI, DT); } diff --git a/llvm/test/Transforms/FixIrreducible/bug45623.ll b/llvm/test/Transforms/FixIrreducible/bug45623.ll index 58724431ff0ee..b6dd6fb9e6fcb 100644 --- a/llvm/test/Transforms/FixIrreducible/bug45623.ll +++ b/llvm/test/Transforms/FixIrreducible/bug45623.ll @@ -90,3 +90,112 @@ for.end626: ; preds = %for.cond616 if.else629: ; preds = %backtrack br label %retry } + +define void @tre_tnfa_run_backtrack_callbr(i1 %arg) { +; CHECK-LABEL: @tre_tnfa_run_backtrack_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[RETRY:%.*]] [] +; CHECK: retry: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[ARG:%.*]]) +; CHECK-NEXT: to label [[RETRY_TARGET_BACKTRACK:%.*]] [label %retry.target.while.body248] +; CHECK: while.body248: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[ARG]]) +; CHECK-NEXT: to label [[IF_THEN250:%.*]] [label %if.end275] +; CHECK: if.then250: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[FOR_COND264:%.*]] [] +; CHECK: for.cond264: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[ARG]]) +; CHECK-NEXT: to label [[FOR_BODY267:%.*]] [label %backtrack] +; CHECK: for.body267: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[FOR_COND264]] [] +; CHECK: if.end275: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[FOR_COND342:%.*]] [] +; CHECK: for.cond342: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[ARG]]) +; CHECK-NEXT: to label [[FOR_BODY345:%.*]] [label %for.end580] +; CHECK: 
for.body345: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[FOR_COND342]] [] +; CHECK: for.end580: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[BACKTRACK:%.*]] [] +; CHECK: backtrack: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[ARG]]) +; CHECK-NEXT: to label [[IF_THEN595:%.*]] [label %if.else629] +; CHECK: if.then595: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[FOR_COND616:%.*]] [] +; CHECK: for.cond616: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[ARG]]) +; CHECK-NEXT: to label [[FOR_BODY619:%.*]] [label %for.end626] +; CHECK: for.body619: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[FOR_COND616]] [] +; CHECK: for.end626: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[FOR_END626_TARGET_WHILE_BODY248:%.*]] [] +; CHECK: if.else629: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[RETRY]] [] +; CHECK: for.end626.target.while.body248: +; CHECK-NEXT: br label [[IRR_GUARD:%.*]] +; CHECK: retry.target.backtrack: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: retry.target.while.body248: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: irr.guard: +; CHECK-NEXT: [[GUARD_WHILE_BODY248:%.*]] = phi i1 [ true, [[FOR_END626_TARGET_WHILE_BODY248]] ], [ false, [[RETRY_TARGET_BACKTRACK]] ], [ true, [[RETRY_TARGET_WHILE_BODY248:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_WHILE_BODY248]], label [[WHILE_BODY248:%.*]], label [[BACKTRACK]] +; +entry: + callbr void asm "", ""() to label %retry [] + +retry: + callbr void asm "", "r,!i"(i1 %arg) to label %backtrack [label %while.body248] + +while.body248: ; preds = %for.end626, %retry + callbr void asm "", "r,!i"(i1 %arg) to label %if.then250 [label %if.end275] + +if.then250: ; preds = %while.body248 + callbr void asm "", ""() to label %for.cond264 [] + +for.cond264: ; preds = %for.body267, %if.then250 + callbr void asm "", "r,!i"(i1 %arg) to label %for.body267 [label %backtrack] + +for.body267: ; preds = %for.cond264 + 
callbr void asm "", ""() to label %for.cond264 [] + +if.end275: ; preds = %while.body248 + callbr void asm "", ""() to label %for.cond342 [] + +for.cond342: ; preds = %for.body345, %if.end275 + callbr void asm "", "r,!i"(i1 %arg) to label %for.body345 [label %for.end580] + +for.body345: ; preds = %for.cond342 + callbr void asm "", ""() to label %for.cond342 [] + +for.end580: ; preds = %for.cond342 + callbr void asm "", ""() to label %backtrack [] + +backtrack: ; preds = %for.end580, %for.cond264, %retry + callbr void asm "", "r,!i"(i1 %arg) to label %if.then595 [label %if.else629] + +if.then595: ; preds = %backtrack + callbr void asm "", ""() to label %for.cond616 [] + +for.cond616: ; preds = %for.body619, %if.then595 + callbr void asm "", "r,!i"(i1 %arg) to label %for.body619 [label %for.end626] + +for.body619: ; preds = %for.cond616 + callbr void asm "", ""() to label %for.cond616 [] + +for.end626: ; preds = %for.cond616 + callbr void asm "", ""() to label %while.body248 [] + +if.else629: ; preds = %backtrack + callbr void asm "", ""() to label %retry [] +} diff --git a/llvm/test/Transforms/FixIrreducible/callbr.ll b/llvm/test/Transforms/FixIrreducible/callbr.ll new file mode 100644 index 0000000000000..26ca6c7c12777 --- /dev/null +++ b/llvm/test/Transforms/FixIrreducible/callbr.ll @@ -0,0 +1,869 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes='fix-irreducible,verify' -S | FileCheck %s +; RUN: opt < %s -passes='verify,fix-irreducible,verify' -S | FileCheck %s +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s --check-prefix CYCLES-BEFORE +; RUN: opt < %s -passes='fix-irreducible,print' -disable-output 2>&1 | FileCheck %s --check-prefix CYCLES-AFTER + +; CYCLES-BEFORE: CycleInfo for function: callbr_entry +; CYCLES-BEFORE-NEXT: depth=1: entries(indirect fallthrough) +; CYCLES-AFTER: CycleInfo for function: callbr_entry +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) 
indirect fallthrough + +; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_entry_targets_with_phi_nodes +; CYCLES-BEFORE-NEXT: depth=1: entries(block1 block) +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_entry_targets_with_phi_nodes +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) block1 block + +; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_entry_multiple_indirect_targets +; CYCLES-BEFORE-NEXT: depth=1: entries(indirect fallthrough) +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_entry_multiple_indirect_targets +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) indirect fallthrough + +; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_entry_multiple_indirect_targets1 +; CYCLES-BEFORE-NEXT: depth=1: entries(indirect1 indirect fallthrough) +; CYCLES-BEFORE-NEXT: depth=2: entries(indirect fallthrough) +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_entry_multiple_indirect_targets1 +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) indirect1 indirect fallthrough irr.guard1 irr.guard2 +; CYCLES-AFTER-NEXT: depth=2: entries(irr.guard2) indirect fallthrough + +; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_header_no_indirect +; CYCLES-BEFORE-NEXT: depth=1: entries(fallthrough callbr) +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_header_no_indirect +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) fallthrough callbr callbr.target.fallthrough + +; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_header +; CYCLES-BEFORE-NEXT: depth=1: entries(fallthrough callbr) +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_header +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) fallthrough callbr callbr.target.fallthrough + +; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_header_multiple_indirect_targets +; CYCLES-BEFORE-NEXT: depth=1: entries(fallthrough callbr) indirect1 +; CYCLES-BEFORE-NEXT: depth=2: entries(callbr) indirect1 +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_header_multiple_indirect_targets +; CYCLES-AFTER-NEXT: 
depth=1: entries(irr.guard) fallthrough callbr indirect1 callbr.target.fallthrough +; CYCLES-AFTER-NEXT: depth=2: entries(callbr) indirect1 + +; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_regular +; CYCLES-BEFORE-NEXT: depth=1: entries(fallthrough2 fallthrough1) +; CYCLES-BEFORE-NEXT: depth=1: entries(indirect2 indirect1) +; CYCLES-BEFORE-NEXT: depth=1: entries(nocallbr2 nocallbr1) +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_regular +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) fallthrough2 fallthrough1 +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard1) indirect2 indirect1 +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard2) nocallbr2 nocallbr1 + +; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_regular1 +; CYCLES-BEFORE-NEXT: depth=1: entries(callbr nocallbr) +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_regular1 +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) callbr nocallbr + +; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_regular2 +; CYCLES-BEFORE-NEXT: depth=1: entries(callbr nocallbr) +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_regular2 +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) callbr nocallbr + +; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_header_and_regular +; CYCLES-BEFORE-NEXT: depth=1: entries(callbr_header) callbr_regular mid +; CYCLES-BEFORE-NEXT: depth=2: entries(callbr_regular mid) +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_header_and_regular +; CYCLES-AFTER-NEXT: depth=1: entries(callbr_header) callbr_regular mid callbr_header.target.mid callbr_header.target.callbr_regular irr.guard +; CYCLES-AFTER-NEXT: depth=2: entries(irr.guard) callbr_regular mid + +; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_only +; CYCLES-BEFORE-NEXT: depth=1: entries(callbr_block callbr_header) +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_only +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) callbr_block callbr_header callbr_header.target.callbr_block + +; CYCLES-BEFORE-NEXT: CycleInfo 
for function: entry_multiple_callbr +; CYCLES-BEFORE-NEXT: depth=1: entries(cb2 block block1) +; CYCLES-BEFORE-NEXT: depth=2: entries(block block1) +; CYCLES-AFTER-NEXT: CycleInfo for function: entry_multiple_callbr +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) cb2 block block1 irr.guard1 cb2.target.block1 cb2.target.block irr.guard2 +; CYCLES-AFTER-NEXT: depth=2: entries(irr.guard2) block block1 + +; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_exit_with_separate_entries +; CYCLES-BEFORE-NEXT: depth=1: entries(l2 l1) cb +; CYCLES-BEFORE-NEXT: depth=2: entries(l1 cb) +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_exit_with_separate_entries +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) l2 l1 cb cb.target.l1 irr.guard1 +; CYCLES-AFTER-NEXT: depth=2: entries(irr.guard1) l1 cb cb.target.l1 + +; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_exit_with_separate_entries1 +; CYCLES-BEFORE-NEXT: depth=1: entries(loop2 loop1) cb +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_exit_with_separate_entries1 +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) loop2 loop1 cb cb.target.loop2 + +; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_only_multiple +; CYCLES-BEFORE-NEXT: depth=1: entries(cb3 cb1 cb2) +; CYCLES-BEFORE-NEXT: depth=2: entries(cb1 cb2) +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_only_multiple +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) cb3 cb1 cb2 cb2.target.cb3 cb1.target.cb3 irr.guard1 cb2.target.cb1 cb3.target.cb1 irr.guard2 +; CYCLES-AFTER-NEXT: depth=2: entries(irr.guard2) cb1 cb2 cb2.target.cb1 + +; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_bypass +; CYCLES-BEFORE-NEXT: depth=1: entries(l1 cb) l2 +; CYCLES-BEFORE-NEXT: depth=2: entries(cb l2) +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_bypass +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) l1 cb l2 cb.target.l1 irr.guard1 +; CYCLES-AFTER-NEXT: depth=2: entries(irr.guard1) cb l2 + +; CYCLES-BEFORE-NEXT: CycleInfo for function: 
callbr_multiple_with_exit +; CYCLES-BEFORE-NEXT: depth=1: entries(l3 l1 l2) +; CYCLES-BEFORE-NEXT: depth=2: entries(l1 l2) +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_multiple_with_exit +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) l3 l1 l2 irr.guard1 irr.guard2 +; CYCLES-AFTER-NEXT: depth=2: entries(irr.guard2) l1 l2 + +; CYCLES-BEFORE-NEXT: CycleInfo for function: callbr_nested +; CYCLES-BEFORE-NEXT: depth=1: entries(bb bh) +; CYCLES-BEFORE-NEXT: depth=1: entries(b h) +; CYCLES-AFTER-NEXT: CycleInfo for function: callbr_nested +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard) bb bh +; CYCLES-AFTER-NEXT: depth=1: entries(irr.guard1) b h + +; Fix the irreducible loop in which callbr is the entry (see description at the +; top of FixIrreducible.cpp). +define void @callbr_entry(i1 %c) { +; CHECK-LABEL: define void @callbr_entry( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[CALLBR:.*:]] +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[CALLBR_TARGET_FALLTHROUGH:.*]] [label %callbr.target.indirect] +; CHECK: [[FALLTHROUGH:.*]]: +; CHECK-NEXT: br i1 [[C]], label %[[IRR_GUARD:.*]], label %[[RET:.*]] +; CHECK: [[INDIRECT:.*]]: +; CHECK-NEXT: br label %[[FALLTHROUGH]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CALLBR_TARGET_FALLTHROUGH]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[CALLBR_TARGET_INDIRECT:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_INDIRECT:%.*]] = phi i1 [ true, %[[FALLTHROUGH]] ], [ false, %[[CALLBR_TARGET_FALLTHROUGH]] ], [ true, %[[CALLBR_TARGET_INDIRECT]] ] +; CHECK-NEXT: br i1 [[GUARD_INDIRECT]], label %[[INDIRECT]], label %[[FALLTHROUGH]] +; +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %c, label %indirect, label %ret +indirect: + br label %fallthrough +ret: + ret void +} + +define i32 @callbr_entry_targets_with_phi_nodes(i1 %c) { +; CHECK-LABEL: define i32 @callbr_entry_targets_with_phi_nodes( 
+; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[ENTRY_TARGET_BLOCK:.*]] [label %entry.target.block1] +; CHECK: [[BLOCK:.*]]: +; CHECK-NEXT: [[A:%.*]] = phi i32 [ 1, %[[BLOCK1:.*]] ], [ [[A_MOVED:%.*]], %[[IRR_GUARD:.*]] ] +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[BLOCK1]]: +; CHECK-NEXT: br i1 [[C]], label %[[BLOCK]], label %[[RET:.*]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret i32 [[B_MOVED:%.*]] +; CHECK: [[ENTRY_TARGET_BLOCK]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[ENTRY_TARGET_BLOCK1:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[A_MOVED]] = phi i32 [ poison, %[[BLOCK]] ], [ 42, %[[ENTRY_TARGET_BLOCK]] ], [ poison, %[[ENTRY_TARGET_BLOCK1]] ] +; CHECK-NEXT: [[B_MOVED]] = phi i32 [ [[A]], %[[BLOCK]] ], [ poison, %[[ENTRY_TARGET_BLOCK]] ], [ 43, %[[ENTRY_TARGET_BLOCK1]] ] +; CHECK-NEXT: [[GUARD_BLOCK1:%.*]] = phi i1 [ true, %[[BLOCK]] ], [ false, %[[ENTRY_TARGET_BLOCK]] ], [ true, %[[ENTRY_TARGET_BLOCK1]] ] +; CHECK-NEXT: br i1 [[GUARD_BLOCK1]], label %[[BLOCK1]], label %[[BLOCK]] +; +entry: + callbr void asm "", "!i"() to label %block [label %block1] +block: + %a = phi i32 [42, %entry], [1, %block1] + br label %block1 +block1: + %b = phi i32 [43, %entry], [%a, %block] + br i1 %c, label %block, label %ret +ret: + ret i32 %b +} + +define void @callbr_entry_multiple_indirect_targets(i1 %c) { +; CHECK-LABEL: define void @callbr_entry_multiple_indirect_targets( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[CALLBR:.*:]] +; CHECK-NEXT: callbr void asm "", "!i,!i,!i"() +; CHECK-NEXT: to label %[[CALLBR_TARGET_FALLTHROUGH:.*]] [label %[[CALLBR_TARGET_INDIRECT:.*]], label %[[INDIRECT1:.*]], label %indirect2] +; CHECK: [[INDIRECT3:.*]]: +; CHECK-NEXT: br i1 [[C]], label %[[IRR_GUARD:.*]], label %[[RET:.*]] +; CHECK: [[INDIRECT:.*]]: +; CHECK-NEXT: br label %[[INDIRECT3]] +; CHECK: [[INDIRECT1]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; 
CHECK: [[INDIRECT2:.*:]] +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CALLBR_TARGET_FALLTHROUGH]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[CALLBR_TARGET_INDIRECT]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_INDIRECT:%.*]] = phi i1 [ true, %[[INDIRECT3]] ], [ true, %[[INDIRECT1]] ], [ false, %[[CALLBR_TARGET_FALLTHROUGH]] ], [ true, %[[CALLBR_TARGET_INDIRECT]] ] +; CHECK-NEXT: br i1 [[GUARD_INDIRECT]], label %[[INDIRECT]], label %[[INDIRECT3]] +; +callbr: + callbr void asm "", "!i,!i,!i"() to label %fallthrough [label %indirect, label %indirect1, label %indirect2] +fallthrough: + br i1 %c, label %indirect, label %ret +indirect: + br label %fallthrough +indirect1: + br label %indirect +indirect2: + br label %ret +ret: + ret void +} + +define void @callbr_entry_multiple_indirect_targets1(i1 %c, i1 %d) { +; CHECK-LABEL: define void @callbr_entry_multiple_indirect_targets1( +; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]]) { +; CHECK-NEXT: [[CALLBR:.*:]] +; CHECK-NEXT: callbr void asm "", "!i,!i,!i"() +; CHECK-NEXT: to label %[[CALLBR_TARGET_FALLTHROUGH:.*]] [label %[[CALLBR_TARGET_INDIRECT:.*]], label %[[CALLBR_TARGET_INDIRECT1:.*]], label %indirect2] +; CHECK: [[INDIRECT3:.*]]: +; CHECK-NEXT: br i1 [[C]], label %[[IRR_GUARD2:.*]], label %[[RET:.*]] +; CHECK: [[INDIRECT:.*]]: +; CHECK-NEXT: br i1 [[D]], label %[[INDIRECT3]], label %[[IRR_GUARD:.*]] +; CHECK: [[INDIRECT1:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD2]] +; CHECK: [[INDIRECT2:.*:]] +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CALLBR_TARGET_FALLTHROUGH]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[CALLBR_TARGET_INDIRECT]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[CALLBR_TARGET_INDIRECT1]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_INDIRECT1:%.*]] = phi i1 [ true, %[[INDIRECT]] ], [ false, 
%[[CALLBR_TARGET_FALLTHROUGH]] ], [ false, %[[CALLBR_TARGET_INDIRECT]] ], [ true, %[[CALLBR_TARGET_INDIRECT1]] ] +; CHECK-NEXT: [[GUARD_FALLTHROUGH:%.*]] = phi i1 [ false, %[[INDIRECT]] ], [ true, %[[CALLBR_TARGET_FALLTHROUGH]] ], [ false, %[[CALLBR_TARGET_INDIRECT]] ], [ false, %[[CALLBR_TARGET_INDIRECT1]] ] +; CHECK-NEXT: [[GUARD_FALLTHROUGH_INV:%.*]] = xor i1 [[GUARD_FALLTHROUGH]], true +; CHECK-NEXT: br i1 [[GUARD_INDIRECT1]], label %[[INDIRECT1]], label %[[IRR_GUARD1:.*]] +; CHECK: [[IRR_GUARD1]]: +; CHECK-NEXT: br label %[[IRR_GUARD2]] +; CHECK: [[IRR_GUARD2]]: +; CHECK-NEXT: [[GUARD_INDIRECT:%.*]] = phi i1 [ true, %[[INDIRECT3]] ], [ [[GUARD_FALLTHROUGH_INV]], %[[IRR_GUARD1]] ], [ true, %[[INDIRECT1]] ] +; CHECK-NEXT: br i1 [[GUARD_INDIRECT]], label %[[INDIRECT]], label %[[INDIRECT3]] +; +callbr: + callbr void asm "", "!i,!i,!i"() to label %fallthrough [label %indirect, label %indirect1, label %indirect2] +fallthrough: + br i1 %c, label %indirect, label %ret +indirect: + br i1 %d, label %fallthrough, label %indirect1 +indirect1: + br label %indirect +indirect2: + br label %ret +ret: + ret void +} + +; Fix the irreducible loop in which callbr is the header (see the example at the +; top of FixIrreducible.cpp). 
+define void @callbr_header_no_indirect(i1 %c, i1 %d) { +; CHECK-LABEL: define void @callbr_header_no_indirect( +; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]]) { +; CHECK-NEXT: [[D_INV:%.*]] = xor i1 [[D]], true +; CHECK-NEXT: br label %[[IRR_GUARD:.*]] +; CHECK: [[CALLBR:.*]]: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label %[[CALLBR_TARGET_FALLTHROUGH:.*]] [] +; CHECK: [[FALLTHROUGH:.*]]: +; CHECK-NEXT: br i1 [[C]], label %[[CALLBR]], label %[[RET:.*]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CALLBR_TARGET_FALLTHROUGH]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_FALLTHROUGH:%.*]] = phi i1 [ true, %[[CALLBR_TARGET_FALLTHROUGH]] ], [ [[D_INV]], [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_FALLTHROUGH]], label %[[FALLTHROUGH]], label %[[CALLBR]] +; + br i1 %d, label %callbr, label %fallthrough +callbr: + callbr void asm "", ""() to label %fallthrough [] +fallthrough: + br i1 %c, label %callbr, label %ret +ret: + ret void +} + +; Fix the irreducible loop in which callbr is the header. 
+define void @callbr_header(i1 %c, i1 %d) { +; CHECK-LABEL: define void @callbr_header( +; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]]) { +; CHECK-NEXT: [[D_INV:%.*]] = xor i1 [[D]], true +; CHECK-NEXT: br label %[[IRR_GUARD:.*]] +; CHECK: [[CALLBR:.*]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[CALLBR_TARGET_FALLTHROUGH:.*]] [label %indirect] +; CHECK: [[INDIRECT:.*:]] +; CHECK-NEXT: br label %[[RET:.*]] +; CHECK: [[FALLTHROUGH:.*]]: +; CHECK-NEXT: br i1 [[C]], label %[[CALLBR]], label %[[RET]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CALLBR_TARGET_FALLTHROUGH]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_FALLTHROUGH:%.*]] = phi i1 [ true, %[[CALLBR_TARGET_FALLTHROUGH]] ], [ [[D_INV]], [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_FALLTHROUGH]], label %[[FALLTHROUGH]], label %[[CALLBR]] +; + br i1 %d, label %callbr, label %fallthrough +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +indirect: + br label %ret +fallthrough: + br i1 %c, label %callbr, label %ret +ret: + ret void +} + +define void @callbr_header_multiple_indirect_targets(i1 %c, i1 %d) { +; CHECK-LABEL: define void @callbr_header_multiple_indirect_targets( +; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]]) { +; CHECK-NEXT: [[D_INV:%.*]] = xor i1 [[D]], true +; CHECK-NEXT: br label %[[IRR_GUARD:.*]] +; CHECK: [[CALLBR:.*]]: +; CHECK-NEXT: callbr void asm "", "!i,!i"() +; CHECK-NEXT: to label %[[CALLBR_TARGET_FALLTHROUGH:.*]] [label %[[INDIRECT1:.*]], label %indirect1] +; CHECK: [[INDIRECT1]]: +; CHECK-NEXT: br label %[[RET:.*]] +; CHECK: [[INDIRECT2:.*:]] +; CHECK-NEXT: br label %[[CALLBR]] +; CHECK: [[FALLTHROUGH:.*]]: +; CHECK-NEXT: br i1 [[C]], label %[[CALLBR]], label %[[RET]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CALLBR_TARGET_FALLTHROUGH]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_FALLTHROUGH:%.*]] = phi i1 [ true, 
%[[CALLBR_TARGET_FALLTHROUGH]] ], [ [[D_INV]], [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_FALLTHROUGH]], label %[[FALLTHROUGH]], label %[[CALLBR]] +; + br i1 %d, label %callbr, label %fallthrough +callbr: + callbr void asm "", "!i,!i"() to label %fallthrough [label %indirect, label %indirect1] +indirect: + br label %ret +indirect1: + br label %callbr +fallthrough: + br i1 %c, label %callbr, label %ret +ret: + ret void +} + +; Fix the three usual irreducible loops (callbr isn't a part of one of them): +; - fallthrough, fallthrough1, fallthrough2 +; - indirect, indirect1, indirect2 +; - nocallbr, nocallbr1, nocallbr2 +define void @callbr_regular(i1 %c, i1 %d) { +; CHECK-LABEL: define void @callbr_regular( +; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[D]], label %[[CALLBR:.*]], label %[[NOCALLBR:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[FALLTHROUGH:.*]] [label %indirect] +; CHECK: [[FALLTHROUGH]]: +; CHECK-NEXT: br label %[[IRR_GUARD:.*]] +; CHECK: [[FALLTHROUGH1:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[FALLTHROUGH2:.*]]: +; CHECK-NEXT: br i1 [[D]], label %[[FALLTHROUGH1]], label %[[RET:.*]] +; CHECK: [[INDIRECT:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD1:.*]] +; CHECK: [[INDIRECT1:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD1]] +; CHECK: [[INDIRECT2:.*]]: +; CHECK-NEXT: br i1 [[D]], label %[[INDIRECT1]], label %[[RET]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br label %[[IRR_GUARD2:.*]] +; CHECK: [[NOCALLBR1:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD2]] +; CHECK: [[NOCALLBR2:.*]]: +; CHECK-NEXT: br i1 [[D]], label %[[NOCALLBR1]], label %[[RET]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_FALLTHROUGH2:%.*]] = phi i1 [ true, %[[FALLTHROUGH1]] ], [ [[C_INV]], %[[FALLTHROUGH]] ] +; CHECK-NEXT: br i1 [[GUARD_FALLTHROUGH2]], label %[[FALLTHROUGH2]], label %[[FALLTHROUGH1]] +; CHECK: 
[[IRR_GUARD1]]: +; CHECK-NEXT: [[GUARD_INDIRECT2:%.*]] = phi i1 [ true, %[[INDIRECT1]] ], [ [[C_INV]], %[[INDIRECT]] ] +; CHECK-NEXT: br i1 [[GUARD_INDIRECT2]], label %[[INDIRECT2]], label %[[INDIRECT1]] +; CHECK: [[IRR_GUARD2]]: +; CHECK-NEXT: [[GUARD_NOCALLBR2:%.*]] = phi i1 [ true, %[[NOCALLBR1]] ], [ [[C_INV]], %[[NOCALLBR]] ] +; CHECK-NEXT: br i1 [[GUARD_NOCALLBR2]], label %[[NOCALLBR2]], label %[[NOCALLBR1]] +; + br i1 %d, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %c, label %fallthrough1, label %fallthrough2 +fallthrough1: + br label %fallthrough2 +fallthrough2: + br i1 %d, label %fallthrough1, label %ret +indirect: + br i1 %c, label %indirect1, label %indirect2 +indirect1: + br label %indirect2 +indirect2: + br i1 %d, label %indirect1, label %ret +nocallbr: + br i1 %c, label %nocallbr1, label %nocallbr2 +nocallbr1: + br label %nocallbr2 +nocallbr2: + br i1 %d, label %nocallbr1, label %ret +ret: + ret void +} + +; Fix an irreducible loop in which callbr is a regular block (neither entry nor +; header). See the example at the top of FixIrreducible.cpp. 
+define void @callbr_regular1(i1 %c) { +; CHECK-LABEL: define void @callbr_regular1( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br label %[[IRR_GUARD:.*]] +; CHECK: [[NOCALLBR:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[CALLBR:.*]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[RET:.*]] [label %nocallbr] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_CALLBR:%.*]] = phi i1 [ true, %[[NOCALLBR]] ], [ [[C_INV]], [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_CALLBR]], label %[[CALLBR]], label %[[NOCALLBR]] +; + br i1 %c, label %nocallbr, label %callbr +nocallbr: + br label %callbr +callbr: + callbr void asm "", "!i"() to label %ret [label %nocallbr] +ret: + ret void +} + +; Fix an irreducible loop in which callbr is a regular block (neither entry nor +; header). See the example at the top of FixIrreducible.cpp. +define void @callbr_regular2(i1 %c) { +; CHECK-LABEL: define void @callbr_regular2( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br label %[[IRR_GUARD:.*]] +; CHECK: [[NOCALLBR:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[CALLBR:.*]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[NOCALLBR]] [label %ret] +; CHECK: [[RET:.*:]] +; CHECK-NEXT: ret void +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_CALLBR:%.*]] = phi i1 [ true, %[[NOCALLBR]] ], [ [[C_INV]], [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_CALLBR]], label %[[CALLBR]], label %[[NOCALLBR]] +; + br i1 %c, label %nocallbr, label %callbr +nocallbr: + br label %callbr +callbr: + callbr void asm "", "!i"() to label %nocallbr [label %ret] +ret: + ret void +} + +; Fix an irreducible loop with two callbr blocks, one as header and one as regular block. 
+define void @callbr_header_and_regular(i1 %c) { +; CHECK-LABEL: define void @callbr_header_and_regular( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: br label %[[CALLBR_HEADER:.*]] +; CHECK: [[CALLBR_HEADER]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[CALLBR_HEADER_TARGET_MID:.*]] [label %callbr_header.target.callbr_regular] +; CHECK: [[MID:.*]]: +; CHECK-NEXT: br i1 [[C]], label %[[IRR_GUARD:.*]], label %[[RET:.*]] +; CHECK: [[CALLBR_REGULAR:.*]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[CALLBR_HEADER]] [label %mid] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CALLBR_HEADER_TARGET_MID]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[CALLBR_HEADER_TARGET_CALLBR_REGULAR:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_CALLBR_REGULAR:%.*]] = phi i1 [ true, %[[MID]] ], [ false, %[[CALLBR_HEADER_TARGET_MID]] ], [ true, %[[CALLBR_HEADER_TARGET_CALLBR_REGULAR]] ] +; CHECK-NEXT: br i1 [[GUARD_CALLBR_REGULAR]], label %[[CALLBR_REGULAR]], label %[[MID]] +; + br label %callbr_header +callbr_header: + callbr void asm "", "!i"() to label %mid [label %callbr_regular] +mid: + br i1 %c, label %callbr_regular, label %ret +callbr_regular: + callbr void asm "", "!i"() to label %callbr_header [label %mid] +ret: + ret void +} + +; Fix an irreducible loop consisting only of callbr blocks (and ret). See the +; example at the top of FixIrreducible.cpp. 
+define void @callbr_only(i1 %c) { +; CHECK-LABEL: define void @callbr_only( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[CALLBR:.*:]] +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[CALLBR_ENTRY_TARGET_CALLBR_HEADER:.*]] [label %callbr_entry.target.callbr_block] +; CHECK: [[CALLBR_HEADER:.*]]: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label %[[CALLBR_HEADER_TARGET_CALLBR_BLOCK:.*]] [] +; CHECK: [[CALLBR_BLOCK:.*]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[CALLBR_HEADER]] [label %ret] +; CHECK: [[RET:.*:]] +; CHECK-NEXT: ret void +; CHECK: [[CALLBR_HEADER_TARGET_CALLBR_BLOCK]]: +; CHECK-NEXT: br label %[[IRR_GUARD:.*]] +; CHECK: [[CALLBR_ENTRY_TARGET_CALLBR_HEADER]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[CALLBR_ENTRY_TARGET_CALLBR_BLOCK:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_CALLBR_BLOCK:%.*]] = phi i1 [ true, %[[CALLBR_HEADER_TARGET_CALLBR_BLOCK]] ], [ false, %[[CALLBR_ENTRY_TARGET_CALLBR_HEADER]] ], [ true, %[[CALLBR_ENTRY_TARGET_CALLBR_BLOCK]] ] +; CHECK-NEXT: br i1 [[GUARD_CALLBR_BLOCK]], label %[[CALLBR_BLOCK]], label %[[CALLBR_HEADER]] +; +callbr_entry: + callbr void asm "", "!i"() to label %callbr_header [label %callbr_block] +callbr_header: + callbr void asm "", ""() to label %callbr_block [] +callbr_block: + callbr void asm "", "!i"() to label %callbr_header [label %ret] +ret: + ret void +} + +; Irreducible loop: entry leading to multiple callbr blocks. 
+define void @entry_multiple_callbr(i1 %a, i1 %b, i1 %c) { +; CHECK-LABEL: define void @entry_multiple_callbr( +; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 [[A]], label %[[CB1:.*]], label %[[IRR_GUARD:.*]] +; CHECK: [[CB1]]: +; CHECK-NEXT: callbr void asm "", "!i,!i"() +; CHECK-NEXT: to label %[[CB1_TARGET_BLOCK:.*]] [label %[[CB1_TARGET_CB2:.*]], label %cb1.target.block1] +; CHECK: [[BLOCK:.*]]: +; CHECK-NEXT: br i1 [[B]], label %[[IRR_GUARD]], label %[[BLOCK1:.*]] +; CHECK: [[CB2:.*]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[CB2_TARGET_BLOCK1:.*]] [label %cb2.target.block] +; CHECK: [[BLOCK1]]: +; CHECK-NEXT: br i1 [[C]], label %[[IRR_GUARD2:.*]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; CHECK: [[CB1_TARGET_BLOCK]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[CB1_TARGET_CB2]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[CB1_TARGET_BLOCK1:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_CB2:%.*]] = phi i1 [ true, %[[BLOCK]] ], [ false, %[[CB1_TARGET_BLOCK]] ], [ true, %[[CB1_TARGET_CB2]] ], [ false, %[[CB1_TARGET_BLOCK1]] ], [ true, %[[ENTRY]] ] +; CHECK-NEXT: [[GUARD_BLOCK:%.*]] = phi i1 [ false, %[[BLOCK]] ], [ true, %[[CB1_TARGET_BLOCK]] ], [ false, %[[CB1_TARGET_CB2]] ], [ false, %[[CB1_TARGET_BLOCK1]] ], [ false, %[[ENTRY]] ] +; CHECK-NEXT: br i1 [[GUARD_CB2]], label %[[CB2]], label %[[IRR_GUARD1:.*]] +; CHECK: [[IRR_GUARD1]]: +; CHECK-NEXT: br label %[[IRR_GUARD2]] +; CHECK: [[CB2_TARGET_BLOCK1]]: +; CHECK-NEXT: br label %[[IRR_GUARD2]] +; CHECK: [[CB2_TARGET_BLOCK:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD2]] +; CHECK: [[IRR_GUARD2]]: +; CHECK-NEXT: [[GUARD_BLOCK3:%.*]] = phi i1 [ true, %[[BLOCK1]] ], [ [[GUARD_BLOCK]], %[[IRR_GUARD1]] ], [ false, %[[CB2_TARGET_BLOCK1]] ], [ true, %[[CB2_TARGET_BLOCK]] ] +; CHECK-NEXT: br i1 [[GUARD_BLOCK3]], label %[[BLOCK]], label %[[BLOCK1]] +; 
+entry: + br i1 %a, label %cb1, label %cb2 +cb1: + callbr void asm "", "!i,!i"() to label %block [label %cb2, label %block1] +block: + br i1 %b, label %cb2, label %block1 +cb2: + callbr void asm "", "!i"() to label %block1 [label %block] +block1: + br i1 %c, label %block, label %exit +exit: + ret void +} + +; Irreducible loop: callbr as loop exit, with multiple entries +define void @callbr_exit_with_separate_entries(i1 %a, i1 %b, i1 %c) { +; CHECK-LABEL: define void @callbr_exit_with_separate_entries( +; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: [[A_INV:%.*]] = xor i1 [[A]], true +; CHECK-NEXT: br label %[[IRR_GUARD:.*]] +; CHECK: [[L1:.*]]: +; CHECK-NEXT: br i1 [[B]], label %[[CB:.*]], label %[[IRR_GUARD]] +; CHECK: [[L2:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD1:.*]] +; CHECK: [[CB]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[EXIT:.*]] [label %cb.target.l1] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_L2:%.*]] = phi i1 [ true, %[[L1]] ], [ [[A_INV]], %[[ENTRY]] ] +; CHECK-NEXT: br i1 [[GUARD_L2]], label %[[L2]], label %[[IRR_GUARD1]] +; CHECK: [[CB_TARGET_L1:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD1]] +; CHECK: [[IRR_GUARD1]]: +; CHECK-NEXT: [[GUARD_L1:%.*]] = phi i1 [ true, %[[CB_TARGET_L1]] ], [ true, %[[IRR_GUARD]] ], [ [[C_INV]], %[[L2]] ] +; CHECK-NEXT: br i1 [[GUARD_L1]], label %[[L1]], label %[[CB]] +; +entry: + br i1 %a, label %l1, label %l2 +l1: + br i1 %b, label %cb, label %l2 +l2: + br i1 %c, label %cb, label %l1 +cb: + callbr void asm "", "!i"() to label %exit [label %l1] +exit: + ret void +} + +define void @callbr_exit_with_separate_entries1(i1 %a, i1 %b) { +; CHECK-LABEL: define void @callbr_exit_with_separate_entries1( +; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[A_INV:%.*]] = xor i1 [[A]], true +; CHECK-NEXT: br label 
%[[IRR_GUARD:.*]] +; CHECK: [[LOOP1:.*]]: +; CHECK-NEXT: br i1 [[B]], label %[[CB:.*]], label %[[IRR_GUARD]] +; CHECK: [[LOOP2:.*]]: +; CHECK-NEXT: br label %[[LOOP1]] +; CHECK: [[CB]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[EXIT:.*]] [label %cb.target.loop2] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; CHECK: [[CB_TARGET_LOOP2:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_LOOP2:%.*]] = phi i1 [ true, %[[CB_TARGET_LOOP2]] ], [ true, %[[LOOP1]] ], [ [[A_INV]], %[[ENTRY]] ] +; CHECK-NEXT: br i1 [[GUARD_LOOP2]], label %[[LOOP2]], label %[[LOOP1]] +; +entry: + br i1 %a, label %loop1, label %loop2 +loop1: + br i1 %b, label %cb, label %loop2 +loop2: + br label %loop1 +cb: + callbr void asm "", "!i"() to label %exit [label %loop2] +exit: + ret void +} + +; Irreducible loop: all blocks are callbrs, with cross-edges +define void @callbr_only_multiple(i1 %a, i1 %b, i1 %c) { +; CHECK-LABEL: define void @callbr_only_multiple( +; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: callbr void asm "", "!i,!i"() +; CHECK-NEXT: to label %[[ENTRY_TARGET_CB1:.*]] [label %[[ENTRY_TARGET_CB2:.*]], label %entry.target.cb3] +; CHECK: [[CB1:.*]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[CB2:.*]] [label %cb1.target.cb3] +; CHECK: [[CB2]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[CB2_TARGET_CB3:.*]] [label %cb2.target.cb1] +; CHECK: [[CB3:.*]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[CB3_TARGET_CB1:.*]] [label %exit] +; CHECK: [[EXIT:.*:]] +; CHECK-NEXT: ret void +; CHECK: [[CB2_TARGET_CB3]]: +; CHECK-NEXT: br label %[[IRR_GUARD:.*]] +; CHECK: [[CB1_TARGET_CB3:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[ENTRY_TARGET_CB1]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[ENTRY_TARGET_CB2]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[ENTRY_TARGET_CB3:.*]]: +; 
CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_CB3:%.*]] = phi i1 [ true, %[[CB2_TARGET_CB3]] ], [ true, %[[CB1_TARGET_CB3]] ], [ false, %[[ENTRY_TARGET_CB1]] ], [ false, %[[ENTRY_TARGET_CB2]] ], [ true, %[[ENTRY_TARGET_CB3]] ] +; CHECK-NEXT: [[GUARD_CB1:%.*]] = phi i1 [ false, %[[CB2_TARGET_CB3]] ], [ false, %[[CB1_TARGET_CB3]] ], [ true, %[[ENTRY_TARGET_CB1]] ], [ false, %[[ENTRY_TARGET_CB2]] ], [ false, %[[ENTRY_TARGET_CB3]] ] +; CHECK-NEXT: br i1 [[GUARD_CB3]], label %[[CB3]], label %[[IRR_GUARD1:.*]] +; CHECK: [[IRR_GUARD1]]: +; CHECK-NEXT: br label %[[IRR_GUARD2:.*]] +; CHECK: [[CB2_TARGET_CB1:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD2]] +; CHECK: [[CB3_TARGET_CB1]]: +; CHECK-NEXT: br label %[[IRR_GUARD2]] +; CHECK: [[IRR_GUARD2]]: +; CHECK-NEXT: [[GUARD_CB13:%.*]] = phi i1 [ true, %[[CB2_TARGET_CB1]] ], [ [[GUARD_CB1]], %[[IRR_GUARD1]] ], [ true, %[[CB3_TARGET_CB1]] ] +; CHECK-NEXT: br i1 [[GUARD_CB13]], label %[[CB1]], label %[[CB2]] +; +entry: + callbr void asm "", "!i,!i"() to label %cb1 [label %cb2, label %cb3] +cb1: + callbr void asm "", "!i"() to label %cb2 [label %cb3] +cb2: + callbr void asm "", "!i"() to label %cb3 [label %cb1] +cb3: + callbr void asm "", "!i"() to label %cb1 [label %exit] +exit: + ret void +} + +; Irreducible loop: callbr as a "bypass" block +define void @callbr_bypass(i1 %a, i1 %b, i1 %c) { +; CHECK-LABEL: define void @callbr_bypass( +; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[B_INV:%.*]] = xor i1 [[B]], true +; CHECK-NEXT: [[A_INV:%.*]] = xor i1 [[A]], true +; CHECK-NEXT: br label %[[IRR_GUARD:.*]] +; CHECK: [[CB:.*]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[L2:.*]] [label %cb.target.l1] +; CHECK: [[L1:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD1:.*]] +; CHECK: [[L2]]: +; CHECK-NEXT: br i1 [[C]], label %[[IRR_GUARD1]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; CHECK: 
[[CB_TARGET_L1:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_L1:%.*]] = phi i1 [ true, %[[CB_TARGET_L1]] ], [ [[A_INV]], %[[ENTRY]] ] +; CHECK-NEXT: br i1 [[GUARD_L1]], label %[[L1]], label %[[IRR_GUARD1]] +; CHECK: [[IRR_GUARD1]]: +; CHECK-NEXT: [[GUARD_CB:%.*]] = phi i1 [ true, %[[L2]] ], [ true, %[[IRR_GUARD]] ], [ [[B_INV]], %[[L1]] ] +; CHECK-NEXT: br i1 [[GUARD_CB]], label %[[CB]], label %[[L2]] +; +entry: + br i1 %a, label %cb, label %l1 +cb: + callbr void asm "", "!i"() to label %l2 [label %l1] +l1: + br i1 %b, label %l2, label %cb +l2: + br i1 %c, label %cb, label %exit +exit: + ret void +} + +; Irreducible loop: callbr with multiple indirect targets, some looping, some exiting +define void @callbr_multiple_with_exit(i1 %a, i1 %b, i1 %c) { +; CHECK-LABEL: define void @callbr_multiple_with_exit( +; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: callbr void asm "", "!i,!i,!i"() +; CHECK-NEXT: to label %[[ENTRY_TARGET_L1:.*]] [label %[[ENTRY_TARGET_L2:.*]], label %[[EXIT:.*]], label %entry.target.l3] +; CHECK: [[L1:.*]]: +; CHECK-NEXT: br i1 [[A]], label %[[L2:.*]], label %[[IRR_GUARD:.*]] +; CHECK: [[L2]]: +; CHECK-NEXT: br i1 [[B]], label %[[IRR_GUARD2:.*]], label %[[EXIT]] +; CHECK: [[L3:.*]]: +; CHECK-NEXT: br i1 [[C]], label %[[IRR_GUARD2]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; CHECK: [[ENTRY_TARGET_L1]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[ENTRY_TARGET_L2]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[ENTRY_TARGET_L3:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_L3:%.*]] = phi i1 [ true, %[[L1]] ], [ false, %[[ENTRY_TARGET_L1]] ], [ false, %[[ENTRY_TARGET_L2]] ], [ true, %[[ENTRY_TARGET_L3]] ] +; CHECK-NEXT: [[GUARD_L1:%.*]] = phi i1 [ false, %[[L1]] ], [ true, %[[ENTRY_TARGET_L1]] ], [ false, %[[ENTRY_TARGET_L2]] ], [ false, %[[ENTRY_TARGET_L3]] ] +; 
CHECK-NEXT: br i1 [[GUARD_L3]], label %[[L3]], label %[[IRR_GUARD1:.*]] +; CHECK: [[IRR_GUARD1]]: +; CHECK-NEXT: br label %[[IRR_GUARD2]] +; CHECK: [[IRR_GUARD2]]: +; CHECK-NEXT: [[GUARD_L13:%.*]] = phi i1 [ true, %[[L2]] ], [ [[GUARD_L1]], %[[IRR_GUARD1]] ], [ true, %[[L3]] ] +; CHECK-NEXT: br i1 [[GUARD_L13]], label %[[L1]], label %[[L2]] +; +entry: + callbr void asm "", "!i,!i,!i"() to label %l1 [label %l2, label %exit, label %l3] +l1: + br i1 %a, label %l2, label %l3 +l2: + br i1 %b, label %l1, label %exit +l3: + br i1 %c, label %l1, label %exit +exit: + ret void +} + +define void @callbr_nested(i1 %c, i1 %d) { +; CHECK-LABEL: define void @callbr_nested( +; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[ENTRY_TARGET_H:.*]] [label %entry.target.b] +; CHECK: [[H:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD1:.*]] +; CHECK: [[B:.*]]: +; CHECK-NEXT: callbr void asm "", "!i,!i"() +; CHECK-NEXT: to label %[[H]] [label %[[B_TARGET_BH:.*]], label %b.target.bb] +; CHECK: [[BH:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD:.*]] +; CHECK: [[BB:.*]]: +; CHECK-NEXT: br i1 [[C]], label %[[BH]], label %[[RET:.*]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; CHECK: [[B_TARGET_BH]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[B_TARGET_BB:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD]] +; CHECK: [[IRR_GUARD]]: +; CHECK-NEXT: [[GUARD_BB:%.*]] = phi i1 [ true, %[[BH]] ], [ false, %[[B_TARGET_BH]] ], [ true, %[[B_TARGET_BB]] ] +; CHECK-NEXT: br i1 [[GUARD_BB]], label %[[BB]], label %[[BH]] +; CHECK: [[ENTRY_TARGET_H]]: +; CHECK-NEXT: br label %[[IRR_GUARD1]] +; CHECK: [[ENTRY_TARGET_B:.*]]: +; CHECK-NEXT: br label %[[IRR_GUARD1]] +; CHECK: [[IRR_GUARD1]]: +; CHECK-NEXT: [[GUARD_B:%.*]] = phi i1 [ true, %[[H]] ], [ false, %[[ENTRY_TARGET_H]] ], [ true, %[[ENTRY_TARGET_B]] ] +; CHECK-NEXT: br i1 [[GUARD_B]], label %[[B]], label %[[H]] +; +entry: + callbr void asm "","!i"() to label %h [label 
%b] +h: + br label %b +b: + callbr void asm "","!i,!i"() to label %h [label %bh, label %bb] +bh: + br label %bb +bb: + br i1 %c, label %bh, label %ret +ret: + ret void +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; LOOPS-AFTER: {{.*}} +; LOOPS-BEFORE: {{.*}} diff --git a/llvm/test/Transforms/FixIrreducible/nested.ll b/llvm/test/Transforms/FixIrreducible/nested.ll index 0cc6b473d62f6..c9161cc14f208 100644 --- a/llvm/test/Transforms/FixIrreducible/nested.ll +++ b/llvm/test/Transforms/FixIrreducible/nested.ll @@ -50,6 +50,69 @@ exit: ret void } +define void @nested_irr_top_level_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5) { +; CHECK-LABEL: @nested_irr_top_level_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED0:%.*]]) +; CHECK-NEXT: to label [[ENTRY_TARGET_A1:%.*]] [label %entry.target.A2] +; CHECK: A1: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED1:%.*]]) +; CHECK-NEXT: to label [[A1_TARGET_B1:%.*]] [label %A1.target.B2] +; CHECK: B1: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED2:%.*]]) +; CHECK-NEXT: to label [[B1_TARGET_B2:%.*]] [label %A3] +; CHECK: B2: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED3:%.*]]) +; CHECK-NEXT: to label [[B1:%.*]] [label %A3] +; CHECK: A3: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED4:%.*]]) +; CHECK-NEXT: to label [[A3_TARGET_A2:%.*]] [label %exit] +; CHECK: A2: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED5:%.*]]) +; CHECK-NEXT: to label [[A1:%.*]] [label %exit] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: A3.target.A2: +; CHECK-NEXT: br label [[IRR_GUARD:%.*]] +; CHECK: entry.target.A1: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: entry.target.A2: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: irr.guard: +; CHECK-NEXT: [[GUARD_A2:%.*]] = phi i1 [ true, [[A3_TARGET_A2]] ], [ false, [[ENTRY_TARGET_A1]] ], [ true, [[ENTRY_TARGET_A2:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_A2]], 
label [[A2:%.*]], label [[A1]] +; CHECK: B1.target.B2: +; CHECK-NEXT: br label [[IRR_GUARD1:%.*]] +; CHECK: A1.target.B1: +; CHECK-NEXT: br label [[IRR_GUARD1]] +; CHECK: A1.target.B2: +; CHECK-NEXT: br label [[IRR_GUARD1]] +; CHECK: irr.guard1: +; CHECK-NEXT: [[GUARD_B2:%.*]] = phi i1 [ true, [[B1_TARGET_B2]] ], [ false, [[A1_TARGET_B1]] ], [ true, [[A1_TARGET_B2:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_B2]], label [[B2:%.*]], label [[B1]] +; +entry: + callbr void asm "", "r,!i"(i1 %Pred0) to label %A1 [label %A2] + +A1: + callbr void asm "", "r,!i"(i1 %Pred1) to label %B1 [label %B2] + +B1: + callbr void asm "", "r,!i"(i1 %Pred2) to label %B2 [label %A3] + +B2: + callbr void asm "", "r,!i"(i1 %Pred3) to label %B1 [label %A3] + +A3: + callbr void asm "", "r,!i"(i1 %Pred4) to label %A2 [label %exit] + +A2: + callbr void asm "", "r,!i"(i1 %Pred5) to label %A1 [label %exit] + +exit: + ret void +} + define void @nested_irr_in_loop(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5, i1 %Pred6) { ; CHECK-LABEL: @nested_irr_in_loop( ; CHECK-NEXT: entry: @@ -107,6 +170,80 @@ exit: ret void } +define void @nested_irr_in_loop_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5, i1 %Pred6) { +; CHECK-LABEL: @nested_irr_in_loop_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[H1:%.*]] +; CHECK: H1: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED0:%.*]]) +; CHECK-NEXT: to label [[H1_TARGET_A1:%.*]] [label %H1.target.A2] +; CHECK: A1: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED1:%.*]]) +; CHECK-NEXT: to label [[A1_TARGET_B1:%.*]] [label %A1.target.B2] +; CHECK: B1: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED2:%.*]]) +; CHECK-NEXT: to label [[B1_TARGET_B2:%.*]] [label %A3] +; CHECK: B2: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED3:%.*]]) +; CHECK-NEXT: to label [[B1:%.*]] [label %A3] +; CHECK: A3: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED4:%.*]]) +; CHECK-NEXT: to label [[A3_TARGET_A2:%.*]] [label %L1] +; 
CHECK: A2: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED5:%.*]]) +; CHECK-NEXT: to label [[A1:%.*]] [label %L1] +; CHECK: L1: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED6:%.*]]) +; CHECK-NEXT: to label [[EXIT:%.*]] [label %H1] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: A3.target.A2: +; CHECK-NEXT: br label [[IRR_GUARD:%.*]] +; CHECK: H1.target.A1: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: H1.target.A2: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: irr.guard: +; CHECK-NEXT: [[GUARD_A2:%.*]] = phi i1 [ true, [[A3_TARGET_A2]] ], [ false, [[H1_TARGET_A1]] ], [ true, [[H1_TARGET_A2:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_A2]], label [[A2:%.*]], label [[A1]] +; CHECK: B1.target.B2: +; CHECK-NEXT: br label [[IRR_GUARD1:%.*]] +; CHECK: A1.target.B1: +; CHECK-NEXT: br label [[IRR_GUARD1]] +; CHECK: A1.target.B2: +; CHECK-NEXT: br label [[IRR_GUARD1]] +; CHECK: irr.guard1: +; CHECK-NEXT: [[GUARD_B2:%.*]] = phi i1 [ true, [[B1_TARGET_B2]] ], [ false, [[A1_TARGET_B1]] ], [ true, [[A1_TARGET_B2:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_B2]], label [[B2:%.*]], label [[B1]] +; +entry: + br label %H1 + +H1: + callbr void asm "", "r,!i"(i1 %Pred0) to label %A1 [label %A2] + +A1: + callbr void asm "", "r,!i"(i1 %Pred1) to label %B1 [label %B2] + +B1: + callbr void asm "", "r,!i"(i1 %Pred2) to label %B2 [label %A3] + +B2: + callbr void asm "", "r,!i"(i1 %Pred3) to label %B1 [label %A3] + +A3: + callbr void asm "", "r,!i"(i1 %Pred4) to label %A2 [label %L1] + +A2: + callbr void asm "", "r,!i"(i1 %Pred5) to label %A1 [label %L1] + +L1: + callbr void asm "", "r,!i"(i1 %Pred6) to label %exit [label %H1] + +exit: + ret void +} + define void @loop_in_irr(i1 %Pred0, i1 %Pred1, i1 %Pred2) { ; CHECK-LABEL: @loop_in_irr( ; CHECK-NEXT: entry: @@ -150,6 +287,60 @@ exit: ret void } +define void @loop_in_irr_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2) { +; CHECK-LABEL: @loop_in_irr_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED0:%.*]]) +; 
CHECK-NEXT: to label [[ENTRY_TARGET_A1:%.*]] [label %entry.target.A2] +; CHECK: A1: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[H1:%.*]] [] +; CHECK: H1: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[L1:%.*]] [] +; CHECK: L1: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED1:%.*]]) +; CHECK-NEXT: to label [[H1]] [label %A3] +; CHECK: A3: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED2:%.*]]) +; CHECK-NEXT: to label [[A3_TARGET_A2:%.*]] [label %exit] +; CHECK: A2: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[A1:%.*]] [] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: A3.target.A2: +; CHECK-NEXT: br label [[IRR_GUARD:%.*]] +; CHECK: entry.target.A1: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: entry.target.A2: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: irr.guard: +; CHECK-NEXT: [[GUARD_A2:%.*]] = phi i1 [ true, [[A3_TARGET_A2]] ], [ false, [[ENTRY_TARGET_A1]] ], [ true, [[ENTRY_TARGET_A2:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_A2]], label [[A2:%.*]], label [[A1]] +; +entry: + callbr void asm "", "r,!i"(i1 %Pred0) to label %A1 [label %A2] + +A1: + callbr void asm "", ""() to label %H1 [] + +H1: + callbr void asm "", ""() to label %L1 [] + +L1: + callbr void asm "", "r,!i"(i1 %Pred1) to label %H1 [label %A3] + +A3: + callbr void asm "", "r,!i"(i1 %Pred2) to label %A2 [label %exit] + +A2: + callbr void asm "", ""() to label %A1 [] + +exit: + ret void +} + define void @loop_in_irr_shared_entry(i1 %Pred0, i1 %Pred1, i1 %Pred2) { ; CHECK-LABEL: @loop_in_irr_shared_entry( ; CHECK-NEXT: entry: @@ -188,6 +379,54 @@ exit: ret void } +define void @loop_in_irr_shared_entry_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2) { +; CHECK-LABEL: @loop_in_irr_shared_entry_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED0:%.*]]) +; CHECK-NEXT: to label [[ENTRY_TARGET_H1:%.*]] [label %entry.target.A2] +; CHECK: H1: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[L1:%.*]] 
[] +; CHECK: L1: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED1:%.*]]) +; CHECK-NEXT: to label [[H1:%.*]] [label %A3] +; CHECK: A3: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED2:%.*]]) +; CHECK-NEXT: to label [[A3_TARGET_A2:%.*]] [label %exit] +; CHECK: A2: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[H1]] [] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: A3.target.A2: +; CHECK-NEXT: br label [[IRR_GUARD:%.*]] +; CHECK: entry.target.H1: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: entry.target.A2: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: irr.guard: +; CHECK-NEXT: [[GUARD_A2:%.*]] = phi i1 [ true, [[A3_TARGET_A2]] ], [ false, [[ENTRY_TARGET_H1]] ], [ true, [[ENTRY_TARGET_A2:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_A2]], label [[A2:%.*]], label [[H1]] +; +entry: + callbr void asm "", "r,!i"(i1 %Pred0) to label %H1 [label %A2] + +H1: + callbr void asm "", ""() to label %L1 [] + +L1: + callbr void asm "", "r,!i"(i1 %Pred1) to label %H1 [label %A3] + +A3: + callbr void asm "", "r,!i"(i1 %Pred2) to label %A2 [label %exit] + +A2: + callbr void asm "", ""() to label %H1 [] + +exit: + ret void +} + define void @loop_in_irr_shared_header(i1 %Pred0, i1 %Pred1, i1 %Pred2) { ; CHECK-LABEL: @loop_in_irr_shared_header( ; CHECK-NEXT: entry: @@ -226,6 +465,56 @@ exit: ret void } +define void @loop_in_irr_shared_header_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2) { +; CHECK-LABEL: @loop_in_irr_shared_header_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED0:%.*]]) +; CHECK-NEXT: to label [[ENTRY_TARGET_A2:%.*]] [label %entry.target.H1] +; CHECK: H1: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[L1:%.*]] [] +; CHECK: L1: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED1:%.*]]) +; CHECK-NEXT: to label [[L1_TARGET_H1:%.*]] [label %A3] +; CHECK: A3: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED2:%.*]]) +; CHECK-NEXT: to label [[A2:%.*]] [label %exit] +; CHECK: A2: +; CHECK-NEXT: callbr 
void asm "", ""() +; CHECK-NEXT: to label [[A2_TARGET_H1:%.*]] [] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: A2.target.H1: +; CHECK-NEXT: br label [[IRR_GUARD:%.*]] +; CHECK: L1.target.H1: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: entry.target.A2: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: entry.target.H1: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: irr.guard: +; CHECK-NEXT: [[GUARD_H1:%.*]] = phi i1 [ true, [[A2_TARGET_H1]] ], [ true, [[L1_TARGET_H1]] ], [ false, [[ENTRY_TARGET_A2]] ], [ true, [[ENTRY_TARGET_H1:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_H1]], label [[H1:%.*]], label [[A2]] +; +entry: + callbr void asm "", "r,!i"(i1 %Pred0) to label %A2 [label %H1] + +H1: + callbr void asm "", ""() to label %L1 [] + +L1: + callbr void asm "", "r,!i"(i1 %Pred1) to label %H1 [label %A3] + +A3: + callbr void asm "", "r,!i"(i1 %Pred2) to label %A2 [label %exit] + +A2: + callbr void asm "", ""() to label %H1 [] + +exit: + ret void +} + define void @loop_irr_loop_shared_header(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3) { ; CHECK-LABEL: @loop_irr_loop_shared_header( ; CHECK-NEXT: entry: @@ -269,6 +558,62 @@ exit: ret void } +define void @loop_irr_loop_shared_header_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3) { +; CHECK-LABEL: @loop_irr_loop_shared_header_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[H2:%.*]] [] +; CHECK: H2: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED0:%.*]]) +; CHECK-NEXT: to label [[H2_TARGET_A2:%.*]] [label %H2.target.H1] +; CHECK: H1: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED1:%.*]]) +; CHECK-NEXT: to label [[A3:%.*]] [label %H1.target.H1] +; CHECK: A3: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED2:%.*]]) +; CHECK-NEXT: to label [[A2:%.*]] [label %L2] +; CHECK: A2: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[A2_TARGET_H1:%.*]] [] +; CHECK: L2: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED3:%.*]]) +; CHECK-NEXT: to label [[H2]] 
[label %exit] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: A2.target.H1: +; CHECK-NEXT: br label [[IRR_GUARD:%.*]] +; CHECK: H1.target.H1: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: H2.target.A2: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: H2.target.H1: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: irr.guard: +; CHECK-NEXT: [[GUARD_H1:%.*]] = phi i1 [ true, [[A2_TARGET_H1]] ], [ true, [[H1_TARGET_H1:%.*]] ], [ false, [[H2_TARGET_A2]] ], [ true, [[H2_TARGET_H1:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_H1]], label [[H1:%.*]], label [[A2]] +; +entry: + callbr void asm "", ""() to label %H2 [] + +H2: + callbr void asm "", "r,!i"(i1 %Pred0) to label %A2 [label %H1] + +H1: + callbr void asm "", "r,!i"(i1 %Pred1) to label %A3 [label %H1] + +A3: + callbr void asm "", "r,!i"(i1 %Pred2) to label %A2 [label %L2] + +A2: + callbr void asm "", ""() to label %H1 [] + +L2: + callbr void asm "", "r,!i"(i1 %Pred3) to label %H2 [label %exit] + +exit: + ret void +} + define void @siblings_top_level(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5, i1 %Pred6) { ; CHECK-LABEL: @siblings_top_level( ; CHECK-NEXT: entry: @@ -336,6 +681,93 @@ exit: ret void } +define void @siblings_top_level_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5, i1 %Pred6) { +; CHECK-LABEL: @siblings_top_level_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED0:%.*]]) +; CHECK-NEXT: to label [[H1:%.*]] [label %fork1] +; CHECK: H1: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED1:%.*]]) +; CHECK-NEXT: to label [[H1_TARGET_A1:%.*]] [label %H1.target.A2] +; CHECK: A1: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[A1_TARGET_A2:%.*]] [] +; CHECK: A2: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED2:%.*]]) +; CHECK-NEXT: to label [[A1:%.*]] [label %L1] +; CHECK: L1: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED3:%.*]]) +; CHECK-NEXT: to label [[H1]] [label %exit] +; CHECK: fork1: +; CHECK-NEXT: callbr 
void asm "", "r,!i"(i1 [[PRED4:%.*]]) +; CHECK-NEXT: to label [[FORK1_TARGET_B1:%.*]] [label %fork1.target.B2] +; CHECK: B1: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[H2:%.*]] [] +; CHECK: H2: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[L2:%.*]] [] +; CHECK: L2: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED5:%.*]]) +; CHECK-NEXT: to label [[H2]] [label %L2.target.B2] +; CHECK: B2: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED6:%.*]]) +; CHECK-NEXT: to label [[B1:%.*]] [label %exit] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: A1.target.A2: +; CHECK-NEXT: br label [[IRR_GUARD:%.*]] +; CHECK: H1.target.A1: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: H1.target.A2: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: irr.guard: +; CHECK-NEXT: [[GUARD_A2:%.*]] = phi i1 [ true, [[A1_TARGET_A2]] ], [ false, [[H1_TARGET_A1]] ], [ true, [[H1_TARGET_A2:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_A2]], label [[A2:%.*]], label [[A1]] +; CHECK: L2.target.B2: +; CHECK-NEXT: br label [[IRR_GUARD1:%.*]] +; CHECK: fork1.target.B1: +; CHECK-NEXT: br label [[IRR_GUARD1]] +; CHECK: fork1.target.B2: +; CHECK-NEXT: br label [[IRR_GUARD1]] +; CHECK: irr.guard1: +; CHECK-NEXT: [[GUARD_B2:%.*]] = phi i1 [ true, [[L2_TARGET_B2:%.*]] ], [ false, [[FORK1_TARGET_B1]] ], [ true, [[FORK1_TARGET_B2:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_B2]], label [[B2:%.*]], label [[B1]] +; +entry: + callbr void asm "", "r,!i"(i1 %Pred0) to label %H1 [label %fork1] + +H1: + callbr void asm "", "r,!i"(i1 %Pred1) to label %A1 [label %A2] + +A1: + callbr void asm "", ""() to label %A2 [] + +A2: + callbr void asm "", "r,!i"(i1 %Pred2) to label %A1 [label %L1] + +L1: + callbr void asm "", "r,!i"(i1 %Pred3) to label %H1 [label %exit] + +fork1: + callbr void asm "", "r,!i"(i1 %Pred4) to label %B1 [label %B2] + +B1: + callbr void asm "", ""() to label %H2 [] + +H2: + callbr void asm "", ""() to label %L2 [] + +L2: + callbr void asm "", "r,!i"(i1 %Pred5) to label %H2 
[label %B2] + +B2: + callbr void asm "", "r,!i"(i1 %Pred6) to label %B1 [label %exit] + +exit: + ret void +} + define void @siblings_in_loop(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5, i1 %Pred6, i1 %Pred7) { ; CHECK-LABEL: @siblings_in_loop( ; CHECK-NEXT: entry: @@ -413,6 +845,105 @@ exit: ret void } +define void @siblings_in_loop_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5, i1 %Pred6, i1 %Pred7) { +; CHECK-LABEL: @siblings_in_loop_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[H0:%.*]] [] +; CHECK: H0: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED0:%.*]]) +; CHECK-NEXT: to label [[H1:%.*]] [label %fork1] +; CHECK: H1: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED1:%.*]]) +; CHECK-NEXT: to label [[H1_TARGET_A1:%.*]] [label %H1.target.A2] +; CHECK: A1: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[A1_TARGET_A2:%.*]] [] +; CHECK: A2: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED2:%.*]]) +; CHECK-NEXT: to label [[A1:%.*]] [label %L1] +; CHECK: L1: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED3:%.*]]) +; CHECK-NEXT: to label [[H1]] [label %L0] +; CHECK: fork1: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED4:%.*]]) +; CHECK-NEXT: to label [[FORK1_TARGET_B1:%.*]] [label %fork1.target.B2] +; CHECK: B1: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[H2:%.*]] [] +; CHECK: H2: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[L2:%.*]] [] +; CHECK: L2: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED5:%.*]]) +; CHECK-NEXT: to label [[H2]] [label %L2.target.B2] +; CHECK: B2: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED6:%.*]]) +; CHECK-NEXT: to label [[B1:%.*]] [label %L0] +; CHECK: L0: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED7:%.*]]) +; CHECK-NEXT: to label [[EXIT:%.*]] [label %H0] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: A1.target.A2: +; CHECK-NEXT: br label 
[[IRR_GUARD:%.*]] +; CHECK: H1.target.A1: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: H1.target.A2: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: irr.guard: +; CHECK-NEXT: [[GUARD_A2:%.*]] = phi i1 [ true, [[A1_TARGET_A2]] ], [ false, [[H1_TARGET_A1]] ], [ true, [[H1_TARGET_A2:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_A2]], label [[A2:%.*]], label [[A1]] +; CHECK: L2.target.B2: +; CHECK-NEXT: br label [[IRR_GUARD1:%.*]] +; CHECK: fork1.target.B1: +; CHECK-NEXT: br label [[IRR_GUARD1]] +; CHECK: fork1.target.B2: +; CHECK-NEXT: br label [[IRR_GUARD1]] +; CHECK: irr.guard1: +; CHECK-NEXT: [[GUARD_B2:%.*]] = phi i1 [ true, [[L2_TARGET_B2:%.*]] ], [ false, [[FORK1_TARGET_B1]] ], [ true, [[FORK1_TARGET_B2:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_B2]], label [[B2:%.*]], label [[B1]] +; +entry: + callbr void asm "", ""() to label %H0 [] + +H0: + callbr void asm "", "r,!i"(i1 %Pred0) to label %H1 [label %fork1] + +H1: + callbr void asm "", "r,!i"(i1 %Pred1) to label %A1 [label %A2] + +A1: + callbr void asm "", ""() to label %A2 [] + +A2: + callbr void asm "", "r,!i"(i1 %Pred2) to label %A1 [label %L1] + +L1: + callbr void asm "", "r,!i"(i1 %Pred3) to label %H1 [label %L0] + +fork1: + callbr void asm "", "r,!i"(i1 %Pred4) to label %B1 [label %B2] + +B1: + callbr void asm "", ""() to label %H2 [] + +H2: + callbr void asm "", ""() to label %L2 [] + +L2: + callbr void asm "", "r,!i"(i1 %Pred5) to label %H2 [label %B2] + +B2: + callbr void asm "", "r,!i"(i1 %Pred6) to label %B1 [label %L0] + +L0: + callbr void asm "", "r,!i"(i1 %Pred7) to label %exit [label %H0] + +exit: + ret void +} + define void @irr_in_irr_shared_entry(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5, i1 %Pred6, i1 %Pred7, i1 %Pred8, i1 %Pred9, i1 %Pred10, i1 %Pred11, i1 %Pred12, i1 %Pred13) { ; CHECK-LABEL: @irr_in_irr_shared_entry( ; CHECK-NEXT: entry: @@ -527,3 +1058,148 @@ if.end8.i: exit: ret void } + +define void @irr_in_irr_shared_entry_callbr(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, 
i1 %Pred4, i1 %Pred5, i1 %Pred6, i1 %Pred7, i1 %Pred8, i1 %Pred9, i1 %Pred10, i1 %Pred11, i1 %Pred12, i1 %Pred13) { +; CHECK-LABEL: @irr_in_irr_shared_entry_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED0:%.*]]) +; CHECK-NEXT: to label [[IF_END:%.*]] [label %if.then] +; CHECK: if.end: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED1:%.*]]) +; CHECK-NEXT: to label [[IF_THEN7:%.*]] [label %if.else] +; CHECK: if.then7: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[IF_END16:%.*]] [] +; CHECK: if.else: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[IF_END16]] [] +; CHECK: if.end16: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED2:%.*]]) +; CHECK-NEXT: to label [[WHILE_COND_PREHEADER:%.*]] [label %if.then39] +; CHECK: while.cond.preheader: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[WHILE_COND:%.*]] [] +; CHECK: while.cond: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED3:%.*]]) +; CHECK-NEXT: to label [[WHILE_COND_TARGET_COND_TRUE49:%.*]] [label %lor.rhs] +; CHECK: cond.true49: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED4:%.*]]) +; CHECK-NEXT: to label [[IF_THEN69:%.*]] [label %cond.true49.target.while.body63] +; CHECK: while.body63: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED5:%.*]]) +; CHECK-NEXT: to label [[EXIT:%.*]] [label %while.cond47] +; CHECK: while.cond47: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED6:%.*]]) +; CHECK-NEXT: to label [[COND_TRUE49:%.*]] [label %while.cond47.target.cond.end61] +; CHECK: cond.end61: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED7:%.*]]) +; CHECK-NEXT: to label [[COND_END61_TARGET_WHILE_BODY63:%.*]] [label %while.cond] +; CHECK: if.then69: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED8:%.*]]) +; CHECK-NEXT: to label [[EXIT]] [label %while.cond] +; CHECK: lor.rhs: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED9:%.*]]) +; CHECK-NEXT: to label [[LOR_RHS_TARGET_COND_END61:%.*]] [label 
%while.end76] +; CHECK: while.end76: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[EXIT]] [] +; CHECK: if.then39: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED10:%.*]]) +; CHECK-NEXT: to label [[EXIT]] [label %if.end.i145] +; CHECK: if.end.i145: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED11:%.*]]) +; CHECK-NEXT: to label [[EXIT]] [label %if.end8.i149] +; CHECK: if.end8.i149: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[EXIT]] [] +; CHECK: if.then: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED12:%.*]]) +; CHECK-NEXT: to label [[EXIT]] [label %if.end.i] +; CHECK: if.end.i: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PRED13:%.*]]) +; CHECK-NEXT: to label [[EXIT]] [label %if.end8.i] +; CHECK: if.end8.i: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[EXIT]] [] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: while.cond47.target.cond.end61: +; CHECK-NEXT: br label [[IRR_GUARD:%.*]] +; CHECK: lor.rhs.target.cond.end61: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: while.cond.target.cond.true49: +; CHECK-NEXT: br label [[IRR_GUARD]] +; CHECK: irr.guard: +; CHECK-NEXT: [[GUARD_COND_END61:%.*]] = phi i1 [ true, [[WHILE_COND47_TARGET_COND_END61:%.*]] ], [ true, [[LOR_RHS_TARGET_COND_END61]] ], [ false, [[WHILE_COND_TARGET_COND_TRUE49]] ] +; CHECK-NEXT: br i1 [[GUARD_COND_END61]], label [[COND_END61:%.*]], label [[IRR_GUARD1:%.*]] +; CHECK: cond.true49.target.while.body63: +; CHECK-NEXT: br label [[IRR_GUARD1]] +; CHECK: cond.end61.target.while.body63: +; CHECK-NEXT: br label [[IRR_GUARD1]] +; CHECK: irr.guard1: +; CHECK-NEXT: [[GUARD_WHILE_BODY63:%.*]] = phi i1 [ true, [[COND_TRUE49_TARGET_WHILE_BODY63:%.*]] ], [ true, [[COND_END61_TARGET_WHILE_BODY63]] ], [ false, [[IRR_GUARD]] ] +; CHECK-NEXT: br i1 [[GUARD_WHILE_BODY63]], label [[WHILE_BODY63:%.*]], label [[COND_TRUE49]] +; +entry: + callbr void asm "", "r,!i"(i1 %Pred0) to label %if.end [label %if.then] + +if.end: + callbr void asm "", 
"r,!i"(i1 %Pred1) to label %if.then7 [label %if.else] + +if.then7: + callbr void asm "", ""() to label %if.end16 [] + +if.else: + callbr void asm "", ""() to label %if.end16 [] + +if.end16: + callbr void asm "", "r,!i"(i1 %Pred2) to label %while.cond.preheader [label %if.then39] + +while.cond.preheader: + callbr void asm "", ""() to label %while.cond [] + +while.cond: + callbr void asm "", "r,!i"(i1 %Pred3) to label %cond.true49 [label %lor.rhs] + +cond.true49: + callbr void asm "", "r,!i"(i1 %Pred4) to label %if.then69 [label %while.body63] + +while.body63: + callbr void asm "", "r,!i"(i1 %Pred5) to label %exit [label %while.cond47] + +while.cond47: + callbr void asm "", "r,!i"(i1 %Pred6) to label %cond.true49 [label %cond.end61] + +cond.end61: + callbr void asm "", "r,!i"(i1 %Pred7) to label %while.body63 [label %while.cond] + +if.then69: + callbr void asm "", "r,!i"(i1 %Pred8) to label %exit [label %while.cond] + +lor.rhs: + callbr void asm "", "r,!i"(i1 %Pred9) to label %cond.end61 [label %while.end76] + +while.end76: + callbr void asm "", ""() to label %exit [] + +if.then39: + callbr void asm "", "r,!i"(i1 %Pred10) to label %exit [label %if.end.i145] + +if.end.i145: + callbr void asm "", "r,!i"(i1 %Pred11) to label %exit [label %if.end8.i149] + +if.end8.i149: + callbr void asm "", ""() to label %exit [] + +if.then: + callbr void asm "", "r,!i"(i1 %Pred12) to label %exit [label %if.end.i] + +if.end.i: + callbr void asm "", "r,!i"(i1 %Pred13) to label %exit [label %if.end8.i] + +if.end8.i: + callbr void asm "", ""() to label %exit [] + +exit: + ret void +} diff --git a/llvm/test/Transforms/FixIrreducible/unreachable.ll b/llvm/test/Transforms/FixIrreducible/unreachable.ll index defbefb3ba812..845cf507c7fc0 100644 --- a/llvm/test/Transforms/FixIrreducible/unreachable.ll +++ b/llvm/test/Transforms/FixIrreducible/unreachable.ll @@ -25,3 +25,26 @@ loop.latch: loop.exit: ret void } + +; CHECK-LABEL: @unreachable_callbr( +; CHECK: entry: +; CHECK-NOT: irr.guard: 
+define void @unreachable_callbr(i32 %n, i1 %arg) { +entry: + callbr void asm "", ""() to label %loop.body [] + +loop.body: + callbr void asm "", ""() to label %inner.block [] + +unreachable.block: + callbr void asm "", ""() to label %inner.block [] + +inner.block: + callbr void asm "", "r,!i"(i1 %arg) to label %loop.exit [label %loop.latch] + +loop.latch: + callbr void asm "", ""() to label %loop.body [] + +loop.exit: + ret void +} diff --git a/llvm/test/Transforms/UnifyLoopExits/basic.ll b/llvm/test/Transforms/UnifyLoopExits/basic.ll index ccd15d4e6b943..d04d142f196d3 100644 --- a/llvm/test/Transforms/UnifyLoopExits/basic.ll +++ b/llvm/test/Transforms/UnifyLoopExits/basic.ll @@ -18,12 +18,12 @@ define void @loop_1(i1 %PredEntry, i1 %PredB, i1 %PredC, i1 %PredD) { ; CHECK: F: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: G: -; CHECK-NEXT: br label [[F:%.*]] +; CHECK-NEXT: br label [[Y:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void ; CHECK: loop.exit.guard: -; CHECK-NEXT: [[GUARD_E:%.*]] = phi i1 [ true, [[B]] ], [ false, [[C]] ], [ false, [[D]] ] -; CHECK-NEXT: br i1 [[GUARD_E]], label [[E:%.*]], label [[F]] +; CHECK-NEXT: [[GUARD_X:%.*]] = phi i1 [ true, [[B]] ], [ false, [[C]] ], [ false, [[D]] ] +; CHECK-NEXT: br i1 [[GUARD_X]], label [[X:%.*]], label [[Y]] ; entry: br i1 %PredEntry, label %A, label %G @@ -53,6 +53,67 @@ exit: ret void } +define void @loop_1_callbr(i1 %PredEntry, i1 %PredB, i1 %PredC, i1 %PredD) { +; CHECK-LABEL: @loop_1_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[G:%.*]] +; CHECK: A: +; CHECK-NEXT: br label [[B:%.*]] +; CHECK: B: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDB:%.*]]) +; CHECK-NEXT: to label [[C:%.*]] [label %B.target.E] +; CHECK: C: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDC:%.*]]) +; CHECK-NEXT: to label [[D:%.*]] [label %C.target.F] +; CHECK: D: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDD:%.*]]) +; CHECK-NEXT: to label [[A]] [label %D.target.F] +; CHECK: E: 
+; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: F: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: G: +; CHECK-NEXT: br label [[Y:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: B.target.E: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD:%.*]] +; CHECK: C.target.F: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: D.target.F: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[GUARD_X:%.*]] = phi i1 [ true, [[B_TARGET_E:%.*]] ], [ false, [[C_TARGET_F:%.*]] ], [ false, [[D_TARGET_F:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_X]], label [[X:%.*]], label [[Y]] +; +entry: + br i1 %PredEntry, label %A, label %G + +A: + br label %B + +B: + callbr void asm "", "r,!i"(i1 %PredB) to label %C [label %E] + +C: + callbr void asm "", "r,!i"(i1 %PredC) to label %D [label %F] + +D: + callbr void asm "", "r,!i"(i1 %PredD) to label %A [label %F] + +E: + br label %exit + +F: + br label %exit + +G: + br label %F + +exit: + ret void +} + define void @loop_2(i1 %PredA, i1 %PredB, i1 %PredC) { ; CHECK-LABEL: @loop_2( ; CHECK-NEXT: entry: @@ -107,3 +168,67 @@ Z: exit: ret void } + +define void @loop_2_callbr(i1 %PredA, i1 %PredB, i1 %PredC) { +; CHECK-LABEL: @loop_2_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[A:%.*]] +; CHECK: A: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDA:%.*]]) +; CHECK-NEXT: to label [[B:%.*]] [label %A.target.X] +; CHECK: B: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDB:%.*]]) +; CHECK-NEXT: to label [[C:%.*]] [label %B.target.Y] +; CHECK: C: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDC:%.*]]) +; CHECK-NEXT: to label [[D:%.*]] [label %C.target.Z] +; CHECK: D: +; CHECK-NEXT: br label [[A]] +; CHECK: X: +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: Y: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: Z: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: A.target.X: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD:%.*]] +; CHECK: B.target.Y: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; 
CHECK: C.target.Z: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[GUARD_X:%.*]] = phi i1 [ true, [[A_TARGET_X:%.*]] ], [ false, [[B_TARGET_Y:%.*]] ], [ false, [[C_TARGET_Z:%.*]] ] +; CHECK-NEXT: [[GUARD_Y:%.*]] = phi i1 [ false, [[A_TARGET_X]] ], [ true, [[B_TARGET_Y]] ], [ false, [[C_TARGET_Z]] ] +; CHECK-NEXT: br i1 [[GUARD_X]], label [[X:%.*]], label [[LOOP_EXIT_GUARD1:%.*]] +; CHECK: loop.exit.guard1: +; CHECK-NEXT: br i1 [[GUARD_Y]], label [[Y:%.*]], label [[Z:%.*]] +; +entry: + br label %A + +A: + callbr void asm "", "r,!i"(i1 %PredA) to label %B [label %X] + +B: + callbr void asm "", "r,!i"(i1 %PredB) to label %C [label %Y] + +C: + callbr void asm "", "r,!i"(i1 %PredC) to label %D [label %Z] + +D: + br label %A + +X: + br label %exit + +Y: + br label %exit + +Z: + br label %exit + +exit: + ret void +} diff --git a/llvm/test/Transforms/UnifyLoopExits/integer_guards.ll b/llvm/test/Transforms/UnifyLoopExits/integer_guards.ll index f55639ff2db37..be982d5d043f9 100644 --- a/llvm/test/Transforms/UnifyLoopExits/integer_guards.ll +++ b/llvm/test/Transforms/UnifyLoopExits/integer_guards.ll @@ -71,6 +71,85 @@ E: ret void } +define void @loop_two_exits_callbr(i1 %PredEntry, i1 %PredA) { +; CHECK-LABEL: @loop_two_exits_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[E:%.*]] +; CHECK: A: +; CHECK-NEXT: [[INC1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC2:%.*]], [[C:%.*]] ] +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDA:%.*]]) +; CHECK-NEXT: to label [[A_TARGET_B:%.*]] [label %C] +; CHECK: B: +; CHECK-NEXT: tail call fastcc void @check(i32 1) #[[ATTR0]] +; CHECK-NEXT: br label [[D:%.*]] +; CHECK: C: +; CHECK-NEXT: [[INC2]] = add i32 [[INC1]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC2]], 10 +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[CMP]]) +; CHECK-NEXT: to label [[A]] [label %C.target.E] +; CHECK: D: +; CHECK-NEXT: unreachable +; CHECK: E: +; CHECK-NEXT: ret void +; 
CHECK: A.target.B: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD:%.*]] +; CHECK: C.target.E: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[MERGED_BB_IDX:%.*]] = phi i32 [ 0, [[A_TARGET_B]] ], [ 1, [[C_TARGET_E:%.*]] ] +; CHECK-NEXT: [[B_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX]], 0 +; CHECK-NEXT: br i1 [[B_PREDICATE]], label [[B:%.*]], label [[E]] +; +; BOOLEAN-LABEL: @loop_two_exits_callbr( +; BOOLEAN-NEXT: entry: +; BOOLEAN-NEXT: br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[E:%.*]] +; BOOLEAN: A: +; BOOLEAN-NEXT: [[INC1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC2:%.*]], [[C:%.*]] ] +; BOOLEAN-NEXT: callbr void asm "", "r,!i"(i1 [[PREDA:%.*]]) +; BOOLEAN-NEXT: to label [[A_TARGET_B:%.*]] [label %C] +; BOOLEAN: B: +; BOOLEAN-NEXT: tail call fastcc void @check(i32 1) #[[ATTR0]] +; BOOLEAN-NEXT: br label [[D:%.*]] +; BOOLEAN: C: +; BOOLEAN-NEXT: [[INC2]] = add i32 [[INC1]], 1 +; BOOLEAN-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC2]], 10 +; BOOLEAN-NEXT: callbr void asm "", "r,!i"(i1 [[CMP]]) +; BOOLEAN-NEXT: to label [[A]] [label %C.target.E] +; BOOLEAN: D: +; BOOLEAN-NEXT: unreachable +; BOOLEAN: E: +; BOOLEAN-NEXT: ret void +; BOOLEAN: A.target.B: +; BOOLEAN-NEXT: br label [[LOOP_EXIT_GUARD:%.*]] +; BOOLEAN: C.target.E: +; BOOLEAN-NEXT: br label [[LOOP_EXIT_GUARD]] +; BOOLEAN: loop.exit.guard: +; BOOLEAN-NEXT: [[GUARD_B:%.*]] = phi i1 [ true, [[A_TARGET_B]] ], [ false, [[C_TARGET_E:%.*]] ] +; BOOLEAN-NEXT: br i1 [[GUARD_B]], label [[B:%.*]], label [[E]] +; +entry: + br i1 %PredEntry, label %A, label %E + +A: + %inc1 = phi i32 [ 0, %entry ], [ %inc2, %C ] + callbr void asm "", "r,!i"(i1 %PredA) to label %B [label %C] + +B: + tail call fastcc void @check(i32 1) #0 + br label %D + +C: + %inc2 = add i32 %inc1, 1 + %cmp = icmp ult i32 %inc2, 10 + callbr void asm "","r,!i"(i1 %cmp) to label %A [label %E] + +D: + unreachable + +E: + ret void +} + ; The loop exit blocks appear in an inner loop. 
define void @inner_loop(i1 %PredEntry, i1 %PredA, i1 %PredB) { @@ -196,6 +275,164 @@ I: ret void } +define void @inner_loop_callbr(i1 %PredEntry, i1 %PredA, i1 %PredB) { +; CHECK-LABEL: @inner_loop_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[I:%.*]] +; CHECK: A: +; CHECK-NEXT: [[OUTER1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER2:%.*]], [[G:%.*]] ] +; CHECK-NEXT: br label [[B:%.*]] +; CHECK: B: +; CHECK-NEXT: [[INNER1:%.*]] = phi i32 [ 0, [[A]] ], [ [[INNER2:%.*]], [[F:%.*]] ] +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDA:%.*]]) +; CHECK-NEXT: to label [[D:%.*]] [label %B.target.B.target.C] +; CHECK: C: +; CHECK-NEXT: tail call fastcc void @check(i32 1) #[[ATTR0]] +; CHECK-NEXT: br label [[H:%.*]] +; CHECK: D: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDB:%.*]]) +; CHECK-NEXT: to label [[D_TARGET_D_TARGET_E:%.*]] [label %F] +; CHECK: E: +; CHECK-NEXT: tail call fastcc void @check(i32 2) #[[ATTR0]] +; CHECK-NEXT: br label [[H]] +; CHECK: F: +; CHECK-NEXT: [[INNER2]] = add i32 [[INNER1]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i32 [[INNER2]], 20 +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[CMP1]]) +; CHECK-NEXT: to label [[B]] [label %F.target.G] +; CHECK: G: +; CHECK-NEXT: [[OUTER2]] = add i32 [[OUTER1]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[OUTER2]], 10 +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[CMP2]]) +; CHECK-NEXT: to label [[A]] [label %G.target.I] +; CHECK: H: +; CHECK-NEXT: unreachable +; CHECK: I: +; CHECK-NEXT: ret void +; CHECK: B.target.C: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD:%.*]] +; CHECK: D.target.E: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: G.target.I: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[MERGED_BB_IDX:%.*]] = phi i32 [ 0, [[B_TARGET_C:%.*]] ], [ 1, [[D_TARGET_E:%.*]] ], [ 2, [[G_TARGET_I:%.*]] ] +; CHECK-NEXT: [[C_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX]], 0 +; CHECK-NEXT: br i1 
[[C_PREDICATE]], label [[C:%.*]], label [[LOOP_EXIT_GUARD1:%.*]] +; CHECK: loop.exit.guard1: +; CHECK-NEXT: [[E_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX]], 1 +; CHECK-NEXT: br i1 [[E_PREDICATE]], label [[E:%.*]], label [[I]] +; CHECK: B.target.B.target.C: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD2:%.*]] +; CHECK: D.target.D.target.E: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD2]] +; CHECK: F.target.G: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD2]] +; CHECK: loop.exit.guard2: +; CHECK-NEXT: [[MERGED_BB_IDX4:%.*]] = phi i32 [ 0, [[B_TARGET_B_TARGET_C:%.*]] ], [ 1, [[D_TARGET_D_TARGET_E]] ], [ 2, [[F_TARGET_G:%.*]] ] +; CHECK-NEXT: [[B_TARGET_C_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX4]], 0 +; CHECK-NEXT: br i1 [[B_TARGET_C_PREDICATE]], label [[B_TARGET_C]], label [[LOOP_EXIT_GUARD3:%.*]] +; CHECK: loop.exit.guard3: +; CHECK-NEXT: [[D_TARGET_E_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX4]], 1 +; CHECK-NEXT: br i1 [[D_TARGET_E_PREDICATE]], label [[D_TARGET_E]], label [[G]] +; +; BOOLEAN-LABEL: @inner_loop_callbr( +; BOOLEAN-NEXT: entry: +; BOOLEAN-NEXT: br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[I:%.*]] +; BOOLEAN: A: +; BOOLEAN-NEXT: [[OUTER1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER2:%.*]], [[G:%.*]] ] +; BOOLEAN-NEXT: br label [[B:%.*]] +; BOOLEAN: B: +; BOOLEAN-NEXT: [[INNER1:%.*]] = phi i32 [ 0, [[A]] ], [ [[INNER2:%.*]], [[F:%.*]] ] +; BOOLEAN-NEXT: callbr void asm "", "r,!i"(i1 [[PREDA:%.*]]) +; BOOLEAN-NEXT: to label [[D:%.*]] [label %B.target.B.target.C] +; BOOLEAN: C: +; BOOLEAN-NEXT: tail call fastcc void @check(i32 1) #[[ATTR0]] +; BOOLEAN-NEXT: br label [[H:%.*]] +; BOOLEAN: D: +; BOOLEAN-NEXT: callbr void asm "", "r,!i"(i1 [[PREDB:%.*]]) +; BOOLEAN-NEXT: to label [[D_TARGET_D_TARGET_E:%.*]] [label %F] +; BOOLEAN: E: +; BOOLEAN-NEXT: tail call fastcc void @check(i32 2) #[[ATTR0]] +; BOOLEAN-NEXT: br label [[H]] +; BOOLEAN: F: +; BOOLEAN-NEXT: [[INNER2]] = add i32 [[INNER1]], 1 +; BOOLEAN-NEXT: [[CMP1:%.*]] = icmp ult i32 [[INNER2]], 20 
+; BOOLEAN-NEXT: callbr void asm "", "r,!i"(i1 [[CMP1]]) +; BOOLEAN-NEXT: to label [[B]] [label %F.target.G] +; BOOLEAN: G: +; BOOLEAN-NEXT: [[OUTER2]] = add i32 [[OUTER1]], 1 +; BOOLEAN-NEXT: [[CMP2:%.*]] = icmp ult i32 [[OUTER2]], 10 +; BOOLEAN-NEXT: callbr void asm "", "r,!i"(i1 [[CMP2]]) +; BOOLEAN-NEXT: to label [[A]] [label %G.target.I] +; BOOLEAN: H: +; BOOLEAN-NEXT: unreachable +; BOOLEAN: I: +; BOOLEAN-NEXT: ret void +; BOOLEAN: B.target.C: +; BOOLEAN-NEXT: br label [[LOOP_EXIT_GUARD:%.*]] +; BOOLEAN: D.target.E: +; BOOLEAN-NEXT: br label [[LOOP_EXIT_GUARD]] +; BOOLEAN: G.target.I: +; BOOLEAN-NEXT: br label [[LOOP_EXIT_GUARD]] +; BOOLEAN: loop.exit.guard: +; BOOLEAN-NEXT: [[GUARD_C:%.*]] = phi i1 [ true, [[B_TARGET_C:%.*]] ], [ false, [[D_TARGET_E:%.*]] ], [ false, [[G_TARGET_I:%.*]] ] +; BOOLEAN-NEXT: [[GUARD_E:%.*]] = phi i1 [ false, [[B_TARGET_C]] ], [ true, [[D_TARGET_E]] ], [ false, [[G_TARGET_I]] ] +; BOOLEAN-NEXT: br i1 [[GUARD_C]], label [[C:%.*]], label [[LOOP_EXIT_GUARD1:%.*]] +; BOOLEAN: loop.exit.guard1: +; BOOLEAN-NEXT: br i1 [[GUARD_E]], label [[E:%.*]], label [[I]] +; BOOLEAN: B.target.B.target.C: +; BOOLEAN-NEXT: br label [[LOOP_EXIT_GUARD2:%.*]] +; BOOLEAN: D.target.D.target.E: +; BOOLEAN-NEXT: br label [[LOOP_EXIT_GUARD2]] +; BOOLEAN: F.target.G: +; BOOLEAN-NEXT: br label [[LOOP_EXIT_GUARD2]] +; BOOLEAN: loop.exit.guard2: +; BOOLEAN-NEXT: [[GUARD_B_TARGET_C:%.*]] = phi i1 [ true, [[B_TARGET_B_TARGET_C:%.*]] ], [ false, [[D_TARGET_D_TARGET_E]] ], [ false, [[F_TARGET_G:%.*]] ] +; BOOLEAN-NEXT: [[GUARD_D_TARGET_E:%.*]] = phi i1 [ false, [[B_TARGET_B_TARGET_C]] ], [ true, [[D_TARGET_D_TARGET_E]] ], [ false, [[F_TARGET_G]] ] +; BOOLEAN-NEXT: br i1 [[GUARD_B_TARGET_C]], label [[B_TARGET_C]], label [[LOOP_EXIT_GUARD3:%.*]] +; BOOLEAN: loop.exit.guard3: +; BOOLEAN-NEXT: br i1 [[GUARD_D_TARGET_E]], label [[D_TARGET_E]], label [[G]] +; +entry: + br i1 %PredEntry, label %A, label %I + +A: + %outer1 = phi i32 [ 0, %entry ], [ %outer2, %G ] + br label 
%B + +B: + %inner1 = phi i32 [ 0, %A ], [ %inner2, %F ] + callbr void asm "", "r,!i"(i1 %PredA) to label %D [label %C] + +C: + tail call fastcc void @check(i32 1) #0 + br label %H + +D: + callbr void asm "", "r,!i"(i1 %PredB) to label %E [label %F] + +E: + tail call fastcc void @check(i32 2) #0 + br label %H + +F: + %inner2 = add i32 %inner1, 1 + %cmp1 = icmp ult i32 %inner2, 20 + callbr void asm "", "r,!i"(i1 %cmp1) to label %B [label %G] + +G: + %outer2 = add i32 %outer1, 1 + %cmp2 = icmp ult i32 %outer2, 10 + callbr void asm "", "r,!i"(i1 %cmp2) to label %A [label %I] + +H: + unreachable + +I: + ret void +} + ; A loop with more exit blocks. define void @loop_five_exits(i1 %PredEntry, i1 %PredA, i1 %PredB, i1 %PredC, i1 %PredD) { @@ -341,6 +578,179 @@ L: ret void } +define void @loop_five_exits_callbr(i1 %PredEntry, i1 %PredA, i1 %PredB, i1 %PredC, i1 %PredD) { +; CHECK-LABEL: @loop_five_exits_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[L:%.*]] +; CHECK: A: +; CHECK-NEXT: [[INC1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC2:%.*]], [[I:%.*]] ] +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDA:%.*]]) +; CHECK-NEXT: to label [[A_TARGET_B:%.*]] [label %C] +; CHECK: B: +; CHECK-NEXT: tail call fastcc void @check(i32 1) #[[ATTR0]] +; CHECK-NEXT: br label [[J:%.*]] +; CHECK: C: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDB:%.*]]) +; CHECK-NEXT: to label [[C_TARGET_D:%.*]] [label %E] +; CHECK: D: +; CHECK-NEXT: tail call fastcc void @check(i32 2) #[[ATTR0]] +; CHECK-NEXT: br label [[J]] +; CHECK: E: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDC:%.*]]) +; CHECK-NEXT: to label [[E_TARGET_F:%.*]] [label %G] +; CHECK: F: +; CHECK-NEXT: tail call fastcc void @check(i32 3) #[[ATTR0]] +; CHECK-NEXT: br label [[K:%.*]] +; CHECK: G: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDD:%.*]]) +; CHECK-NEXT: to label [[G_TARGET_H:%.*]] [label %I] +; CHECK: H: +; CHECK-NEXT: tail call fastcc void @check(i32 4) 
#[[ATTR0]] +; CHECK-NEXT: br label [[K]] +; CHECK: I: +; CHECK-NEXT: [[INC2]] = add i32 [[INC1]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC2]], 10 +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[CMP]]) +; CHECK-NEXT: to label [[A]] [label %I.target.L] +; CHECK: J: +; CHECK-NEXT: br label [[L]] +; CHECK: K: +; CHECK-NEXT: br label [[L]] +; CHECK: L: +; CHECK-NEXT: ret void +; CHECK: A.target.B: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD:%.*]] +; CHECK: C.target.D: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: E.target.F: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: G.target.H: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: I.target.L: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[MERGED_BB_IDX:%.*]] = phi i32 [ 0, [[A_TARGET_B]] ], [ 1, [[C_TARGET_D]] ], [ 2, [[E_TARGET_F]] ], [ 3, [[G_TARGET_H]] ], [ 4, [[I_TARGET_L:%.*]] ] +; CHECK-NEXT: [[B_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX]], 0 +; CHECK-NEXT: br i1 [[B_PREDICATE]], label [[B:%.*]], label [[LOOP_EXIT_GUARD1:%.*]] +; CHECK: loop.exit.guard1: +; CHECK-NEXT: [[D_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX]], 1 +; CHECK-NEXT: br i1 [[D_PREDICATE]], label [[D:%.*]], label [[LOOP_EXIT_GUARD2:%.*]] +; CHECK: loop.exit.guard2: +; CHECK-NEXT: [[F_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX]], 2 +; CHECK-NEXT: br i1 [[F_PREDICATE]], label [[F:%.*]], label [[LOOP_EXIT_GUARD3:%.*]] +; CHECK: loop.exit.guard3: +; CHECK-NEXT: [[H_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX]], 3 +; CHECK-NEXT: br i1 [[H_PREDICATE]], label [[H:%.*]], label [[L]] +; +; BOOLEAN-LABEL: @loop_five_exits_callbr( +; BOOLEAN-NEXT: entry: +; BOOLEAN-NEXT: br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[L:%.*]] +; BOOLEAN: A: +; BOOLEAN-NEXT: [[INC1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC2:%.*]], [[I:%.*]] ] +; BOOLEAN-NEXT: callbr void asm "", "r,!i"(i1 [[PREDA:%.*]]) +; BOOLEAN-NEXT: to label [[A_TARGET_B:%.*]] [label %C] +; BOOLEAN: B: +; BOOLEAN-NEXT: 
tail call fastcc void @check(i32 1) #[[ATTR0]] +; BOOLEAN-NEXT: br label [[J:%.*]] +; BOOLEAN: C: +; BOOLEAN-NEXT: callbr void asm "", "r,!i"(i1 [[PREDB:%.*]]) +; BOOLEAN-NEXT: to label [[C_TARGET_D:%.*]] [label %E] +; BOOLEAN: D: +; BOOLEAN-NEXT: tail call fastcc void @check(i32 2) #[[ATTR0]] +; BOOLEAN-NEXT: br label [[J]] +; BOOLEAN: E: +; BOOLEAN-NEXT: callbr void asm "", "r,!i"(i1 [[PREDC:%.*]]) +; BOOLEAN-NEXT: to label [[E_TARGET_F:%.*]] [label %G] +; BOOLEAN: F: +; BOOLEAN-NEXT: tail call fastcc void @check(i32 3) #[[ATTR0]] +; BOOLEAN-NEXT: br label [[K:%.*]] +; BOOLEAN: G: +; BOOLEAN-NEXT: callbr void asm "", "r,!i"(i1 [[PREDD:%.*]]) +; BOOLEAN-NEXT: to label [[G_TARGET_H:%.*]] [label %I] +; BOOLEAN: H: +; BOOLEAN-NEXT: tail call fastcc void @check(i32 4) #[[ATTR0]] +; BOOLEAN-NEXT: br label [[K]] +; BOOLEAN: I: +; BOOLEAN-NEXT: [[INC2]] = add i32 [[INC1]], 1 +; BOOLEAN-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC2]], 10 +; BOOLEAN-NEXT: callbr void asm "", "r,!i"(i1 [[CMP]]) +; BOOLEAN-NEXT: to label [[A]] [label %I.target.L] +; BOOLEAN: J: +; BOOLEAN-NEXT: br label [[L]] +; BOOLEAN: K: +; BOOLEAN-NEXT: br label [[L]] +; BOOLEAN: L: +; BOOLEAN-NEXT: ret void +; BOOLEAN: A.target.B: +; BOOLEAN-NEXT: br label [[LOOP_EXIT_GUARD:%.*]] +; BOOLEAN: C.target.D: +; BOOLEAN-NEXT: br label [[LOOP_EXIT_GUARD]] +; BOOLEAN: E.target.F: +; BOOLEAN-NEXT: br label [[LOOP_EXIT_GUARD]] +; BOOLEAN: G.target.H: +; BOOLEAN-NEXT: br label [[LOOP_EXIT_GUARD]] +; BOOLEAN: I.target.L: +; BOOLEAN-NEXT: br label [[LOOP_EXIT_GUARD]] +; BOOLEAN: loop.exit.guard: +; BOOLEAN-NEXT: [[GUARD_B:%.*]] = phi i1 [ true, [[A_TARGET_B]] ], [ false, [[C_TARGET_D]] ], [ false, [[E_TARGET_F]] ], [ false, [[G_TARGET_H]] ], [ false, [[I_TARGET_L:%.*]] ] +; BOOLEAN-NEXT: [[GUARD_D:%.*]] = phi i1 [ false, [[A_TARGET_B]] ], [ true, [[C_TARGET_D]] ], [ false, [[E_TARGET_F]] ], [ false, [[G_TARGET_H]] ], [ false, [[I_TARGET_L]] ] +; BOOLEAN-NEXT: [[GUARD_F:%.*]] = phi i1 [ false, [[A_TARGET_B]] ], [ false, 
[[C_TARGET_D]] ], [ true, [[E_TARGET_F]] ], [ false, [[G_TARGET_H]] ], [ false, [[I_TARGET_L]] ] +; BOOLEAN-NEXT: [[GUARD_H:%.*]] = phi i1 [ false, [[A_TARGET_B]] ], [ false, [[C_TARGET_D]] ], [ false, [[E_TARGET_F]] ], [ true, [[G_TARGET_H]] ], [ false, [[I_TARGET_L]] ] +; BOOLEAN-NEXT: br i1 [[GUARD_B]], label [[B:%.*]], label [[LOOP_EXIT_GUARD1:%.*]] +; BOOLEAN: loop.exit.guard1: +; BOOLEAN-NEXT: br i1 [[GUARD_D]], label [[D:%.*]], label [[LOOP_EXIT_GUARD2:%.*]] +; BOOLEAN: loop.exit.guard2: +; BOOLEAN-NEXT: br i1 [[GUARD_F]], label [[F:%.*]], label [[LOOP_EXIT_GUARD3:%.*]] +; BOOLEAN: loop.exit.guard3: +; BOOLEAN-NEXT: br i1 [[GUARD_H]], label [[H:%.*]], label [[L]] +; +entry: + br i1 %PredEntry, label %A, label %L + +A: + %inc1 = phi i32 [ 0, %entry ], [ %inc2, %I ] + callbr void asm "", "r,!i"(i1 %PredA) to label %B [label %C] + +B: + tail call fastcc void @check(i32 1) #0 + br label %J + +C: + callbr void asm "", "r,!i"(i1 %PredB) to label %D [label %E] + +D: + tail call fastcc void @check(i32 2) #0 + br label %J + +E: + callbr void asm "", "r,!i"(i1 %PredC) to label %F [label %G] + +F: + tail call fastcc void @check(i32 3) #0 + br label %K + +G: + callbr void asm "", "r,!i"(i1 %PredD) to label %H [label %I] + +H: + tail call fastcc void @check(i32 4) #0 + br label %K + +I: + %inc2 = add i32 %inc1, 1 + %cmp = icmp ult i32 %inc2, 10 + callbr void asm "", "r,!i"(i1 %cmp) to label %A [label %L] + +J: + br label %L + +K: + br label %L + +L: + ret void +} + declare void @check(i32 noundef %i) #0 diff --git a/llvm/test/Transforms/UnifyLoopExits/nested.ll b/llvm/test/Transforms/UnifyLoopExits/nested.ll index 8fae2c4349a7b..2ec576a2efa89 100644 --- a/llvm/test/Transforms/UnifyLoopExits/nested.ll +++ b/llvm/test/Transforms/UnifyLoopExits/nested.ll @@ -78,3 +78,145 @@ exit: %exit.phi = phi i32 [%A4.phi, %A5], [%Z, %C] ret void } + +define void @nested_callbr(i1 %PredB3, i1 %PredB4, i1 %PredA4, i1 %PredA3, i32 %X, i32 %Y, i32 %Z) { +; CHECK-LABEL: @nested_callbr( +; 
CHECK-NEXT: entry: +; CHECK-NEXT: br label [[A1:%.*]] +; CHECK: A1: +; CHECK-NEXT: br label [[B1:%.*]] +; CHECK: B1: +; CHECK-NEXT: br label [[B2:%.*]] +; CHECK: B2: +; CHECK-NEXT: [[X_INC:%.*]] = add i32 [[X:%.*]], 1 +; CHECK-NEXT: br label [[B3:%.*]] +; CHECK: B3: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDB3:%.*]]) +; CHECK-NEXT: to label [[B4:%.*]] [label %B3.target.A3] +; CHECK: B4: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDB4:%.*]]) +; CHECK-NEXT: to label [[B1]] [label %B4.target.A2] +; CHECK: A2: +; CHECK-NEXT: br label [[A4:%.*]] +; CHECK: A3: +; CHECK-NEXT: br label [[A4]] +; CHECK: A4: +; CHECK-NEXT: [[A4_PHI:%.*]] = phi i32 [ [[Y:%.*]], [[A3:%.*]] ], [ [[X_INC_MOVED:%.*]], [[A2:%.*]] ] +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDA4:%.*]]) +; CHECK-NEXT: to label [[A4_TARGET_C:%.*]] [label %A5] +; CHECK: A5: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[PREDA3:%.*]]) +; CHECK-NEXT: to label [[A5_TARGET_EXIT:%.*]] [label %A1] +; CHECK: C: +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[EXIT_PHI:%.*]] = phi i32 [ [[Z:%.*]], [[C:%.*]] ], [ [[EXIT_PHI_MOVED:%.*]], [[LOOP_EXIT_GUARD:%.*]] ] +; CHECK-NEXT: ret void +; CHECK: A4.target.C: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: A5.target.exit: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[EXIT_PHI_MOVED]] = phi i32 [ poison, [[A4_TARGET_C]] ], [ [[A4_PHI]], [[A5_TARGET_EXIT]] ] +; CHECK-NEXT: [[GUARD_C:%.*]] = phi i1 [ true, [[A4_TARGET_C]] ], [ false, [[A5_TARGET_EXIT]] ] +; CHECK-NEXT: br i1 [[GUARD_C]], label [[C]], label [[EXIT]] +; CHECK: B3.target.A3: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD1:%.*]] +; CHECK: B4.target.A2: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD1]] +; CHECK: loop.exit.guard1: +; CHECK-NEXT: [[X_INC_MOVED]] = phi i32 [ [[X_INC]], [[B3_TARGET_A3:%.*]] ], [ [[X_INC]], [[B4_TARGET_A2:%.*]] ] +; CHECK-NEXT: [[GUARD_A3:%.*]] = phi i1 [ true, [[B3_TARGET_A3]] ], [ false, [[B4_TARGET_A2]] ] +; 
CHECK-NEXT: br i1 [[GUARD_A3]], label [[A3]], label [[A2]] +; +entry: + br label %A1 + +A1: + br label %B1 + +B1: + br label %B2 + +B2: + %X.inc = add i32 %X, 1 + br label %B3 + +B3: + callbr void asm "", "r,!i"(i1 %PredB3) to label %B4 [label %A3] + +B4: + callbr void asm "", "r,!i"(i1 %PredB4) to label %B1 [label %A2] + +A2: + br label %A4 + +A3: + br label %A4 + +A4: + %A4.phi = phi i32 [%Y, %A3], [%X.inc, %A2] + callbr void asm "", "r,!i"(i1 %PredA4) to label %C [label %A5] + +A5: + callbr void asm "", "r,!i"(i1 %PredA3) to label %exit [label %A1] + +C: + br label %exit + +exit: + %exit.phi = phi i32 [%A4.phi, %A5], [%Z, %C] + ret void +} + +; Here, the newly created target loop that connects b to r1 needs to be part of +; the parent loop (the outer loop b participates in). Otherwise, it will be +; regarded as an additional loop entry point to this outer loop. +define void @nested_callbr_multiple_exits() { +; CHECK-LABEL: @nested_callbr_multiple_exits( +; CHECK-NEXT: br label [[A:%.*]] +; CHECK: a: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[B:%.*]] [] +; CHECK: b: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label [[C:%.*]] [label %b.target.b.target.r1] +; CHECK: c: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label [[C_TARGET_E:%.*]] [label %b] +; CHECK: e: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label [[A]] [label %e.target.r2] +; CHECK: r1: +; CHECK-NEXT: ret void +; CHECK: r2: +; CHECK-NEXT: ret void +; CHECK: b.target.r1: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD:%.*]] +; CHECK: e.target.r2: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[GUARD_R1:%.*]] = phi i1 [ true, [[B_TARGET_R1:%.*]] ], [ false, [[E_TARGET_R2:%.*]] ] +; CHECK-NEXT: br i1 [[GUARD_R1]], label [[R1:%.*]], label [[R2:%.*]] +; CHECK: b.target.b.target.r1: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD1:%.*]] +; CHECK: c.target.e: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD1]] +; 
CHECK: loop.exit.guard1: +; CHECK-NEXT: [[GUARD_B_TARGET_R1:%.*]] = phi i1 [ true, [[B_TARGET_B_TARGET_R1:%.*]] ], [ false, [[C_TARGET_E]] ] +; CHECK-NEXT: br i1 [[GUARD_B_TARGET_R1]], label [[B_TARGET_R1]], label [[E:%.*]] +; + br label %a +a: + callbr void asm "", ""() to label %b [] +b: + callbr void asm "", "!i"() to label %c [label %r1] +c: + callbr void asm "", "!i"() to label %e [label %b] +e: + callbr void asm "", "!i"() to label %a [label %r2] +r1: + ret void +r2: + ret void +} diff --git a/llvm/test/Transforms/UnifyLoopExits/restore-ssa.ll b/llvm/test/Transforms/UnifyLoopExits/restore-ssa.ll index 3e68df3e79260..ffe8026a535c0 100644 --- a/llvm/test/Transforms/UnifyLoopExits/restore-ssa.ll +++ b/llvm/test/Transforms/UnifyLoopExits/restore-ssa.ll @@ -57,6 +57,60 @@ return: ret i32 %phi } +define i32 @exiting-used-in-exit_callbr(ptr %arg1, ptr %arg2) local_unnamed_addr align 2 { +; CHECK-LABEL: @exiting-used-in-exit_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[A:%.*]] [] +; CHECK: A: +; CHECK-NEXT: [[MYTMP42:%.*]] = load i32, ptr [[ARG1:%.*]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[MYTMP42]], 0 +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[CMP1]]) +; CHECK-NEXT: to label [[B:%.*]] [label %A.target.return] +; CHECK: B: +; CHECK-NEXT: [[MYTMP41:%.*]] = load i32, ptr [[ARG2:%.*]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[MYTMP41]], 0 +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[CMP]]) +; CHECK-NEXT: to label [[A]] [label %B.target.C] +; CHECK: C: +; CHECK-NEXT: [[INC:%.*]] = add i32 [[MYTMP41_MOVED:%.*]], 1 +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[RETURN:%.*]] [] +; CHECK: return: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[INC]], [[C:%.*]] ], [ [[PHI_MOVED:%.*]], [[LOOP_EXIT_GUARD:%.*]] ] +; CHECK-NEXT: ret i32 [[PHI]] +; CHECK: A.target.return: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: B.target.C: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; 
CHECK: loop.exit.guard: +; CHECK-NEXT: [[MYTMP41_MOVED]] = phi i32 [ poison, [[A_TARGET_RETURN:%.*]] ], [ [[MYTMP41]], [[B_TARGET_C:%.*]] ] +; CHECK-NEXT: [[PHI_MOVED]] = phi i32 [ [[MYTMP42]], [[A_TARGET_RETURN]] ], [ poison, [[B_TARGET_C]] ] +; CHECK-NEXT: [[GUARD_RETURN:%.*]] = phi i1 [ true, [[A_TARGET_RETURN]] ], [ false, [[B_TARGET_C]] ] +; CHECK-NEXT: br i1 [[GUARD_RETURN]], label [[RETURN]], label [[C]] +; +entry: + callbr void asm "", ""() to label %A [] + +A: + %mytmp42 = load i32, ptr %arg1, align 4 + %cmp1 = icmp slt i32 %mytmp42, 0 + callbr void asm "", "r,!i"(i1 %cmp1) to label %B [label %return] + +B: + %mytmp41 = load i32, ptr %arg2, align 4 + %cmp = icmp slt i32 %mytmp41, 0 + callbr void asm "", "r,!i"(i1 %cmp) to label %A [label %C] + +C: + %inc = add i32 %mytmp41, 1 + callbr void asm "", ""() to label %return [] + +return: + %phi = phi i32 [ %inc, %C ], [ %mytmp42, %A ] + ret i32 %phi +} + ; Loop consists of A, B and C: ; - A is the header ; - A and C are exiting blocks @@ -112,6 +166,63 @@ return: ret i32 0 } +define i32 @internal-used-in-exit_callbr(ptr %arg1, ptr %arg2) local_unnamed_addr align 2 { +; CHECK-LABEL: @internal-used-in-exit_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MYTMP42:%.*]] = load i32, ptr [[ARG1:%.*]], align 4 +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[A:%.*]] [] +; CHECK: A: +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[MYTMP42]], 0 +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[CMP1]]) +; CHECK-NEXT: to label [[B:%.*]] [label %A.target.return] +; CHECK: B: +; CHECK-NEXT: [[MYTMP41:%.*]] = load i32, ptr [[ARG2:%.*]], align 4 +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[C:%.*]] [] +; CHECK: C: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[MYTMP42]], 0 +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[CMP]]) +; CHECK-NEXT: to label [[A]] [label %C.target.D] +; CHECK: D: +; CHECK-NEXT: [[INC:%.*]] = add i32 [[MYTMP41_MOVED:%.*]], 1 +; CHECK-NEXT: callbr void asm "", ""() +; 
CHECK-NEXT: to label [[RETURN:%.*]] [] +; CHECK: return: +; CHECK-NEXT: ret i32 0 +; CHECK: A.target.return: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD:%.*]] +; CHECK: C.target.D: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[MYTMP41_MOVED]] = phi i32 [ poison, [[A_TARGET_RETURN:%.*]] ], [ [[MYTMP41]], [[C_TARGET_D:%.*]] ] +; CHECK-NEXT: [[GUARD_RETURN:%.*]] = phi i1 [ true, [[A_TARGET_RETURN]] ], [ false, [[C_TARGET_D]] ] +; CHECK-NEXT: br i1 [[GUARD_RETURN]], label [[RETURN]], label [[D:%.*]] +; +entry: + %mytmp42 = load i32, ptr %arg1, align 4 + callbr void asm "", ""() to label %A [] + +A: + %cmp1 = icmp slt i32 %mytmp42, 0 + callbr void asm "", "r,!i"(i1 %cmp1) to label %B [label %return] + +B: + %mytmp41 = load i32, ptr %arg2, align 4 + callbr void asm "", ""() to label %C [] + +C: + %cmp = icmp slt i32 %mytmp42, 0 + callbr void asm "", "r,!i"(i1 %cmp) to label %A [label %D] + +D: + %inc = add i32 %mytmp41, 1 + callbr void asm "", ""() to label %return [] + +return: + ret i32 0 +} + ; Loop consists of A, B and C: ; - A is the header ; - A and C are exiting blocks @@ -172,6 +283,68 @@ return: ret i32 %phi } +define i32 @mixed-use-in-exit_callbr(ptr %arg1, ptr %arg2) local_unnamed_addr align 2 { +; CHECK-LABEL: @mixed-use-in-exit_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MYTMP42:%.*]] = load i32, ptr [[ARG1:%.*]], align 4 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[MYTMP42]], 0 +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[CMP2]]) +; CHECK-NEXT: to label [[A:%.*]] [label %return] +; CHECK: A: +; CHECK-NEXT: [[MYTMP43:%.*]] = add i32 [[MYTMP42]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[MYTMP42]], 0 +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[CMP1]]) +; CHECK-NEXT: to label [[B:%.*]] [label %A.target.return] +; CHECK: B: +; CHECK-NEXT: [[MYTMP41:%.*]] = load i32, ptr [[ARG2:%.*]], align 4 +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[C:%.*]] [] +; CHECK: C: +; CHECK-NEXT: [[CMP:%.*]] = 
icmp slt i32 [[MYTMP42]], 0 +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[CMP]]) +; CHECK-NEXT: to label [[A]] [label %C.target.D] +; CHECK: D: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[RETURN:%.*]] [] +; CHECK: return: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[MYTMP41_MOVED:%.*]], [[D:%.*]] ], [ [[MYTMP42]], [[ENTRY:%.*]] ], [ [[PHI_MOVED:%.*]], [[LOOP_EXIT_GUARD:%.*]] ] +; CHECK-NEXT: ret i32 [[PHI]] +; CHECK: A.target.return: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: C.target.D: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[MYTMP41_MOVED]] = phi i32 [ poison, [[A_TARGET_RETURN:%.*]] ], [ [[MYTMP41]], [[C_TARGET_D:%.*]] ] +; CHECK-NEXT: [[PHI_MOVED]] = phi i32 [ [[MYTMP43]], [[A_TARGET_RETURN]] ], [ poison, [[C_TARGET_D]] ] +; CHECK-NEXT: [[GUARD_RETURN:%.*]] = phi i1 [ true, [[A_TARGET_RETURN]] ], [ false, [[C_TARGET_D]] ] +; CHECK-NEXT: br i1 [[GUARD_RETURN]], label [[RETURN]], label [[D]] +; +entry: + %mytmp42 = load i32, ptr %arg1, align 4 + %cmp2 = icmp slt i32 %mytmp42, 0 + callbr void asm "", "r,!i"(i1 %cmp2) to label %A [label %return] + +A: + %mytmp43 = add i32 %mytmp42, 1 + %cmp1 = icmp slt i32 %mytmp42, 0 + callbr void asm "", "r,!i"(i1 %cmp1) to label %B [label %return] + +B: + %mytmp41 = load i32, ptr %arg2, align 4 + callbr void asm "", ""() to label %C [] + +C: + %cmp = icmp slt i32 %mytmp42, 0 + callbr void asm "", "r,!i"(i1 %cmp) to label %A [label %D] + +D: + callbr void asm "", ""() to label %return [] + +return: + %phi = phi i32 [ %mytmp41, %D ], [ %mytmp43, %A ], [%mytmp42, %entry] + ret i32 %phi +} + ; Loop consists of A, B and C: ; - A is the header ; - A and C are exiting blocks @@ -236,3 +409,66 @@ return: %phi = phi i32 [ %mytmp41, %D ], [ %mytmp42, %E ] ret i32 %phi } + +define i32 @phi-via-external-block_callbr(ptr %arg1, ptr %arg2) local_unnamed_addr align 2 { +; CHECK-LABEL: @phi-via-external-block_callbr( +; CHECK-NEXT: entry: +; CHECK-NEXT: 
[[MYTMP42:%.*]] = load i32, ptr [[ARG1:%.*]], align 4 +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[A:%.*]] [] +; CHECK: A: +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[MYTMP42]], 0 +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[CMP1]]) +; CHECK-NEXT: to label [[B:%.*]] [label %A.target.E] +; CHECK: B: +; CHECK-NEXT: [[MYTMP41:%.*]] = load i32, ptr [[ARG2:%.*]], align 4 +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[C:%.*]] [] +; CHECK: C: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[MYTMP42]], 0 +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[CMP]]) +; CHECK-NEXT: to label [[A]] [label %C.target.D] +; CHECK: D: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[RETURN:%.*]] [] +; CHECK: E: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label [[RETURN]] [] +; CHECK: return: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[MYTMP41_MOVED:%.*]], [[D:%.*]] ], [ [[MYTMP42]], [[E:%.*]] ] +; CHECK-NEXT: ret i32 [[PHI]] +; CHECK: A.target.E: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD:%.*]] +; CHECK: C.target.D: +; CHECK-NEXT: br label [[LOOP_EXIT_GUARD]] +; CHECK: loop.exit.guard: +; CHECK-NEXT: [[MYTMP41_MOVED]] = phi i32 [ poison, [[A_TARGET_E:%.*]] ], [ [[MYTMP41]], [[C_TARGET_D:%.*]] ] +; CHECK-NEXT: [[GUARD_E:%.*]] = phi i1 [ true, [[A_TARGET_E]] ], [ false, [[C_TARGET_D]] ] +; CHECK-NEXT: br i1 [[GUARD_E]], label [[E]], label [[D]] +; +entry: + %mytmp42 = load i32, ptr %arg1, align 4 + callbr void asm "", ""() to label %A [] + +A: + %cmp1 = icmp slt i32 %mytmp42, 0 + callbr void asm "", "r,!i"(i1 %cmp1) to label %B [label %E] + +B: + %mytmp41 = load i32, ptr %arg2, align 4 + callbr void asm "", ""() to label %C [] + +C: + %cmp = icmp slt i32 %mytmp42, 0 + callbr void asm "", "r,!i"(i1 %cmp) to label %A [label %D] + +D: + callbr void asm "", ""() to label %return [] + +E: + callbr void asm "", ""() to label %return [] + +return: + %phi = phi i32 [ %mytmp41, %D ], [ %mytmp42, %E ] + ret i32 %phi +} diff --git 
a/llvm/test/Transforms/UnifyLoopExits/undef-phis.ll b/llvm/test/Transforms/UnifyLoopExits/undef-phis.ll index 05f50fcc37d6e..e65e2549a21c8 100644 --- a/llvm/test/Transforms/UnifyLoopExits/undef-phis.ll +++ b/llvm/test/Transforms/UnifyLoopExits/undef-phis.ll @@ -56,3 +56,71 @@ mbb5291: ; preds = %mbb4321 store volatile [2 x i32] %i5293, ptr addrspace(5) null, align 4 ret void } + +define fastcc void @undef_phi_callbr(i64 %i5247, i1 %i4530, i1 %i4936.not) { +; CHECK-LABEL: define fastcc void @undef_phi_callbr( +; CHECK-SAME: i64 [[I5247:%.*]], i1 [[I4530:%.*]], i1 [[I4936_NOT:%.*]]) { +; CHECK-NEXT: [[MBB:.*:]] +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label %[[MBB3932:.*]] [] +; CHECK: [[MBB3932]]: +; CHECK-NEXT: callbr void asm "", ""() +; CHECK-NEXT: to label %[[MBB4454:.*]] [] +; CHECK: [[MBB4321:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[I5247]] to i32 +; CHECK-NEXT: [[I5290:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[I5290]]) +; CHECK-NEXT: to label %[[MBB3932]] [label %mbb4321.target.mbb5291] +; CHECK: [[MBB4454]]: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[I4530]]) +; CHECK-NEXT: to label %[[MBB4535:.*]] [label %mbb4454.target.mbb4454.target.mbb4531] +; CHECK: [[MBB4531:.*]]: +; CHECK-NEXT: ret void +; CHECK: [[MBB4535]]: +; CHECK-NEXT: callbr void asm "", "r,!i"(i1 [[I4936_NOT]]) +; CHECK-NEXT: to label %[[MBB4535_TARGET_MBB4321:.*]] [label %mbb4454] +; CHECK: [[MBB5291:.*]]: +; CHECK-NEXT: [[I5293:%.*]] = insertvalue [2 x i32] zeroinitializer, i32 [[DOTMOVED:%.*]], 1 +; CHECK-NEXT: store volatile [2 x i32] [[I5293]], ptr addrspace(5) null, align 4 +; CHECK-NEXT: ret void +; CHECK: [[MBB4454_TARGET_MBB4531:.*]]: +; CHECK-NEXT: br label %[[LOOP_EXIT_GUARD:.*]] +; CHECK: [[MBB4321_TARGET_MBB5291:.*]]: +; CHECK-NEXT: br label %[[LOOP_EXIT_GUARD]] +; CHECK: [[LOOP_EXIT_GUARD]]: +; CHECK-NEXT: [[DOTMOVED]] = phi i32 [ poison, %[[MBB4454_TARGET_MBB4531]] ], [ [[TMP0]], %[[MBB4321_TARGET_MBB5291]] ] +; 
CHECK-NEXT: [[GUARD_MBB4531:%.*]] = phi i1 [ true, %[[MBB4454_TARGET_MBB4531]] ], [ false, %[[MBB4321_TARGET_MBB5291]] ] +; CHECK-NEXT: br i1 [[GUARD_MBB4531]], label %[[MBB4531]], label %[[MBB5291]] +; CHECK: [[MBB4454_TARGET_MBB4454_TARGET_MBB4531:.*]]: +; CHECK-NEXT: br label %[[LOOP_EXIT_GUARD1:.*]] +; CHECK: [[MBB4535_TARGET_MBB4321]]: +; CHECK-NEXT: br label %[[LOOP_EXIT_GUARD1]] +; CHECK: [[LOOP_EXIT_GUARD1]]: +; CHECK-NEXT: [[GUARD_MBB4454_TARGET_MBB4531:%.*]] = phi i1 [ true, %[[MBB4454_TARGET_MBB4454_TARGET_MBB4531]] ], [ false, %[[MBB4535_TARGET_MBB4321]] ] +; CHECK-NEXT: br i1 [[GUARD_MBB4454_TARGET_MBB4531]], label %[[MBB4454_TARGET_MBB4531]], label %[[MBB4321]] +; +mbb: + callbr void asm "", ""() to label %mbb3932 [] + +mbb3932: ; preds = %mbb4321, %mbb + callbr void asm "", ""() to label %mbb4454 [] + +mbb4321: ; preds = %mbb4535 + %0 = trunc i64 %i5247 to i32 + %i5290 = icmp eq i32 %0, 0 + callbr void asm "", "r,!i"(i1 %i5290) to label %mbb3932 [label %mbb5291] + +mbb4454: ; preds = %mbb4535, %mbb3932 + callbr void asm "", "r,!i"(i1 %i4530) to label %mbb4535 [label %mbb4531] + +mbb4531: ; preds = %mbb4454 + ret void + +mbb4535: ; preds = %mbb4454 + callbr void asm "", "r,!i"(i1 %i4936.not) to label %mbb4321 [label %mbb4454] + +mbb5291: ; preds = %mbb4321 + %i5293 = insertvalue [2 x i32] zeroinitializer, i32 %0, 1 + store volatile [2 x i32] %i5293, ptr addrspace(5) null, align 4 + ret void +} From c94a1f2159755348568c40941f5c38c4dcaecec2 Mon Sep 17 00:00:00 2001 From: Sean Perry Date: Thu, 30 Oct 2025 09:34:14 -0400 Subject: [PATCH 188/539] bunch of small changes to fix a number of LIT tests on z/OS (#165567) A collection of small changes to get a number of lit tests working on z/OS. 
--- clang/lib/Driver/ToolChains/ZOS.cpp | 4 ++-- clang/test/CodeGenCXX/ubsan-coroutines.cpp | 1 + clang/test/Driver/fat-archive-unbundle-ext.c | 2 +- clang/test/Headers/cuda_with_openmp.cu | 2 +- llvm/test/lit.cfg.py | 15 +++++++++++---- 5 files changed, 16 insertions(+), 8 deletions(-) diff --git a/clang/lib/Driver/ToolChains/ZOS.cpp b/clang/lib/Driver/ToolChains/ZOS.cpp index 57bcb3c306cef..9a3c45323a3cf 100644 --- a/clang/lib/Driver/ToolChains/ZOS.cpp +++ b/clang/lib/Driver/ToolChains/ZOS.cpp @@ -75,7 +75,7 @@ void zos::Assembler::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } static std::string getLEHLQ(const ArgList &Args) { @@ -213,7 +213,7 @@ void zos::Linker::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } ToolChain::RuntimeLibType ZOS::GetDefaultRuntimeLibType() const { diff --git a/clang/test/CodeGenCXX/ubsan-coroutines.cpp b/clang/test/CodeGenCXX/ubsan-coroutines.cpp index 04ab0505f1401..60c89a47f9046 100644 --- a/clang/test/CodeGenCXX/ubsan-coroutines.cpp +++ b/clang/test/CodeGenCXX/ubsan-coroutines.cpp @@ -1,6 +1,7 @@ // This test merely verifies that emitting the object file does not cause a // crash when the LLVM coroutines passes are run. 
// RUN: %clang_cc1 -emit-obj -std=c++2a -fsanitize=null %s -o %t.o +// UNSUPPORTED: target={{.*}}-zos{{.*}} namespace std { template struct coroutine_traits { diff --git a/clang/test/Driver/fat-archive-unbundle-ext.c b/clang/test/Driver/fat-archive-unbundle-ext.c index e797acccf02b4..d658ad05b345c 100644 --- a/clang/test/Driver/fat-archive-unbundle-ext.c +++ b/clang/test/Driver/fat-archive-unbundle-ext.c @@ -1,5 +1,5 @@ // REQUIRES: x86-registered-target -// UNSUPPORTED: target={{.*-windows.*}}, target={{.*}}-macosx{{.*}}, target={{.*-darwin.*}}, target={{.*}}-aix{{.*}} +// UNSUPPORTED: target={{.*-windows.*}}, target={{.*}}-macosx{{.*}}, target={{.*-darwin.*}}, target={{.*}}-aix{{.*}}, target={{.*}}-zos{{.*}} // Generate dummy fat object // RUN: %clang -O0 --target=%itanium_abi_triple %s -c -o %t.host.o diff --git a/clang/test/Headers/cuda_with_openmp.cu b/clang/test/Headers/cuda_with_openmp.cu index efde4ecdc6626..8ea0de5972ff2 100644 --- a/clang/test/Headers/cuda_with_openmp.cu +++ b/clang/test/Headers/cuda_with_openmp.cu @@ -2,7 +2,7 @@ // Reported in https://bugs.llvm.org/show_bug.cgi?id=48014 ///==========================================================================/// -// REQUIRES: nvptx-registered-target +// REQUIRES: nvptx-registered-target, host-supports-cuda // RUN: %clang -x cuda -fopenmp -c %s -o - --cuda-path=%S/../Driver/Inputs/CUDA/usr/local/cuda -nocudalib -isystem %S/Inputs/include -isystem %S/../../lib/Headers -fsyntax-only diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 781240aac94b6..11a5a5785a6ec 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -753,10 +753,17 @@ def host_unwind_supports_jit(): config.available_features.add("unix-sockets") # .debug_frame is not emitted for targeting Windows x64, aarch64/arm64, AIX, or Apple Silicon Mac. 
-if not re.match( - r"^(x86_64|aarch64|arm64|powerpc|powerpc64).*-(windows-cygnus|windows-gnu|windows-msvc|aix)", - config.target_triple, -) and not re.match(r"^arm64(e)?-apple-(macos|darwin)", config.target_triple): +if ( + not re.match( + r"^(x86_64|aarch64|arm64|powerpc|powerpc64).*-(windows-cygnus|windows-gnu|windows-msvc|aix)", + config.target_triple, + ) + and not re.match( + r"^arm64(e)?-apple-(macos|darwin)", + config.target_triple, + ) + and not re.match(r".*-zos.*", config.target_triple) +): config.available_features.add("debug_frame") if config.enable_backtrace: From 5f4589c26fbfcfbd64eb06dbd8ddac3bb678ef5b Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 30 Oct 2025 13:35:21 +0000 Subject: [PATCH 189/539] Revert "[LLDB][Windows]: Don't pass duplicate HANDLEs to CreateProcess" (#165717) Reverts llvm/llvm-project#165281 Because our Windows on Arm buildbot is red all over: https://lab.llvm.org/buildbot/#/builders/141/builds/12624 --- .../Host/windows/ProcessLauncherWindows.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/lldb/source/Host/windows/ProcessLauncherWindows.cpp b/lldb/source/Host/windows/ProcessLauncherWindows.cpp index e1b4b7e48c5a6..f5adadaf061bf 100644 --- a/lldb/source/Host/windows/ProcessLauncherWindows.cpp +++ b/lldb/source/Host/windows/ProcessLauncherWindows.cpp @@ -16,7 +16,6 @@ #include "llvm/Support/Program.h" #include -#include #include using namespace lldb; @@ -92,13 +91,13 @@ ProcessLauncherWindows::LaunchProcess(const ProcessLaunchInfo &launch_info, startupinfo.hStdOutput = stdout_handle ? 
stdout_handle : ::GetStdHandle(STD_OUTPUT_HANDLE); - std::unordered_set inherited_handles; + std::vector inherited_handles; if (startupinfo.hStdError) - inherited_handles.insert(startupinfo.hStdError); + inherited_handles.push_back(startupinfo.hStdError); if (startupinfo.hStdInput) - inherited_handles.insert(startupinfo.hStdInput); + inherited_handles.push_back(startupinfo.hStdInput); if (startupinfo.hStdOutput) - inherited_handles.insert(startupinfo.hStdOutput); + inherited_handles.push_back(startupinfo.hStdOutput); SIZE_T attributelist_size = 0; InitializeProcThreadAttributeList(/*lpAttributeList=*/nullptr, @@ -121,15 +120,13 @@ ProcessLauncherWindows::LaunchProcess(const ProcessLaunchInfo &launch_info, const FileAction *act = launch_info.GetFileActionAtIndex(i); if (act->GetAction() == FileAction::eFileActionDuplicate && act->GetFD() == act->GetActionArgument()) - inherited_handles.insert(reinterpret_cast(act->GetFD())); + inherited_handles.push_back(reinterpret_cast(act->GetFD())); } if (!inherited_handles.empty()) { - std::vector handles(inherited_handles.begin(), - inherited_handles.end()); if (!UpdateProcThreadAttribute( startupinfoex.lpAttributeList, /*dwFlags=*/0, - PROC_THREAD_ATTRIBUTE_HANDLE_LIST, handles.data(), - handles.size() * sizeof(HANDLE), + PROC_THREAD_ATTRIBUTE_HANDLE_LIST, inherited_handles.data(), + inherited_handles.size() * sizeof(HANDLE), /*lpPreviousValue=*/nullptr, /*lpReturnSize=*/nullptr)) { error = Status(::GetLastError(), eErrorTypeWin32); return HostProcess(); From 069ff69559c9ef5c9d53408771169f630b219566 Mon Sep 17 00:00:00 2001 From: SKill Date: Thu, 30 Oct 2025 14:39:15 +0100 Subject: [PATCH 190/539] [clang] Use File Location for debug info resolution. (#163982) To improve debuggability, the macro arguments should be resolved to their original location rather than macro expansion location. 
[PR in cation](https://github.com/user-attachments/assets/994fb89f-83be-4c21-a79c-f8e51d818f7b) fixes #160667 --- clang/lib/CodeGen/CGDebugInfo.cpp | 17 ++++++----- clang/test/DebugInfo/Generic/macro-info.c | 35 +++++++++++++++++++++++ 2 files changed, 45 insertions(+), 7 deletions(-) create mode 100644 clang/test/DebugInfo/Generic/macro-info.c diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index fd2f6dcf182b5..ca579c915f49d 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -345,7 +345,7 @@ void CGDebugInfo::setLocation(SourceLocation Loc) { if (Loc.isInvalid()) return; - CurLoc = CGM.getContext().getSourceManager().getExpansionLoc(Loc); + CurLoc = CGM.getContext().getSourceManager().getFileLoc(Loc); // If we've changed files in the middle of a lexical scope go ahead // and create a new lexical scope with file node if it's different @@ -572,7 +572,7 @@ llvm::DIFile *CGDebugInfo::getOrCreateFile(SourceLocation Loc) { FileName = TheCU->getFile()->getFilename(); CSInfo = TheCU->getFile()->getChecksum(); } else { - PresumedLoc PLoc = SM.getPresumedLoc(Loc); + PresumedLoc PLoc = SM.getPresumedLoc(SM.getFileLoc(Loc)); FileName = PLoc.getFilename(); if (FileName.empty()) { @@ -599,7 +599,8 @@ llvm::DIFile *CGDebugInfo::getOrCreateFile(SourceLocation Loc) { if (CSKind) CSInfo.emplace(*CSKind, Checksum); } - return createFile(FileName, CSInfo, getSource(SM, SM.getFileID(Loc))); + return createFile(FileName, CSInfo, + getSource(SM, SM.getFileID(SM.getFileLoc(Loc)))); } llvm::DIFile *CGDebugInfo::createFile( @@ -654,7 +655,7 @@ unsigned CGDebugInfo::getLineNumber(SourceLocation Loc) { if (Loc.isInvalid()) return 0; SourceManager &SM = CGM.getContext().getSourceManager(); - return SM.getPresumedLoc(Loc).getLine(); + return SM.getPresumedLoc(SM.getFileLoc(Loc)).getLine(); } unsigned CGDebugInfo::getColumnNumber(SourceLocation Loc, bool Force) { @@ -666,7 +667,8 @@ unsigned 
CGDebugInfo::getColumnNumber(SourceLocation Loc, bool Force) { if (Loc.isInvalid() && CurLoc.isInvalid()) return 0; SourceManager &SM = CGM.getContext().getSourceManager(); - PresumedLoc PLoc = SM.getPresumedLoc(Loc.isValid() ? Loc : CurLoc); + PresumedLoc PLoc = + SM.getPresumedLoc(Loc.isValid() ? SM.getFileLoc(Loc) : CurLoc); return PLoc.isValid() ? PLoc.getColumn() : 0; } @@ -5002,7 +5004,7 @@ void CGDebugInfo::EmitLocation(CGBuilderTy &Builder, SourceLocation Loc) { // Update our current location setLocation(Loc); - if (CurLoc.isInvalid() || CurLoc.isMacroID() || LexicalBlockStack.empty()) + if (CurLoc.isInvalid() || LexicalBlockStack.empty()) return; llvm::MDNode *Scope = LexicalBlockStack.back(); @@ -6278,7 +6280,8 @@ void CGDebugInfo::EmitGlobalAlias(const llvm::GlobalValue *GV, void CGDebugInfo::AddStringLiteralDebugInfo(llvm::GlobalVariable *GV, const StringLiteral *S) { SourceLocation Loc = S->getStrTokenLoc(0); - PresumedLoc PLoc = CGM.getContext().getSourceManager().getPresumedLoc(Loc); + SourceManager &SM = CGM.getContext().getSourceManager(); + PresumedLoc PLoc = SM.getPresumedLoc(SM.getFileLoc(Loc)); if (!PLoc.isValid()) return; diff --git a/clang/test/DebugInfo/Generic/macro-info.c b/clang/test/DebugInfo/Generic/macro-info.c new file mode 100644 index 0000000000000..ec49eb5d65f9c --- /dev/null +++ b/clang/test/DebugInfo/Generic/macro-info.c @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 %s -debug-info-kind=standalone -emit-llvm -o - | FileCheck %s + +#define GLOBAL(num) global## num +#define DECL_GLOBAL(x) int x +#define SAME_ORDER(x, y) x; y +#define SWAP_ORDER(x,y) y; x + + + +SAME_ORDER( + int +// CHECK: DIGlobalVariable(name: "global",{{.*}} line: [[@LINE+1]] + GLOBAL // <- global + () = 42, + const char* s() { +// CHECK: DIGlobalVariable({{.*}}line: [[@LINE+1]],{{.*}} type: [[TYPEID:![0-9]+]] + return "1234567890"; + } +) + +SWAP_ORDER( + int GLOBAL( // <- global2 + 2) = 43, +// CHECK: DIGlobalVariable(name: "global3",{{.*}} line: [[@LINE+3]] +// CHECK: 
DIGlobalVariable(name: "global2",{{.*}} line: [[@LINE-3]] + DECL_GLOBAL( + GLOBAL( // <- global3 + 3)) = 44 +); + + +DECL_GLOBAL( +// CHECK: DIGlobalVariable(name: "global4",{{.*}} line: [[@LINE+1]] + GLOBAL( // <- global4 + 4)); From db530ac810a404a6f3e1aee788feec737eabd7c4 Mon Sep 17 00:00:00 2001 From: Hsiangkai Wang Date: Thu, 30 Oct 2025 13:43:49 +0000 Subject: [PATCH 191/539] [mlir][gpu] Loose the condition to convert scf.parallel to gpu.launch (#164978) Use LocalAliasAnalysis to improve handling of side effects in nested scf.parallel. If the written memory outside nested scf.parallel is not alias to the memory accessed inside the nested loop, we can convert it to gpu.launch. --- mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp | 64 +++++++++++++++++-- .../Conversion/SCFToGPU/parallel_loop.mlir | 32 ++++++++++ 2 files changed, 90 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp index 7d0a236b6f69a..76a822b05a652 100644 --- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp +++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp @@ -14,6 +14,7 @@ #include "mlir/Conversion/SCFToGPU/SCFToGPU.h" +#include "mlir/Analysis/AliasAnalysis/LocalAliasAnalysis.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" @@ -27,6 +28,7 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/RegionUtils.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/Support/DebugLog.h" #include @@ -625,18 +627,49 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, bool seenSideeffects = false; // Whether we have left a nesting scope (and hence are no longer innermost). 
bool leftNestingScope = false; + LocalAliasAnalysis aliasAnalysis; + llvm::DenseSet writtenBuffer; while (!worklist.empty()) { Operation *op = worklist.pop_back_val(); // Now walk over the body and clone it. // TODO: This is only correct if there either is no further scf.parallel - // nested or this code is side-effect free. Otherwise we might need - // predication. We are overly conservative for now and only allow - // side-effects in the innermost scope. + // nested or this code has side-effect but the memory buffer is not + // alias to inner loop access buffer. Otherwise we might need + // predication. if (auto nestedParallel = dyn_cast(op)) { // Before entering a nested scope, make sure there have been no - // sideeffects until now. - if (seenSideeffects) - return failure(); + // sideeffects until now or the nested operations do not access the + // buffer written by outer scope. + if (seenSideeffects) { + WalkResult walkRes = nestedParallel.walk([&](Operation *nestedOp) { + if (isMemoryEffectFree(nestedOp)) + return WalkResult::advance(); + + auto memEffectInterface = dyn_cast(nestedOp); + if (!memEffectInterface) + return WalkResult::advance(); + + SmallVector effects; + memEffectInterface.getEffects(effects); + for (const MemoryEffects::EffectInstance &effect : effects) { + if (isa(effect.getEffect()) || + isa(effect.getEffect())) { + Value baseBuffer = effect.getValue(); + if (!baseBuffer) + return WalkResult::interrupt(); + for (Value val : writtenBuffer) { + if (aliasAnalysis.alias(baseBuffer, val) != + AliasResult::NoAlias) { + return WalkResult::interrupt(); + } + } + } + } + return WalkResult::advance(); + }); + if (walkRes.wasInterrupted()) + return failure(); + } // A nested scf.parallel needs insertion of code to compute indices. // Insert that now. This will also update the worklist with the loops // body. 
@@ -650,6 +683,7 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, rewriter.setInsertionPointAfter(parent); leftNestingScope = true; seenSideeffects = false; + writtenBuffer.clear(); } else if (auto reduceOp = dyn_cast(op)) { // Convert scf.reduction op auto parentLoop = op->getParentOfType(); @@ -682,6 +716,24 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, Operation *clone = rewriter.clone(*op, cloningMap); cloningMap.map(op->getResults(), clone->getResults()); // Check for side effects. + if (!isMemoryEffectFree(clone)) { + // Record the buffer accessed by the operations with write effects. + if (auto memEffectInterface = + dyn_cast(clone)) { + SmallVector effects; + memEffectInterface.getEffects(effects); + for (const MemoryEffects::EffectInstance &effect : effects) { + if (isa(effect.getEffect())) { + Value writtenBase = effect.getValue(); + // Conservatively return failure if we cannot find the written + // address. + if (!writtenBase) + return failure(); + writtenBuffer.insert(writtenBase); + } + } + } + } // TODO: Handle region side effects properly. 
seenSideeffects |= !isMemoryEffectFree(clone) || clone->getNumRegions() != 0; diff --git a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir index 1dbce05be85b4..26f5a3e1f0ac0 100644 --- a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir +++ b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir @@ -641,3 +641,35 @@ func.func @parallel_reduction_1d_outside() { // CHECK: scf.parallel // CHECK-NEXT: scf.parallel // CHECK: scf.reduce + +// ----- + +// CHECK-LABEL: @nested_parallel_with_side_effect +func.func @nested_parallel_with_side_effect() { + %c65536 = arith.constant 65536 : index + %c2 = arith.constant 2 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %alloc_0 = memref.alloc() : memref<2x256x256xf32> + %alloc_1 = memref.alloc() : memref<2x4x256x256xf32> + %alloc_2 = memref.alloc() : memref<4x4xf32> + %alloc_3 = memref.alloc() : memref<4x4xf32> + scf.parallel (%arg2, %arg3, %arg4) = (%c0, %c0, %c0) to (%c2, %c4, %c65536) step (%c1, %c1, %c1) { + %1 = arith.remsi %arg4, %c256 : index + %2 = arith.divsi %arg4, %c256 : index + %4 = memref.load %alloc_0[%arg2, %2, %1] : memref<2x256x256xf32> + memref.store %4, %alloc_1[%arg2, %arg3, %2, %1] : memref<2x4x256x256xf32> + scf.parallel (%arg5) = (%c0) to (%c4) step (%c1) { + %5 = memref.load %alloc_2[%arg5, %c0] : memref<4x4xf32> + memref.store %5, %alloc_3[%arg5, %c0] : memref<4x4xf32> + scf.reduce + } {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} + scf.reduce + } {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} + return +} + +// CHECK: gpu.launch +// CHECK-NOT: scf.parallel From 8fe0132fc27e29063e83fb0983ef05ad196a2332 Mon Sep 17 00:00:00 2001 From: John Brawn Date: Thu, 30 Oct 2025 13:53:15 +0000 Subject: [PATCH 192/539] [LSR] Don't count conditional 
loads/store as enabling pre/post-index (#159573) When a load/store is conditionally executed in a loop it isn't a candidate for pre/post-index addressing, as the increment of the address would only happen on those loop iterations where the load/store is executed. Detect this and only discount the AddRec cost when the load/store is unconditional. --- .../Transforms/Scalar/LoopStrengthReduce.cpp | 38 ++++- .../Thumb2/LowOverheadLoops/minloop.ll | 70 +++++---- .../LoopStrengthReduce/AArch64/prefer-all.ll | 144 +++++++++++++++++- 3 files changed, 209 insertions(+), 43 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 1a279b6198182..001215abcfb26 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1318,6 +1318,11 @@ class LSRUse { /// the loop, in which case some special-case heuristics may be used. bool AllFixupsOutsideLoop = true; + /// This records whether all of the fixups using this LSRUse are unconditional + /// within the loop, meaning they will be executed on every path to the loop + /// latch. This includes fixups before early exits. + bool AllFixupsUnconditional = true; + /// RigidFormula is set to true to guarantee that this use will be associated /// with a single formula--the one that initially matched. Some SCEV /// expressions cannot be expanded. This allows LSR to consider the registers @@ -1421,16 +1426,22 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg, if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) || TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) { const SCEV *Start; - const SCEVConstant *Step; - if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step)))) + const APInt *Step; + if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) { // If the step size matches the base offset, we could use pre-indexed // addressing. 
- if (((AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() && - Step->getAPInt() == F.BaseOffset.getFixedValue()) || - ((AMK & TTI::AMK_PostIndexed) && !isa(Start) && - SE->isLoopInvariant(Start, L))) + bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) && + F.BaseOffset.isFixed() && + *Step == F.BaseOffset.getFixedValue(); + bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) && + !isa(Start) && + SE->isLoopInvariant(Start, L); + // We can only pre or post index when the load/store is unconditional. + if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional) LoopCost = 0; + } } + // If the loop counts down to zero and we'll be using a hardware loop then // the addrec will be combined into the hardware loop instruction. if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() && @@ -1783,6 +1794,9 @@ void LSRUse::print(raw_ostream &OS) const { if (AllFixupsOutsideLoop) OS << ", all-fixups-outside-loop"; + if (AllFixupsUnconditional) + OS << ", all-fixups-unconditional"; + if (WidestFixupType) OS << ", widest fixup type: " << *WidestFixupType; } @@ -2213,6 +2227,7 @@ class LSRInstance { void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); void CountRegisters(const Formula &F, size_t LUIdx); bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F); + bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const; void CollectLoopInvariantFixupsAndFormulae(); @@ -3607,6 +3622,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.PostIncLoops = TmpPostIncLoops; LF.Offset = Offset; LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); + LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF); // Create SCEV as Formula for calculating baseline cost if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) { @@ -3680,6 +3696,14 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { return true; } +/// Test whether this fixup will be executed each time the corresponding IV +/// increment 
instruction is executed. +bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const { + // If the fixup block dominates the IV increment block then there is no path + // through the loop to the increment that doesn't pass through the fixup. + return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent()); +} + /// Check for other uses of loop-invariant values which we're tracking. These /// other uses will pin these values in registers, making them less profitable /// for elimination. @@ -3803,6 +3827,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { LF.OperandValToReplace = U; LF.Offset = Offset; LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); + LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF); if (!LU.WidestFixupType || SE.getTypeSizeInBits(LU.WidestFixupType) < SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) @@ -4940,6 +4965,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n'); LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; + LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional; // Transfer the fixups of LU to LUThatHas. 
for (LSRFixup &Fixup : LU.Fixups) { diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll index 9c36bae6fac13..ec257bcf123f3 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll @@ -6,77 +6,81 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: ldr.w r12, [r0] ; CHECK-NEXT: subs.w r9, r1, #1 ; CHECK-NEXT: beq .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: and r8, r9, #3 +; CHECK-NEXT: and r6, r9, #3 ; CHECK-NEXT: subs r7, r1, #2 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB0_4 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: b .LBB0_6 +; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: cbnz r6, .LBB0_7 +; CHECK-NEXT: b .LBB0_10 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: mov.w r10, #0 ; CHECK-NEXT: b .LBB0_10 ; CHECK-NEXT: .LBB0_4: @ %while.body.preheader.new ; CHECK-NEXT: bic r7, r9, #3 -; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: str r6, [sp] @ 4-byte Spill ; CHECK-NEXT: subs r7, #4 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r10, #0 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: movs r7, #4 ; CHECK-NEXT: .LBB0_5: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r10, [r0, #16]! -; CHECK-NEXT: sub.w r9, r9, #4 -; CHECK-NEXT: ldrd r5, r4, [r0, #-12] -; CHECK-NEXT: ldr r11, [r0, #-4] +; CHECK-NEXT: ldr r11, [r0, #16]! 
+; CHECK-NEXT: ldrd r5, r7, [r0, #-12] +; CHECK-NEXT: ldr r4, [r0, #-4] ; CHECK-NEXT: cmp r12, r5 -; CHECK-NEXT: it gt -; CHECK-NEXT: subgt r6, r7, #3 ; CHECK-NEXT: csel r5, r5, r12, gt -; CHECK-NEXT: cmp r5, r4 +; CHECK-NEXT: csinc r6, r10, r8, le +; CHECK-NEXT: cmp r5, r7 ; CHECK-NEXT: it gt -; CHECK-NEXT: subgt r6, r7, #2 -; CHECK-NEXT: csel r5, r4, r5, gt -; CHECK-NEXT: cmp r5, r11 +; CHECK-NEXT: addgt.w r6, r8, #2 +; CHECK-NEXT: csel r7, r7, r5, gt +; CHECK-NEXT: cmp r7, r4 ; CHECK-NEXT: it gt -; CHECK-NEXT: subgt r6, r7, #1 -; CHECK-NEXT: csel r5, r11, r5, gt -; CHECK-NEXT: cmp r5, r10 -; CHECK-NEXT: csel r6, r7, r6, gt -; CHECK-NEXT: add.w r7, r7, #4 -; CHECK-NEXT: csel r12, r10, r5, gt +; CHECK-NEXT: addgt.w r6, r8, #3 +; CHECK-NEXT: csel r7, r4, r7, gt +; CHECK-NEXT: add.w r8, r8, #4 +; CHECK-NEXT: cmp r7, r11 +; CHECK-NEXT: csel r10, r8, r6, gt +; CHECK-NEXT: csel r12, r11, r7, gt ; CHECK-NEXT: le lr, .LBB0_5 -; CHECK-NEXT: .LBB0_6: @ %while.end.loopexit.unr-lcssa -; CHECK-NEXT: cmp.w r8, #0 -; CHECK-NEXT: beq .LBB0_10 -; CHECK-NEXT: @ %bb.7: @ %while.body.epil +; CHECK-NEXT: @ %bb.6: @ %while.end.loopexit.unr-lcssa.loopexit +; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload +; CHECK-NEXT: sub.w r9, r9, r8 +; CHECK-NEXT: cbz r6, .LBB0_10 +; CHECK-NEXT: .LBB0_7: @ %while.body.epil ; CHECK-NEXT: ldr r7, [r0, #4] ; CHECK-NEXT: sub.w r1, r1, r9 ; CHECK-NEXT: cmp r12, r7 -; CHECK-NEXT: csel r6, r1, r6, gt +; CHECK-NEXT: csel r10, r1, r10, gt ; CHECK-NEXT: csel r12, r7, r12, gt -; CHECK-NEXT: cmp.w r8, #1 +; CHECK-NEXT: cmp r6, #1 ; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: @ %bb.8: @ %while.body.epil.1 ; CHECK-NEXT: ldr r7, [r0, #8] ; CHECK-NEXT: cmp r12, r7 -; CHECK-NEXT: csinc r6, r6, r1, le +; CHECK-NEXT: csinc r10, r10, r1, le ; CHECK-NEXT: csel r12, r7, r12, gt -; CHECK-NEXT: cmp.w r8, #2 +; CHECK-NEXT: cmp r6, #2 ; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: @ %bb.9: @ %while.body.epil.2 ; CHECK-NEXT: ldr r0, [r0, #12] ; CHECK-NEXT: cmp r12, r0 ; CHECK-NEXT: it gt -; 
CHECK-NEXT: addgt r6, r1, #2 +; CHECK-NEXT: addgt.w r10, r1, #2 ; CHECK-NEXT: csel r12, r0, r12, gt ; CHECK-NEXT: .LBB0_10: @ %while.end ; CHECK-NEXT: str.w r12, [r2] -; CHECK-NEXT: str r6, [r3] +; CHECK-NEXT: str.w r10, [r3] +; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %0 = load i32, ptr %pSrc, align 4 diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll index db30fd23b0c9d..1944a9c800355 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll @@ -119,8 +119,6 @@ for.end: ; We can't use postindex addressing on the conditional load of qval and can't ; convert the loop condition to a compare with zero, so we should instead use ; offset addressing. -; FIXME: Currently we don't notice the load of qval is conditional, and attempt -; postindex addressing anyway. define i32 @conditional_load(ptr %p, ptr %q, ptr %n) { ; CHECK-LABEL: define i32 @conditional_load( ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[N:%.*]]) { @@ -128,7 +126,6 @@ define i32 @conditional_load(ptr %p, ptr %q, ptr %n) { ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ] -; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ] ; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[RET_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: [[PVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4 @@ -136,6 +133,8 @@ define i32 @conditional_load(ptr %p, ptr %q, ptr %n) { ; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[FOR_INC]], label %[[IF_THEN:.*]] ; CHECK: [[IF_THEN]]: +; CHECK-NEXT: 
[[TMP0:%.*]] = shl i64 [[IDX]], 2 +; CHECK-NEXT: [[LSR_IV:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP0]] ; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[LSR_IV]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[RET]], [[QVAL]] ; CHECK-NEXT: br label %[[FOR_INC]] @@ -143,7 +142,6 @@ define i32 @conditional_load(ptr %p, ptr %q, ptr %n) { ; CHECK-NEXT: [[RET_NEXT]] = phi i32 [ [[ADD]], %[[IF_THEN]] ], [ [[RET]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1 ; CHECK-NEXT: [[NVAL:%.*]] = load volatile i64, ptr [[N]], align 8 -; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[IDX_NEXT]], [[NVAL]] ; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]] ; CHECK: [[EXIT]]: @@ -176,3 +174,141 @@ for.inc: exit: ret i32 %ret.next } + +; We can use postindex addressing for both loads here, even though the second +; may not be executed on every loop iteration. +define i32 @early_exit_load(ptr %p, ptr %q, ptr %n) { +; CHECK-LABEL: define i32 @early_exit_load( +; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ] +; CHECK-NEXT: [[RET_PHI:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[PVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[PVAL]], 0 +; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4 +; CHECK-NEXT: br i1 [[CMP1]], label %[[FOR_INC]], label %[[EXIT:.*]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[LSR_IV]], align 4 +; CHECK-NEXT: [[ADD]] = add nsw i32 [[QVAL]], [[RET_PHI]] 
+; CHECK-NEXT: [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1 +; CHECK-NEXT: [[NVAL:%.*]] = load volatile i64, ptr [[N]], align 8 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 4 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i64 [[IDX_NEXT]], [[NVAL]] +; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[RET_PHI]], %[[FOR_BODY]] ], [ [[ADD]], %[[FOR_INC]] ] +; CHECK-NEXT: ret i32 [[RET]] +; +entry: + br label %for.body + +for.body: + %ret.phi = phi i32 [ %add, %for.inc ], [ 0, %entry ] + %idx = phi i64 [ %idx.next, %for.inc ], [ 0, %entry ] + %paddr = getelementptr inbounds nuw i32, ptr %p, i64 %idx + %pval = load i32, ptr %paddr, align 4 + %cmp1 = icmp eq i32 %pval, 0 + br i1 %cmp1, label %for.inc, label %exit + +for.inc: + %qaddr = getelementptr inbounds nuw i32, ptr %q, i64 %idx + %qval = load i32, ptr %qaddr, align 4 + %add = add nsw i32 %qval, %ret.phi + %idx.next = add nuw nsw i64 %idx, 1 + %nval = load volatile i64, ptr %n, align 8 + %cmp2 = icmp slt i64 %idx.next, %nval + br i1 %cmp2, label %for.body, label %exit + +exit: + %ret = phi i32 [ %ret.phi, %for.body ], [ %add, %for.inc ] + ret i32 %ret +} + +; The control-flow before and after the load of qval shouldn't prevent postindex +; addressing from happening. +; FIXME: We choose postindex addressing, but the scevgep is placed in for.inc so +; during codegen we will fail to actually generate a postindex load. 
+define void @middle_block_load(ptr %p, ptr %q, i64 %n) { +; CHECK-LABEL: define void @middle_block_load( +; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[LSR_IV2:%.*]] = phi ptr [ [[SCEVGEP3:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ] +; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[FOR_INC]] ], [ [[N]], %[[ENTRY]] ] +; CHECK-NEXT: [[PVAL:%.*]] = load i32, ptr [[LSR_IV2]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[PVAL]], 0 +; CHECK-NEXT: [[SCEVGEP3]] = getelementptr i8, ptr [[LSR_IV2]], i64 4 +; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN1:.*]], label %[[IF_ELSE1:.*]] +; CHECK: [[IF_THEN1]]: +; CHECK-NEXT: tail call void @otherfn1() +; CHECK-NEXT: br label %[[IF_END:.*]] +; CHECK: [[IF_ELSE1]]: +; CHECK-NEXT: tail call void @otherfn2() +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4 +; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[QVAL]], 0 +; CHECK-NEXT: br i1 [[CMP2]], label %[[IF_THEN2:.*]], label %[[IF_ELSE2:.*]] +; CHECK: [[IF_THEN2]]: +; CHECK-NEXT: tail call void @otherfn1() +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[IF_ELSE2]]: +; CHECK-NEXT: tail call void @otherfn2() +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[CMP3]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %idx = phi i64 [ %idx.next, %for.inc ], [ 0, %entry ] + %paddr = getelementptr inbounds nuw i32, ptr %p, i64 %idx + %pval = load i32, ptr %paddr, align 4 + %cmp1 
= icmp sgt i32 %pval, 0 + br i1 %cmp1, label %if.then1, label %if.else1 + +if.then1: + tail call void @otherfn1() + br label %if.end + +if.else1: + tail call void @otherfn2() + br label %if.end + +if.end: + %qaddr = getelementptr inbounds nuw i32, ptr %q, i64 %idx + %qval = load i32, ptr %qaddr, align 4 + %cmp2 = icmp sgt i32 %qval, 0 + br i1 %cmp2, label %if.then2, label %if.else2 + +if.then2: + tail call void @otherfn1() + br label %for.inc + +if.else2: + tail call void @otherfn2() + br label %for.inc + +for.inc: + %idx.next = add nuw nsw i64 %idx, 1 + %cmp3 = icmp eq i64 %idx.next, %n + br i1 %cmp3, label %exit, label %for.body + +exit: + ret void +} + +declare dso_local void @otherfn1() +declare dso_local void @otherfn2() From 8aa512c2fe0f356a94e580c4295631ed8f212fd1 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 30 Oct 2025 07:10:52 -0700 Subject: [PATCH 193/539] [mlir] Remove unused "using" decls (NFC) (#165652) Identified with misc-unused-using-decls. --- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index ac7200294a3a6..110bfdce72ea4 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -41,10 +41,6 @@ using namespace mlir; using namespace mlir::tensor; -using llvm::divideCeilSigned; -using llvm::divideFloorSigned; -using llvm::mod; - /// Materialize a single constant operation from a given attribute value with /// the desired resultant type. Operation *TensorDialect::materializeConstant(OpBuilder &builder, From f084b46fa161699c9b0b7a48fd1ba111256826dd Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 30 Oct 2025 07:11:01 -0700 Subject: [PATCH 194/539] [Hexagon] Remove a redundant cast (NFC) (#165654) *getInstrInfo() is already of type const HexagonInstrInfo &. 
--- llvm/lib/Target/Hexagon/HexagonSubtarget.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index b9cdd6a2a3767..ce2de752f3b3a 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -544,7 +544,7 @@ int HexagonSubtarget::updateLatency(MachineInstr &SrcInst, if (!hasV60Ops()) return Latency; - auto &QII = static_cast(*getInstrInfo()); + const HexagonInstrInfo &QII = *getInstrInfo(); // BSB scheduling. if (QII.isHVXVec(SrcInst) || useBSBScheduling()) Latency = (Latency + 1) >> 1; From 5dcfc38aa8aa094f5f8740e323019ec3274dadb6 Mon Sep 17 00:00:00 2001 From: Vigneshwar Jayakumar Date: Thu, 30 Oct 2025 09:23:04 -0500 Subject: [PATCH 195/539] [LICM] Sink unused l-invariant loads in preheader. (#157559) Unused loop invariant loads were not sunk from the preheader to the exit block, increasing live range. This commit moves the sinkUnusedInvariant logic from indvarsimplify to LICM also adds functionality to sink unused load that's not clobbered by the loop body. 
--- llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 85 -------- llvm/lib/Transforms/Scalar/LICM.cpp | 87 +++++++- .../AMDGPU/schedule-amdgpu-trackers.ll | 6 +- .../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 2 - .../combine-sext-and-shl-after-isel.ll | 100 ++++------ .../AMDGPU/addrspace-7-doesnt-crash.ll | 2 +- .../IndVarSimplify/ARM/code-size.ll | 22 +-- .../ARM/indvar-unroll-imm-cost.ll | 4 +- .../X86/inner-loop-by-latch-cond.ll | 2 +- .../IndVarSimplify/exit-count-select.ll | 14 +- .../IndVarSimplify/finite-exit-comparisons.ll | 6 +- .../Transforms/IndVarSimplify/pr116483.ll | 8 +- .../test/Transforms/IndVarSimplify/pr24783.ll | 2 +- .../test/Transforms/IndVarSimplify/pr39673.ll | 2 +- .../test/Transforms/IndVarSimplify/pr63763.ll | 6 +- .../IndVarSimplify/replace-loop-exit-folds.ll | 21 +- .../rewrite-loop-exit-values-phi.ll | 8 +- .../scev-expander-preserve-lcssa.ll | 14 +- .../IndVarSimplify/scev-invalidation.ll | 4 +- .../Transforms/IndVarSimplify/sentinel.ll | 14 +- .../IndVarSimplify/sink-from-preheader.ll | 32 --- .../IndVarSimplify/sink-trapping.ll | 19 -- .../Transforms/IndVarSimplify/zext-nuw.ll | 2 +- llvm/test/Transforms/LICM/scalar-promote.ll | 6 +- .../{IndVarSimplify => LICM}/sink-alloca.ll | 6 +- .../Transforms/LICM/sink-from-preheader.ll | 185 ++++++++++++++++++ llvm/test/Transforms/LICM/sink-trapping.ll | 28 +++ .../invalidate-scev-after-hoisting.ll | 2 +- .../LoopDistribute/laa-invalidation.ll | 2 +- .../invariant-store-vectorization.ll | 2 +- .../AArch64/indvars-vectorization.ll | 2 +- .../PhaseOrdering/AArch64/interleave_vec.ll | 4 +- .../PhaseOrdering/AArch64/std-find.ll | 2 +- .../PhaseOrdering/ARM/arm_mult_q15.ll | 20 +- .../X86/pr48844-br-to-switch-vectorization.ll | 6 +- .../test/Transforms/PhaseOrdering/X86/vdiv.ll | 49 ++--- 36 files changed, 453 insertions(+), 323 deletions(-) delete mode 100644 llvm/test/Transforms/IndVarSimplify/sink-from-preheader.ll delete mode 100644 llvm/test/Transforms/IndVarSimplify/sink-trapping.ll rename 
llvm/test/Transforms/{IndVarSimplify => LICM}/sink-alloca.ll (89%) create mode 100644 llvm/test/Transforms/LICM/sink-from-preheader.ll create mode 100644 llvm/test/Transforms/LICM/sink-trapping.ll diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 7ebcc219efc15..4ba4ba3850e58 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -162,8 +162,6 @@ class IndVarSimplify { const SCEV *ExitCount, PHINode *IndVar, SCEVExpander &Rewriter); - bool sinkUnusedInvariants(Loop *L); - public: IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, const DataLayout &DL, TargetLibraryInfo *TLI, @@ -1079,85 +1077,6 @@ linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB, return true; } -//===----------------------------------------------------------------------===// -// sinkUnusedInvariants. A late subpass to cleanup loop preheaders. -//===----------------------------------------------------------------------===// - -/// If there's a single exit block, sink any loop-invariant values that -/// were defined in the preheader but not used inside the loop into the -/// exit block to reduce register pressure in the loop. -bool IndVarSimplify::sinkUnusedInvariants(Loop *L) { - BasicBlock *ExitBlock = L->getExitBlock(); - if (!ExitBlock) return false; - - BasicBlock *Preheader = L->getLoopPreheader(); - if (!Preheader) return false; - - bool MadeAnyChanges = false; - for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) { - - // Skip BB Terminator. - if (Preheader->getTerminator() == &I) - continue; - - // New instructions were inserted at the end of the preheader. - if (isa(I)) - break; - - // Don't move instructions which might have side effects, since the side - // effects need to complete before instructions inside the loop. Also don't - // move instructions which might read memory, since the loop may modify - // memory. 
Note that it's okay if the instruction might have undefined - // behavior: LoopSimplify guarantees that the preheader dominates the exit - // block. - if (I.mayHaveSideEffects() || I.mayReadFromMemory()) - continue; - - // Skip debug or pseudo instructions. - if (I.isDebugOrPseudoInst()) - continue; - - // Skip eh pad instructions. - if (I.isEHPad()) - continue; - - // Don't sink alloca: we never want to sink static alloca's out of the - // entry block, and correctly sinking dynamic alloca's requires - // checks for stacksave/stackrestore intrinsics. - // FIXME: Refactor this check somehow? - if (isa(&I)) - continue; - - // Determine if there is a use in or before the loop (direct or - // otherwise). - bool UsedInLoop = false; - for (Use &U : I.uses()) { - Instruction *User = cast(U.getUser()); - BasicBlock *UseBB = User->getParent(); - if (PHINode *P = dyn_cast(User)) { - unsigned i = - PHINode::getIncomingValueNumForOperand(U.getOperandNo()); - UseBB = P->getIncomingBlock(i); - } - if (UseBB == Preheader || L->contains(UseBB)) { - UsedInLoop = true; - break; - } - } - - // If there is, the def must remain in the preheader. - if (UsedInLoop) - continue; - - // Otherwise, sink it to the exit block. - I.moveBefore(ExitBlock->getFirstInsertionPt()); - SE->forgetValue(&I); - MadeAnyChanges = true; - } - - return MadeAnyChanges; -} - static void replaceExitCond(BranchInst *BI, Value *NewCond, SmallVectorImpl &DeadInsts) { auto *OldCond = BI->getCondition(); @@ -2065,10 +1984,6 @@ bool IndVarSimplify::run(Loop *L) { // The Rewriter may not be used from this point on. - // Loop-invariant instructions in the preheader that aren't used in the - // loop may be sunk below the loop to reduce register pressure. - Changed |= sinkUnusedInvariants(L); - // rewriteFirstIterationLoopExitValues does not rely on the computation of // trip count and therefore can further simplify exit values in addition to // rewriteLoopExitValues. 
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index b2c526b41502b..d13b9909660ec 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -211,9 +211,15 @@ static Instruction *cloneInstructionInExitBlock( static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU); -static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest, - ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater &MSSAU, ScalarEvolution *SE); +static void moveInstructionBefore( + Instruction &I, BasicBlock::iterator Dest, ICFLoopSafetyInfo &SafetyInfo, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, + MemorySSA::InsertionPlace Point = MemorySSA::BeforeTerminator); + +static bool sinkUnusedInvariantsFromPreheaderToExit( + Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT, + SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE); static void foreachMemoryAccess(MemorySSA *MSSA, Loop *L, function_ref Fn); @@ -471,6 +477,12 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE) : sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE); + + // sink pre-header defs that are unused in-loop into the unique exit to reduce + // pressure. 
+ Changed |= sinkUnusedInvariantsFromPreheaderToExit(L, AA, &SafetyInfo, MSSAU, + SE, DT, Flags, ORE); + Flags.setIsSink(false); if (Preheader) Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L, @@ -1456,19 +1468,80 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater &MSSAU, - ScalarEvolution *SE) { + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, + MemorySSA::InsertionPlace Point) { SafetyInfo.removeInstruction(&I); SafetyInfo.insertInstructionTo(&I, Dest->getParent()); I.moveBefore(*Dest->getParent(), Dest); if (MemoryUseOrDef *OldMemAcc = cast_or_null( MSSAU.getMemorySSA()->getMemoryAccess(&I))) - MSSAU.moveToPlace(OldMemAcc, Dest->getParent(), - MemorySSA::BeforeTerminator); + MSSAU.moveToPlace(OldMemAcc, Dest->getParent(), Point); if (SE) SE->forgetBlockAndLoopDispositions(&I); } +// If there's a single exit block, sink any loop-invariant values that were +// defined in the preheader but not used inside the loop into the exit block +// to reduce register pressure in the loop. +static bool sinkUnusedInvariantsFromPreheaderToExit( + Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT, + SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE) { + BasicBlock *ExitBlock = L->getExitBlock(); + if (!ExitBlock) + return false; + + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + return false; + + bool MadeAnyChanges = false; + + for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) { + + // Skip terminator. + if (Preheader->getTerminator() == &I) + continue; + + // New instructions were inserted at the end of the preheader. 
+ if (isa(I)) + break; + + // Don't move instructions which might have side effects, since the side + // effects need to complete before instructions inside the loop. Note that + // it's okay if the instruction might have undefined behavior: LoopSimplify + // guarantees that the preheader dominates the exit block. + if (I.mayHaveSideEffects()) + continue; + + if (!canSinkOrHoistInst(I, AA, DT, L, MSSAU, true, SinkFlags, nullptr)) + continue; + + // Determine if there is a use in or before the loop (direct or + // otherwise). + bool UsedInLoopOrPreheader = false; + for (Use &U : I.uses()) { + auto *UserI = cast(U.getUser()); + BasicBlock *UseBB = UserI->getParent(); + if (auto *PN = dyn_cast(UserI)) { + UseBB = PN->getIncomingBlock(U); + } + if (UseBB == Preheader || L->contains(UseBB)) { + UsedInLoopOrPreheader = true; + break; + } + } + if (UsedInLoopOrPreheader) + continue; + + moveInstructionBefore(I, ExitBlock->getFirstInsertionPt(), *SafetyInfo, + MSSAU, SE, MemorySSA::Beginning); + MadeAnyChanges = true; + } + + return MadeAnyChanges; +} + static Instruction *sinkThroughTriviallyReplaceablePHI( PHINode *TPN, Instruction *I, LoopInfo *LI, SmallDenseMap &SunkCopies, diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll index c5732531f5423..48ed5c4dedfb2 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -73,10 +73,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % } ; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure: -; GFX908: NumSgprs: 64 -; GFX908-GCNTRACKERS: NumSgprs: 64 +; GFX908: NumSgprs: 56 +; GFX908-GCNTRACKERS: NumSgprs: 56 ; GFX908: NumVgprs: 43 -; GFX908-GCNTRACKERS: NumVgprs: 39 +; GFX908-GCNTRACKERS: NumVgprs: 40 ; GFX908: Occupancy: 5 ; GFX908-GCNTRACKERS: Occupancy: 6 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll 
b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index db49339ea1f78..9c16b3c8a3f86 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -22,8 +22,6 @@ ; GFX9-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe00000 ; OFFREG is offset system SGPR -; GCN: buffer_store_dword {{v[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 640 diff --git a/llvm/test/CodeGen/PowerPC/combine-sext-and-shl-after-isel.ll b/llvm/test/CodeGen/PowerPC/combine-sext-and-shl-after-isel.ll index 00a77f92c0413..530169ff09486 100644 --- a/llvm/test/CodeGen/PowerPC/combine-sext-and-shl-after-isel.ll +++ b/llvm/test/CodeGen/PowerPC/combine-sext-and-shl-after-isel.ll @@ -212,37 +212,33 @@ define hidden void @testCaller(i1 %incond) local_unnamed_addr align 2 nounwind { ; CHECK-NEXT: std r30, 48(r1) # 8-byte Folded Spill ; CHECK-NEXT: andi. 
r3, r3, 1 ; CHECK-NEXT: li r3, -1 +; CHECK-NEXT: li r4, 0 ; CHECK-NEXT: li r30, 0 ; CHECK-NEXT: crmove 4*cr2+lt, gt ; CHECK-NEXT: std r29, 40(r1) # 8-byte Folded Spill ; CHECK-NEXT: b .LBB3_2 -; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB3_1: # %if.end116 ; CHECK-NEXT: # ; CHECK-NEXT: bl callee ; CHECK-NEXT: nop ; CHECK-NEXT: mr r3, r29 -; CHECK-NEXT: .LBB3_2: # %cond.end.i.i -; CHECK-NEXT: # =>This Loop Header: Depth=1 -; CHECK-NEXT: # Child Loop BB3_3 Depth 2 -; CHECK-NEXT: lwz r29, 0(r3) -; CHECK-NEXT: li r5, 0 -; CHECK-NEXT: extsw r4, r29 -; CHECK-NEXT: .p2align 5 -; CHECK-NEXT: .LBB3_3: # %while.body5.i -; CHECK-NEXT: # Parent Loop BB3_2 Depth=1 -; CHECK-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-NEXT: addi r5, r5, -1 -; CHECK-NEXT: cmpwi r5, 0 -; CHECK-NEXT: bgt cr0, .LBB3_3 -; CHECK-NEXT: # %bb.4: # %while.cond12.preheader.i +; CHECK-NEXT: li r4, 0 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB3_2: # %while.body5.i ; CHECK-NEXT: # +; CHECK-NEXT: addi r4, r4, -1 +; CHECK-NEXT: cmpwi r4, 0 +; CHECK-NEXT: bgt cr0, .LBB3_2 +; CHECK-NEXT: # %bb.3: # %while.cond12.preheader.i +; CHECK-NEXT: # +; CHECK-NEXT: lwz r29, 0(r3) ; CHECK-NEXT: bc 12, 4*cr2+lt, .LBB3_1 -; CHECK-NEXT: # %bb.5: # %for.cond99.preheader +; CHECK-NEXT: # %bb.4: # %for.cond99.preheader ; CHECK-NEXT: # +; CHECK-NEXT: extsw r4, r29 ; CHECK-NEXT: ld r5, 0(r3) -; CHECK-NEXT: sldi r4, r4, 2 ; CHECK-NEXT: stw r3, 0(r3) +; CHECK-NEXT: sldi r4, r4, 2 ; CHECK-NEXT: stwx r30, r5, r4 ; CHECK-NEXT: b .LBB3_1 ; @@ -256,37 +252,33 @@ define hidden void @testCaller(i1 %incond) local_unnamed_addr align 2 nounwind { ; CHECK-BE-NEXT: std r30, 64(r1) # 8-byte Folded Spill ; CHECK-BE-NEXT: andi. 
r3, r3, 1 ; CHECK-BE-NEXT: li r3, -1 +; CHECK-BE-NEXT: li r4, 0 ; CHECK-BE-NEXT: li r30, 0 ; CHECK-BE-NEXT: crmove 4*cr2+lt, gt ; CHECK-BE-NEXT: std r29, 56(r1) # 8-byte Folded Spill ; CHECK-BE-NEXT: b .LBB3_2 -; CHECK-BE-NEXT: .p2align 4 ; CHECK-BE-NEXT: .LBB3_1: # %if.end116 ; CHECK-BE-NEXT: # ; CHECK-BE-NEXT: bl callee ; CHECK-BE-NEXT: nop ; CHECK-BE-NEXT: mr r3, r29 -; CHECK-BE-NEXT: .LBB3_2: # %cond.end.i.i -; CHECK-BE-NEXT: # =>This Loop Header: Depth=1 -; CHECK-BE-NEXT: # Child Loop BB3_3 Depth 2 -; CHECK-BE-NEXT: lwz r29, 0(r3) -; CHECK-BE-NEXT: li r5, 0 -; CHECK-BE-NEXT: extsw r4, r29 -; CHECK-BE-NEXT: .p2align 5 -; CHECK-BE-NEXT: .LBB3_3: # %while.body5.i -; CHECK-BE-NEXT: # Parent Loop BB3_2 Depth=1 -; CHECK-BE-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-BE-NEXT: addi r5, r5, -1 -; CHECK-BE-NEXT: cmpwi r5, 0 -; CHECK-BE-NEXT: bgt cr0, .LBB3_3 -; CHECK-BE-NEXT: # %bb.4: # %while.cond12.preheader.i +; CHECK-BE-NEXT: li r4, 0 +; CHECK-BE-NEXT: .p2align 4 +; CHECK-BE-NEXT: .LBB3_2: # %while.body5.i +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: addi r4, r4, -1 +; CHECK-BE-NEXT: cmpwi r4, 0 +; CHECK-BE-NEXT: bgt cr0, .LBB3_2 +; CHECK-BE-NEXT: # %bb.3: # %while.cond12.preheader.i ; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: lwz r29, 0(r3) ; CHECK-BE-NEXT: bc 12, 4*cr2+lt, .LBB3_1 -; CHECK-BE-NEXT: # %bb.5: # %for.cond99.preheader +; CHECK-BE-NEXT: # %bb.4: # %for.cond99.preheader ; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: extsw r4, r29 ; CHECK-BE-NEXT: ld r5, 0(r3) -; CHECK-BE-NEXT: sldi r4, r4, 2 ; CHECK-BE-NEXT: stw r3, 0(r3) +; CHECK-BE-NEXT: sldi r4, r4, 2 ; CHECK-BE-NEXT: stwx r30, r5, r4 ; CHECK-BE-NEXT: b .LBB3_1 ; @@ -300,32 +292,28 @@ define hidden void @testCaller(i1 %incond) local_unnamed_addr align 2 nounwind { ; CHECK-P9-NEXT: std r0, 80(r1) ; CHECK-P9-NEXT: std r30, 48(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: li r3, -1 +; CHECK-P9-NEXT: li r4, 0 ; CHECK-P9-NEXT: li r30, 0 ; CHECK-P9-NEXT: std r29, 40(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: crmove 4*cr2+lt, 
gt ; CHECK-P9-NEXT: b .LBB3_2 -; CHECK-P9-NEXT: .p2align 4 ; CHECK-P9-NEXT: .LBB3_1: # %if.end116 ; CHECK-P9-NEXT: # ; CHECK-P9-NEXT: bl callee ; CHECK-P9-NEXT: nop ; CHECK-P9-NEXT: mr r3, r29 -; CHECK-P9-NEXT: .LBB3_2: # %cond.end.i.i -; CHECK-P9-NEXT: # =>This Loop Header: Depth=1 -; CHECK-P9-NEXT: # Child Loop BB3_3 Depth 2 -; CHECK-P9-NEXT: lwz r29, 0(r3) ; CHECK-P9-NEXT: li r4, 0 -; CHECK-P9-NEXT: .p2align 5 -; CHECK-P9-NEXT: .LBB3_3: # %while.body5.i -; CHECK-P9-NEXT: # Parent Loop BB3_2 Depth=1 -; CHECK-P9-NEXT: # => This Inner Loop Header: Depth=2 +; CHECK-P9-NEXT: .p2align 4 +; CHECK-P9-NEXT: .LBB3_2: # %while.body5.i +; CHECK-P9-NEXT: # ; CHECK-P9-NEXT: addi r4, r4, -1 ; CHECK-P9-NEXT: cmpwi r4, 0 -; CHECK-P9-NEXT: bgt cr0, .LBB3_3 -; CHECK-P9-NEXT: # %bb.4: # %while.cond12.preheader.i +; CHECK-P9-NEXT: bgt cr0, .LBB3_2 +; CHECK-P9-NEXT: # %bb.3: # %while.cond12.preheader.i ; CHECK-P9-NEXT: # +; CHECK-P9-NEXT: lwz r29, 0(r3) ; CHECK-P9-NEXT: bc 12, 4*cr2+lt, .LBB3_1 -; CHECK-P9-NEXT: # %bb.5: # %for.cond99.preheader +; CHECK-P9-NEXT: # %bb.4: # %for.cond99.preheader ; CHECK-P9-NEXT: # ; CHECK-P9-NEXT: ld r4, 0(r3) ; CHECK-P9-NEXT: extswsli r5, r29, 2 @@ -343,32 +331,28 @@ define hidden void @testCaller(i1 %incond) local_unnamed_addr align 2 nounwind { ; CHECK-P9-BE-NEXT: std r0, 96(r1) ; CHECK-P9-BE-NEXT: std r30, 64(r1) # 8-byte Folded Spill ; CHECK-P9-BE-NEXT: li r3, -1 +; CHECK-P9-BE-NEXT: li r4, 0 ; CHECK-P9-BE-NEXT: li r30, 0 ; CHECK-P9-BE-NEXT: std r29, 56(r1) # 8-byte Folded Spill ; CHECK-P9-BE-NEXT: crmove 4*cr2+lt, gt ; CHECK-P9-BE-NEXT: b .LBB3_2 -; CHECK-P9-BE-NEXT: .p2align 4 ; CHECK-P9-BE-NEXT: .LBB3_1: # %if.end116 ; CHECK-P9-BE-NEXT: # ; CHECK-P9-BE-NEXT: bl callee ; CHECK-P9-BE-NEXT: nop ; CHECK-P9-BE-NEXT: mr r3, r29 -; CHECK-P9-BE-NEXT: .LBB3_2: # %cond.end.i.i -; CHECK-P9-BE-NEXT: # =>This Loop Header: Depth=1 -; CHECK-P9-BE-NEXT: # Child Loop BB3_3 Depth 2 -; CHECK-P9-BE-NEXT: lwz r29, 0(r3) ; CHECK-P9-BE-NEXT: li r4, 0 -; 
CHECK-P9-BE-NEXT: .p2align 5 -; CHECK-P9-BE-NEXT: .LBB3_3: # %while.body5.i -; CHECK-P9-BE-NEXT: # Parent Loop BB3_2 Depth=1 -; CHECK-P9-BE-NEXT: # => This Inner Loop Header: Depth=2 +; CHECK-P9-BE-NEXT: .p2align 4 +; CHECK-P9-BE-NEXT: .LBB3_2: # %while.body5.i +; CHECK-P9-BE-NEXT: # ; CHECK-P9-BE-NEXT: addi r4, r4, -1 ; CHECK-P9-BE-NEXT: cmpwi r4, 0 -; CHECK-P9-BE-NEXT: bgt cr0, .LBB3_3 -; CHECK-P9-BE-NEXT: # %bb.4: # %while.cond12.preheader.i +; CHECK-P9-BE-NEXT: bgt cr0, .LBB3_2 +; CHECK-P9-BE-NEXT: # %bb.3: # %while.cond12.preheader.i ; CHECK-P9-BE-NEXT: # +; CHECK-P9-BE-NEXT: lwz r29, 0(r3) ; CHECK-P9-BE-NEXT: bc 12, 4*cr2+lt, .LBB3_1 -; CHECK-P9-BE-NEXT: # %bb.5: # %for.cond99.preheader +; CHECK-P9-BE-NEXT: # %bb.4: # %for.cond99.preheader ; CHECK-P9-BE-NEXT: # ; CHECK-P9-BE-NEXT: ld r4, 0(r3) ; CHECK-P9-BE-NEXT: extswsli r5, r29, 2 diff --git a/llvm/test/Transforms/IndVarSimplify/AMDGPU/addrspace-7-doesnt-crash.ll b/llvm/test/Transforms/IndVarSimplify/AMDGPU/addrspace-7-doesnt-crash.ll index 08dcf1d7a0091..8e932e0c00d4f 100644 --- a/llvm/test/Transforms/IndVarSimplify/AMDGPU/addrspace-7-doesnt-crash.ll +++ b/llvm/test/Transforms/IndVarSimplify/AMDGPU/addrspace-7-doesnt-crash.ll @@ -7,11 +7,11 @@ define void @f(ptr addrspace(7) %arg) { ; CHECK-LABEL: define void @f ; CHECK-SAME: (ptr addrspace(7) [[ARG:%.*]]) { ; CHECK-NEXT: bb: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr addrspace(7) [[ARG]], i32 8 ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: ; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB1]] ; CHECK: bb2: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr addrspace(7) [[ARG]], i32 8 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb3: ; CHECK-NEXT: [[I4:%.*]] = load i32, ptr addrspace(7) [[SCEVGEP]], align 4 diff --git a/llvm/test/Transforms/IndVarSimplify/ARM/code-size.ll b/llvm/test/Transforms/IndVarSimplify/ARM/code-size.ll index 2003b1a72206d..3c6535da486aa 100644 --- a/llvm/test/Transforms/IndVarSimplify/ARM/code-size.ll +++ 
b/llvm/test/Transforms/IndVarSimplify/ARM/code-size.ll @@ -4,33 +4,31 @@ define i32 @remove_loop(i32 %size) #0 { ; CHECK-V8M-LABEL: @remove_loop( -; CHECK-V8M-SAME: i32 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-V8M-NEXT: entry: -; CHECK-V8M-NEXT: br label %[[WHILE_COND:.*]] -; CHECK-V8M: while.cond: -; CHECK-V8M-NEXT: br i1 false, label %[[WHILE_COND]], label %[[WHILE_END:.*]] -; CHECK-V8M: while.end: -; CHECK-V8M-NEXT: [[TMP0:%.*]] = add i32 [[SIZE]], 31 +; CHECK-V8M-NEXT: [[TMP0:%.*]] = add i32 [[SIZE:%.*]], 31 ; CHECK-V8M-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SIZE]], i32 31) ; CHECK-V8M-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[UMIN]] ; CHECK-V8M-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 5 ; CHECK-V8M-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 5 ; CHECK-V8M-NEXT: [[TMP4:%.*]] = sub i32 [[SIZE]], [[TMP3]] +; CHECK-V8M-NEXT: br label [[WHILE_COND:%.*]] +; CHECK-V8M: while.cond: +; CHECK-V8M-NEXT: br i1 false, label [[WHILE_COND]], label [[WHILE_END:%.*]] +; CHECK-V8M: while.end: ; CHECK-V8M-NEXT: ret i32 [[TMP4]] ; ; CHECK-V8A-LABEL: @remove_loop( -; CHECK-V8A-SAME: i32 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-V8A-NEXT: entry: -; CHECK-V8A-NEXT: br label %[[WHILE_COND:.*]] -; CHECK-V8A: while.cond: -; CHECK-V8A-NEXT: br i1 false, label %[[WHILE_COND]], label %[[WHILE_END:.*]] -; CHECK-V8A: while.end: -; CHECK-V8A-NEXT: [[TMP0:%.*]] = add i32 [[SIZE]], 31 +; CHECK-V8A-NEXT: [[TMP0:%.*]] = add i32 [[SIZE:%.*]], 31 ; CHECK-V8A-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SIZE]], i32 31) ; CHECK-V8A-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[UMIN]] ; CHECK-V8A-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 5 ; CHECK-V8A-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 5 ; CHECK-V8A-NEXT: [[TMP4:%.*]] = sub i32 [[SIZE]], [[TMP3]] +; CHECK-V8A-NEXT: br label [[WHILE_COND:%.*]] +; CHECK-V8A: while.cond: +; CHECK-V8A-NEXT: br i1 false, label [[WHILE_COND]], label [[WHILE_END:%.*]] +; CHECK-V8A: while.end: ; CHECK-V8A-NEXT: ret i32 [[TMP4]] ; entry: diff --git 
a/llvm/test/Transforms/IndVarSimplify/ARM/indvar-unroll-imm-cost.ll b/llvm/test/Transforms/IndVarSimplify/ARM/indvar-unroll-imm-cost.ll index 2261423766792..382f026e7de6a 100644 --- a/llvm/test/Transforms/IndVarSimplify/ARM/indvar-unroll-imm-cost.ll +++ b/llvm/test/Transforms/IndVarSimplify/ARM/indvar-unroll-imm-cost.ll @@ -77,6 +77,8 @@ define dso_local arm_aapcscc void @test(ptr nocapture %pDest, ptr nocapture read ; CHECK-NEXT: [[CMP2780:%.*]] = icmp ugt i32 [[ADD25]], [[J_0_LCSSA]] ; CHECK-NEXT: br i1 [[CMP2780]], label [[FOR_BODY29_PREHEADER:%.*]], label [[FOR_END40]] ; CHECK: for.body29.preheader: +; CHECK-NEXT: [[TMP10:%.*]] = sub nsw i32 [[ADD25]], [[J_0_LCSSA]] +; CHECK-NEXT: [[SCEVGEP93:%.*]] = getelementptr i16, ptr [[PSRCB_ADDR_1_LCSSA]], i32 [[TMP10]] ; CHECK-NEXT: br label [[FOR_BODY29:%.*]] ; CHECK: for.body29: ; CHECK-NEXT: [[J_184:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY29]] ], [ [[J_0_LCSSA]], [[FOR_BODY29_PREHEADER]] ] @@ -100,8 +102,6 @@ define dso_local arm_aapcscc void @test(ptr nocapture %pDest, ptr nocapture read ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ADD25]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END40_LOOPEXIT:%.*]], label [[FOR_BODY29]] ; CHECK: for.end40.loopexit: -; CHECK-NEXT: [[TMP10:%.*]] = sub nsw i32 [[ADD25]], [[J_0_LCSSA]] -; CHECK-NEXT: [[SCEVGEP93:%.*]] = getelementptr i16, ptr [[PSRCB_ADDR_1_LCSSA]], i32 [[TMP10]] ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i16, ptr [[PSRCA_ADDR_1_LCSSA]], i32 [[TMP10]] ; CHECK-NEXT: [[SCEVGEP94:%.*]] = getelementptr i32, ptr [[PDEST_ADDR_1_LCSSA]], i32 [[TMP10]] ; CHECK-NEXT: br label [[FOR_END40]] diff --git a/llvm/test/Transforms/IndVarSimplify/X86/inner-loop-by-latch-cond.ll b/llvm/test/Transforms/IndVarSimplify/X86/inner-loop-by-latch-cond.ll index 0fa6e34cf186e..0eb9debce8177 100644 --- a/llvm/test/Transforms/IndVarSimplify/X86/inner-loop-by-latch-cond.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/inner-loop-by-latch-cond.ll @@ -14,6 +14,7 @@ define void 
@test(i64 %a) { ; CHECK: outer_header: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 21, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[I:%.*]] = phi i64 [ 20, [[ENTRY]] ], [ [[I_NEXT:%.*]], [[OUTER_LATCH]] ] +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: br label [[INNER_HEADER:%.*]] ; CHECK: inner_header: ; CHECK-NEXT: [[J:%.*]] = phi i64 [ 1, [[OUTER_HEADER]] ], [ [[J_NEXT:%.*]], [[INNER_HEADER]] ] @@ -22,7 +23,6 @@ define void @test(i64 %a) { ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[J_NEXT]], [[INDVARS_IV]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[INNER_HEADER]], label [[OUTER_LATCH]] ; CHECK: outer_latch: -; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND2:%.*]] = icmp ne i64 [[I_NEXT]], 40 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: br i1 [[COND2]], label [[OUTER_HEADER]], label [[RETURN:%.*]] diff --git a/llvm/test/Transforms/IndVarSimplify/exit-count-select.ll b/llvm/test/Transforms/IndVarSimplify/exit-count-select.ll index 1592b84480e3f..829092f2f4bd4 100644 --- a/llvm/test/Transforms/IndVarSimplify/exit-count-select.ll +++ b/llvm/test/Transforms/IndVarSimplify/exit-count-select.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=indvars -S | FileCheck %s +; RUN: opt < %s -passes='require,indvars,loop-mssa(licm)' -S | FileCheck %s define i32 @logical_and_2ops(i32 %n, i32 %m) { ; CHECK-LABEL: @logical_and_2ops( @@ -56,10 +56,10 @@ define i32 @logical_and_3ops(i32 %n, i32 %m, i32 %k) { ; CHECK: loop: ; CHECK-NEXT: br i1 false, label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: -; CHECK-NEXT: [[TMP0:%.*]] = freeze i32 [[K:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[M:%.*]] -; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[TMP1]]) -; CHECK-NEXT: [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[UMIN]], i32 [[N:%.*]]) +; CHECK-NEXT: [[N:%.*]] = freeze 
i32 [[K:%.*]] +; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 [[N]]) +; CHECK-NEXT: [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[UMIN]], i32 [[N1:%.*]]) ; CHECK-NEXT: ret i32 [[UMIN1]] ; entry: @@ -84,10 +84,10 @@ define i32 @logical_or_3ops(i32 %n, i32 %m, i32 %k) { ; CHECK: loop: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[TMP0:%.*]] = freeze i32 [[K:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[M:%.*]] -; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[TMP1]]) -; CHECK-NEXT: [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[UMIN]], i32 [[N:%.*]]) +; CHECK-NEXT: [[N:%.*]] = freeze i32 [[K:%.*]] +; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 [[N]]) +; CHECK-NEXT: [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[UMIN]], i32 [[N1:%.*]]) ; CHECK-NEXT: ret i32 [[UMIN1]] ; entry: diff --git a/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll b/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll index e006d9f6696ca..f798eb281f51a 100644 --- a/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll +++ b/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll @@ -932,6 +932,9 @@ for.end: ; preds = %for.body, %entry define i16 @ult_multiuse_profit(i16 %n.raw, i8 %start) mustprogress { ; CHECK-LABEL: @ult_multiuse_profit( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[START:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP2]] to i16 +; CHECK-NEXT: [[UMAX:%.*]] = call i16 @llvm.umax.i16(i16 [[TMP1]], i16 254) ; CHECK-NEXT: [[TMP0:%.*]] = trunc i16 254 to i8 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: @@ -940,9 +943,6 @@ define i16 @ult_multiuse_profit(i16 %n.raw, i8 %start) mustprogress { ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[START:%.*]], 1 
-; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i16 -; CHECK-NEXT: [[UMAX:%.*]] = call i16 @llvm.umax.i16(i16 [[TMP2]], i16 254) ; CHECK-NEXT: ret i16 [[UMAX]] ; entry: diff --git a/llvm/test/Transforms/IndVarSimplify/pr116483.ll b/llvm/test/Transforms/IndVarSimplify/pr116483.ll index 093e25a3caa81..e9e0d22bf960a 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr116483.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr116483.ll @@ -4,16 +4,16 @@ define i32 @test() { ; CHECK-LABEL: define i32 @test() { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: br label %[[LOOP_BODY:.*]] -; CHECK: [[LOOP_BODY]]: -; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP_BODY]] -; CHECK: [[EXIT]]: ; CHECK-NEXT: [[XOR:%.*]] = xor i32 0, 3 ; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[XOR]], 329 ; CHECK-NEXT: [[CONV:%.*]] = trunc i32 [[MUL]] to i16 ; CHECK-NEXT: [[SEXT:%.*]] = shl i16 [[CONV]], 8 ; CHECK-NEXT: [[CONV1:%.*]] = ashr i16 [[SEXT]], 8 ; CHECK-NEXT: [[CONV3:%.*]] = zext i16 [[CONV1]] to i32 +; CHECK-NEXT: br label %[[LOOP_BODY:.*]] +; CHECK: [[LOOP_BODY]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP_BODY]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret i32 [[CONV3]] ; entry: diff --git a/llvm/test/Transforms/IndVarSimplify/pr24783.ll b/llvm/test/Transforms/IndVarSimplify/pr24783.ll index c521bcaf59d49..37ecf42ea0fd3 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr24783.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr24783.ll @@ -7,11 +7,11 @@ target triple = "powerpc64-unknown-linux-gnu" define void @f(ptr %end.s, ptr %loc, i32 %p) { ; CHECK-LABEL: @f( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[END:%.*]] = getelementptr inbounds i32, ptr [[END_S:%.*]], i32 [[P:%.*]] ; CHECK-NEXT: br label [[WHILE_BODY_I:%.*]] ; CHECK: while.body.i: ; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[WHILE_BODY_I]] ; CHECK: loop.exit: -; CHECK-NEXT: [[END:%.*]] = getelementptr inbounds i32, ptr [[END_S:%.*]], i32 [[P:%.*]] ; CHECK-NEXT: store ptr [[END]], ptr [[LOC:%.*]], align 8 ; 
CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/IndVarSimplify/pr39673.ll b/llvm/test/Transforms/IndVarSimplify/pr39673.ll index 7b093b34b91ad..3cee1ab7be881 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr39673.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr39673.ll @@ -148,6 +148,7 @@ loop2.end: ; preds = %loop2 define i16 @neg_loop_carried(i16 %arg) { ; CHECK-LABEL: @neg_loop_carried( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[ARG:%.*]], 2 ; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop1: ; CHECK-NEXT: [[L1:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[L1_ADD:%.*]], [[LOOP1]] ] @@ -155,7 +156,6 @@ define i16 @neg_loop_carried(i16 %arg) { ; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i16 [[L1_ADD]], 2 ; CHECK-NEXT: br i1 [[CMP1]], label [[LOOP1]], label [[LOOP2_PREHEADER:%.*]] ; CHECK: loop2.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[ARG:%.*]], 2 ; CHECK-NEXT: br label [[LOOP2:%.*]] ; CHECK: loop2: ; CHECK-NEXT: [[K2:%.*]] = phi i16 [ [[K2_ADD:%.*]], [[LOOP2]] ], [ [[TMP0]], [[LOOP2_PREHEADER]] ] diff --git a/llvm/test/Transforms/IndVarSimplify/pr63763.ll b/llvm/test/Transforms/IndVarSimplify/pr63763.ll index 427db1e67410a..a5fde67d6140a 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr63763.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr63763.ll @@ -16,13 +16,13 @@ define i32 @test(i1 %c) { ; CHECK-NEXT: [[CONV2:%.*]] = ashr exact i32 [[SEXT]], 24 ; CHECK-NEXT: [[INVARIANT_OP:%.*]] = sub nsw i32 7, [[CONV2]] ; CHECK-NEXT: call void @use(i32 [[INVARIANT_OP]]) +; CHECK-NEXT: [[SEXT_US:%.*]] = shl i32 [[SEL]], 24 +; CHECK-NEXT: [[CONV2_US:%.*]] = ashr exact i32 [[SEXT_US]], 24 +; CHECK-NEXT: [[INVARIANT_OP_US:%.*]] = sub nsw i32 7, [[CONV2_US]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[SEXT_US:%.*]] = shl i32 [[SEL]], 24 -; CHECK-NEXT: [[CONV2_US:%.*]] = ashr exact i32 [[SEXT_US]], 24 -; CHECK-NEXT: [[INVARIANT_OP_US:%.*]] = sub nsw 
i32 7, [[CONV2_US]] ; CHECK-NEXT: ret i32 [[INVARIANT_OP_US]] ; entry: diff --git a/llvm/test/Transforms/IndVarSimplify/replace-loop-exit-folds.ll b/llvm/test/Transforms/IndVarSimplify/replace-loop-exit-folds.ll index b3162de0f2245..7cdc98a6c4f7c 100644 --- a/llvm/test/Transforms/IndVarSimplify/replace-loop-exit-folds.ll +++ b/llvm/test/Transforms/IndVarSimplify/replace-loop-exit-folds.ll @@ -4,22 +4,21 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" define i32 @remove_loop(i32 %size) { -; CHECK-LABEL: define i32 @remove_loop( -; CHECK-SAME: i32 [[SIZE:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[WHILE_COND:.*]] -; CHECK: [[WHILE_COND]]: -; CHECK-NEXT: [[SIZE_ADDR_0:%.*]] = phi i32 [ [[SIZE]], %[[ENTRY]] ], [ [[SUB:%.*]], %[[WHILE_COND]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[SIZE_ADDR_0]], 31 -; CHECK-NEXT: [[SUB]] = add i32 [[SIZE_ADDR_0]], -32 -; CHECK-NEXT: br i1 [[CMP]], label %[[WHILE_COND]], label %[[WHILE_END:.*]] -; CHECK: [[WHILE_END]]: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SIZE]], 31 +; CHECK-LABEL: @remove_loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SIZE:%.*]], 31 ; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SIZE]], i32 31) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[UMIN]] ; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 5 ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 5 ; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[SIZE]], [[TMP3]] +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[SIZE_ADDR_0:%.*]] = phi i32 [ [[SIZE]], [[ENTRY:%.*]] ], [ [[SUB:%.*]], [[WHILE_COND]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[SIZE_ADDR_0]], 31 +; CHECK-NEXT: [[SUB]] = add i32 [[SIZE_ADDR_0]], -32 +; CHECK-NEXT: br i1 [[CMP]], label [[WHILE_COND]], label [[WHILE_END:%.*]] +; CHECK: while.end: ; CHECK-NEXT: ret i32 [[TMP4]] ; entry: diff --git a/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll 
b/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll index 84ae79d53e25e..41fce3681c3a3 100644 --- a/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll +++ b/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll @@ -76,6 +76,10 @@ define i64 @narow_canonical_iv_wide_multiplied_iv(i32 %x, i64 %y, ptr %0) { ; CHECK-LABEL: @narow_canonical_iv_wide_multiplied_iv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SMAX:%.*]] = tail call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 1) +; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[SMAX]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[Y:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP3]], 1 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -84,10 +88,6 @@ define i64 @narow_canonical_iv_wide_multiplied_iv(i32 %x, i64 %y, ptr %0) { ; CHECK-NEXT: [[EC:%.*]] = icmp ne i32 [[IV_NEXT]], [[SMAX]] ; CHECK-NEXT: br i1 [[EC]], label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: -; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[SMAX]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[Y:%.*]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP3]], 1 ; CHECK-NEXT: ret i64 [[TMP6]] ; entry: diff --git a/llvm/test/Transforms/IndVarSimplify/scev-expander-preserve-lcssa.ll b/llvm/test/Transforms/IndVarSimplify/scev-expander-preserve-lcssa.ll index 14e06fe06b412..aca553e536119 100644 --- a/llvm/test/Transforms/IndVarSimplify/scev-expander-preserve-lcssa.ll +++ b/llvm/test/Transforms/IndVarSimplify/scev-expander-preserve-lcssa.ll @@ -23,8 +23,8 @@ define void @test1(i8 %x, ptr %ptr) { ; CHECK-NEXT: br label [[WHILE_COND192:%.*]] ; CHECK: while.cond192: ; CHECK-NEXT: switch i8 [[X:%.*]], label [[WHILE_BODY205:%.*]] [ -; CHECK-NEXT: i8 59, label [[WHILE_COND215_PREHEADER:%.*]] -; CHECK-NEXT: i8 10, label 
[[IF_END224_LOOPEXIT1:%.*]] +; CHECK-NEXT: i8 59, label [[WHILE_COND215_PREHEADER:%.*]] +; CHECK-NEXT: i8 10, label [[IF_END224_LOOPEXIT1:%.*]] ; CHECK-NEXT: ] ; CHECK: while.cond215.preheader: ; CHECK-NEXT: br label [[WHILE_COND215:%.*]] @@ -103,8 +103,8 @@ define void @test2(i16 %x) { ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: for.cond: ; CHECK-NEXT: switch i16 [[X:%.*]], label [[RETURN_LOOPEXIT1:%.*]] [ -; CHECK-NEXT: i16 41, label [[FOR_END:%.*]] -; CHECK-NEXT: i16 43, label [[FOR_COND]] +; CHECK-NEXT: i16 41, label [[FOR_END:%.*]] +; CHECK-NEXT: i16 43, label [[FOR_COND]] ; CHECK-NEXT: ] ; CHECK: for.end: ; CHECK-NEXT: [[I_0_LCSSA2:%.*]] = phi i32 [ 0, [[FOR_COND]] ] @@ -336,6 +336,7 @@ if.end1824: ; preds = %for.end1326 define void @test5(ptr %header, i32 %conv, i8 %n) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[SHL:%.*]] = shl nuw nsw i32 [[CONV:%.*]], 2 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: br label [[FOR_INNER:%.*]] @@ -358,7 +359,6 @@ define void @test5(ptr %header, i32 %conv, i8 %n) { ; CHECK-NEXT: br i1 false, label [[FOR_BODY]], label [[WHILE_COND_PREHEADER:%.*]] ; CHECK: while.cond.preheader: ; CHECK-NEXT: [[ADD85_LCSSA:%.*]] = phi i32 [ [[ADD85]], [[FOR_INC]] ] -; CHECK-NEXT: [[SHL:%.*]] = shl nuw nsw i32 [[CONV:%.*]], 2 ; CHECK-NEXT: br label [[WHILE_COND:%.*]] ; CHECK: while.cond: ; CHECK-NEXT: [[POS_8:%.*]] = phi i32 [ [[INC114:%.*]], [[WHILE_BODY:%.*]] ], [ [[ADD85_LCSSA]], [[WHILE_COND_PREHEADER]] ] @@ -427,8 +427,8 @@ define void @test6(i8 %x) { ; CHECK-NEXT: br label [[WHILE_COND192:%.*]] ; CHECK: while.cond192: ; CHECK-NEXT: switch i8 [[X:%.*]], label [[WHILE_BODY205:%.*]] [ -; CHECK-NEXT: i8 59, label [[WHILE_COND215_PREHEADER:%.*]] -; CHECK-NEXT: i8 10, label [[IF_END224:%.*]] +; CHECK-NEXT: i8 59, label [[WHILE_COND215_PREHEADER:%.*]] +; CHECK-NEXT: i8 10, label [[IF_END224:%.*]] ; CHECK-NEXT: ] ; CHECK: while.cond215.preheader: ; CHECK-NEXT: [[I_7_LCSSA:%.*]] = phi i32 [ 0, 
[[WHILE_COND192]] ] diff --git a/llvm/test/Transforms/IndVarSimplify/scev-invalidation.ll b/llvm/test/Transforms/IndVarSimplify/scev-invalidation.ll index a92d328df99ca..ad69812838569 100644 --- a/llvm/test/Transforms/IndVarSimplify/scev-invalidation.ll +++ b/llvm/test/Transforms/IndVarSimplify/scev-invalidation.ll @@ -46,12 +46,12 @@ for.end106: ; preds = %for.cond define i32 @test_pr58439(i32 %a) { ; CHECK-LABEL: @test_pr58439( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A:%.*]], 1 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: br i1 false, label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: [[C_EXT_LCSSA:%.*]] = phi i32 [ 0, [[LOOP]] ] -; CHECK-NEXT: [[OR:%.*]] = or i32 [[A:%.*]], 1 ; CHECK-NEXT: [[RES:%.*]] = add i32 [[C_EXT_LCSSA]], [[OR]] ; CHECK-NEXT: ret i32 [[RES]] ; @@ -76,6 +76,7 @@ define i8 @l(i32 %inc, i1 %tobool.not.i) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] ; CHECK: outer.header: +; CHECK-NEXT: [[AND:%.*]] = and i32 1, [[INC:%.*]] ; CHECK-NEXT: br label [[INNER:%.*]] ; CHECK: inner: ; CHECK-NEXT: [[C_05_I:%.*]] = phi i32 [ [[INC_I:%.*]], [[INNER]] ], [ 0, [[OUTER_HEADER]] ] @@ -86,7 +87,6 @@ define i8 @l(i32 %inc, i1 %tobool.not.i) { ; CHECK: outer.latch: ; CHECK-NEXT: [[C_05_I_LCSSA:%.*]] = phi i32 [ [[C_05_I]], [[INNER]] ] ; CHECK-NEXT: [[LCSSA:%.*]] = phi i32 [ 0, [[INNER]] ] -; CHECK-NEXT: [[AND:%.*]] = and i32 1, [[INC:%.*]] ; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[AND]] to i8 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[C_05_I_LCSSA]] to i8 ; CHECK-NEXT: [[TMP2:%.*]] = sub i8 [[TMP0]], [[TMP1]] diff --git a/llvm/test/Transforms/IndVarSimplify/sentinel.ll b/llvm/test/Transforms/IndVarSimplify/sentinel.ll index 523414167956b..4f12308f3b01a 100644 --- a/llvm/test/Transforms/IndVarSimplify/sentinel.ll +++ b/llvm/test/Transforms/IndVarSimplify/sentinel.ll @@ -9,19 +9,19 @@ define void @test(i1 %arg) personality ptr @snork { ; CHECK-NEXT: bb: ; CHECK-NEXT: br label [[BB4:%.*]] ; 
CHECK: bb1: -; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add i32 [[INDVARS_IV:%.*]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[TMP6:%.*]], [[INDVARS_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMAX:%.*]] ; CHECK-NEXT: br i1 [[ARG:%.*]], label [[BB2:%.*]], label [[BB4]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP1]], [[BB1:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP1:%.*]], [[BB1:%.*]] ] ; CHECK-NEXT: ret void ; CHECK: bb4: -; CHECK-NEXT: [[INDVARS_IV]] = phi i32 [ [[INDVARS_IV_NEXT]], [[BB1]] ], [ undef, [[BB:%.*]] ] -; CHECK-NEXT: [[SMAX]] = call i32 @llvm.smax.i32(i32 [[INDVARS_IV]], i32 36) -; CHECK-NEXT: [[TMP6]] = invoke i32 @quux() [ "deopt"(i32 0, i32 0, i32 0, i32 180, i32 0, i32 25, i32 0, i32 7, ptr null, i32 7, ptr null, i32 7, ptr null, i32 3, i32 [[INDVARS_IV]], i32 3, i32 undef, i32 7, ptr null, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 4, double undef, i32 7, ptr null, i32 4, i64 undef, i32 7, ptr null, i32 0, ptr addrspace(1) undef, i32 3, i32 undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 7, ptr null) ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[BB1]] ], [ undef, [[BB:%.*]] ] +; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[INDVARS_IV]], i32 36) +; CHECK-NEXT: [[TMP6:%.*]] = invoke i32 @quux() [ "deopt"(i32 0, i32 0, i32 0, i32 180, i32 0, i32 25, i32 0, i32 7, ptr null, i32 7, ptr null, i32 7, ptr null, i32 3, i32 [[INDVARS_IV]], i32 3, i32 undef, i32 7, ptr null, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 4, double undef, i32 7, ptr null, i32 4, i64 undef, i32 7, ptr null, i32 0, ptr addrspace(1) undef, i32 3, i32 undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr 
addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 7, ptr null) ] ; CHECK-NEXT: to label [[BB7:%.*]] unwind label [[BB15:%.*]] ; CHECK: bb7: +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[TMP6]], [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1]] = sub i32 [[TMP0]], [[SMAX]] ; CHECK-NEXT: br label [[BB9:%.*]] ; CHECK: bb9: ; CHECK-NEXT: br i1 true, label [[BB1]], label [[BB9]] diff --git a/llvm/test/Transforms/IndVarSimplify/sink-from-preheader.ll b/llvm/test/Transforms/IndVarSimplify/sink-from-preheader.ll deleted file mode 100644 index 89583f9131518..0000000000000 --- a/llvm/test/Transforms/IndVarSimplify/sink-from-preheader.ll +++ /dev/null @@ -1,32 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=indvars -indvars-predicate-loops=0 -S | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" -target triple = "i386-apple-darwin10.0" - -; We make sinking here, Changed flag should be set properly. 
-define i32 @test(i32 %a, i32 %b, i32 %N) { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]] -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] -; CHECK: exit: -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: ret i32 [[ADD]] -; -entry: - %add = add i32 %a, %b - br label %loop - -loop: - %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] - %iv.next = add i32 %iv, 1 - %cmp = icmp slt i32 %iv.next, %N - br i1 %cmp, label %loop, label %exit - -exit: - ret i32 %add -} diff --git a/llvm/test/Transforms/IndVarSimplify/sink-trapping.ll b/llvm/test/Transforms/IndVarSimplify/sink-trapping.ll deleted file mode 100644 index d2478be5a8fcc..0000000000000 --- a/llvm/test/Transforms/IndVarSimplify/sink-trapping.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: opt < %s -passes=indvars -S | FileCheck %s - -declare i1 @b() - -define i32 @a(i32 %x) nounwind { -for.body.preheader: - %y = sdiv i32 10, %x - br label %for.body - -for.body: - %cmp = call i1 @b() - br i1 %cmp, label %for.body, label %for.end.loopexit - -for.end.loopexit: - ret i32 %y -} -; CHECK: for.end.loopexit: -; CHECK: sdiv -; CHECK: ret diff --git a/llvm/test/Transforms/IndVarSimplify/zext-nuw.ll b/llvm/test/Transforms/IndVarSimplify/zext-nuw.ll index 17921afc5ff06..abe7a3e618dd8 100644 --- a/llvm/test/Transforms/IndVarSimplify/zext-nuw.ll +++ b/llvm/test/Transforms/IndVarSimplify/zext-nuw.ll @@ -24,13 +24,13 @@ define void @_Z3fn1v() { ; CHECK-NEXT: [[X8:%.*]] = icmp ult i32 0, 4 ; CHECK-NEXT: br i1 [[X8]], label [[DOTPREHEADER_LR_PH:%.*]], label [[X22]] ; CHECK: .preheader.lr.ph: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[K_09]], i64 [[TMP5]] ; CHECK-NEXT: br label [[DOTPREHEADER:%.*]] ; CHECK: .preheader: ; CHECK-NEXT: 
br label [[X17:%.*]] ; CHECK: x17: ; CHECK-NEXT: br i1 false, label [[DOTPREHEADER]], label [[DOT_CRIT_EDGE_8:%.*]] ; CHECK: ._crit_edge.8: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[K_09]], i64 [[TMP5]] ; CHECK-NEXT: br label [[X22]] ; CHECK: x22: ; CHECK-NEXT: [[K_1_LCSSA:%.*]] = phi ptr [ [[SCEVGEP]], [[DOT_CRIT_EDGE_8]] ], [ [[K_09]], [[DOTPREHEADER4]] ] diff --git a/llvm/test/Transforms/LICM/scalar-promote.ll b/llvm/test/Transforms/LICM/scalar-promote.ll index 3af65df55a099..e6cc457bd55b4 100644 --- a/llvm/test/Transforms/LICM/scalar-promote.ll +++ b/llvm/test/Transforms/LICM/scalar-promote.ll @@ -43,9 +43,9 @@ define void @test2(i32 %i) { ; CHECK-LABEL: define void @test2( ; CHECK-SAME: i32 [[I:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[X1:%.*]] = getelementptr i32, ptr @X, i64 1 ; CHECK-NEXT: [[X2:%.*]] = getelementptr i32, ptr @X, i64 1 -; CHECK-NEXT: [[X1_PROMOTED:%.*]] = load i32, ptr [[X1]], align 4 +; CHECK-NEXT: [[X3:%.*]] = getelementptr i32, ptr @X, i64 1 +; CHECK-NEXT: [[X1_PROMOTED:%.*]] = load i32, ptr [[X2]], align 4 ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[A1:%.*]] = phi i32 [ [[V:%.*]], %[[LOOP]] ], [ [[X1_PROMOTED]], %[[ENTRY]] ] @@ -53,7 +53,7 @@ define void @test2(i32 %i) { ; CHECK-NEXT: br i1 false, label %[[LOOP]], label %[[EXIT:.*]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[V_LCSSA:%.*]] = phi i32 [ [[V]], %[[LOOP]] ] -; CHECK-NEXT: store i32 [[V_LCSSA]], ptr [[X1]], align 4 +; CHECK-NEXT: store i32 [[V_LCSSA]], ptr [[X2]], align 4 ; CHECK-NEXT: ret void ; Entry: diff --git a/llvm/test/Transforms/IndVarSimplify/sink-alloca.ll b/llvm/test/Transforms/LICM/sink-alloca.ll similarity index 89% rename from llvm/test/Transforms/IndVarSimplify/sink-alloca.ll rename to llvm/test/Transforms/LICM/sink-alloca.ll index 0997bf6128869..2bf9350b71ea7 100644 --- a/llvm/test/Transforms/IndVarSimplify/sink-alloca.ll +++ b/llvm/test/Transforms/LICM/sink-alloca.ll @@ -1,9 +1,9 @@ -; RUN: opt < %s -passes=indvars 
-S | FileCheck %s +; RUN: opt < %s -passes=licm -verify-memoryssa -S | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i386-apple-darwin10.0" ; PR4775 -; Indvars shouldn't sink the alloca out of the entry block, even though +; LICM shouldn't sink the alloca out of the entry block, even though ; it's not used until after the loop. define i32 @main() nounwind { ; CHECK: entry: @@ -25,7 +25,7 @@ while.end: ; preds = %while.cond declare i32 @bar() ; -; Indvars shouldn't sink the first alloca between the stacksave and stackrestore +; LICM shouldn't sink the first alloca between the stacksave and stackrestore ; intrinsics. declare ptr @a(...) declare ptr @llvm.stacksave() nounwind diff --git a/llvm/test/Transforms/LICM/sink-from-preheader.ll b/llvm/test/Transforms/LICM/sink-from-preheader.ll new file mode 100644 index 0000000000000..bbe3d3b285c15 --- /dev/null +++ b/llvm/test/Transforms/LICM/sink-from-preheader.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=licm -verify-memoryssa -S | FileCheck %s + +; We perform sinking here, Changed flag should be set properly. 
+define i32 @test(i32 %a, i32 %b, i32 %N) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret i32 [[ADD]] +; +entry: + %add = add i32 %a, %b + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add i32 %iv, 1 + %cmp = icmp slt i32 %iv.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret i32 %add +} + +define i32 @test_with_unused_load(i32 %a, ptr %b, i32 %N) { +; CHECK-LABEL: @test_with_unused_load( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[A:%.*]], [[LOAD]] +; CHECK-NEXT: ret i32 [[ADD]] +; +entry: + %load = load i32, ptr %b + %add = add i32 %a, %load + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add i32 %iv, 1 + %cmp = icmp slt i32 %iv.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret i32 %add +} + +define i32 @test_with_unused_load_modified_store(i32 %a, ptr %b, i32 %N) { +; CHECK-LABEL: @test_with_unused_load_modified_store( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[B:%.*]], align 4 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; 
CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], [[A:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[SMAX:%.*]] = phi i32 [ [[IV_NEXT]], [[LOOP]] ] +; CHECK-NEXT: store i32 [[SMAX]], ptr [[B]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[A]], [[LOAD]] +; CHECK-NEXT: ret i32 [[ADD]] +; +entry: + %load = load i32, ptr %b + %add = add i32 %a, %load + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add i32 %iv, %a + store i32 %iv.next, ptr %b + %cmp = icmp slt i32 %iv.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret i32 %add +} + +; Volatile loads must not be sunk. +define i32 @test_with_volatile_load_no_sink(i32 %a, ptr %b, i32 %N) { +; CHECK-LABEL: @test_with_volatile_load_no_sink( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LD:%.*]] = load volatile i32, ptr [[B:%.*]], align 4 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[A:%.*]], [[LD]] +; CHECK-NEXT: ret i32 [[ADD]] +; +entry: + %ld = load volatile i32, ptr %b, align 4 + %add = add i32 %a, %ld + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add i32 %iv, 1 + %cmp = icmp slt i32 %iv.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret i32 %add +} + +; Ordered/atomic loads must not be sunk. 
+define i32 @test_with_atomic_load_no_sink(i32 %a, ptr %b, i32 %N) { +; CHECK-LABEL: @test_with_atomic_load_no_sink( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LD:%.*]] = load atomic i32, ptr [[B:%.*]] acquire, align 4 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[A:%.*]], [[LD]] +; CHECK-NEXT: ret i32 [[ADD]] +; +entry: + %ld = load atomic i32, ptr %b acquire, align 4 + %add = add i32 %a, %ld + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add i32 %iv, 1 + %cmp = icmp slt i32 %iv.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret i32 %add +} + +declare void @clobber(ptr) + +; Calls that may write memory in the loop should prevent sinking the load. 
+define i32 @test_with_unused_load_clobbered_by_call(i32 %a, ptr %b, i32 %N) { +; CHECK-LABEL: @test_with_unused_load_clobbered_by_call( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[B:%.*]], align 4 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: call void @clobber(ptr [[B]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[A:%.*]], [[LD]] +; CHECK-NEXT: ret i32 [[ADD]] +; +entry: + %ld = load i32, ptr %b, align 4 + %add = add i32 %a, %ld + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add i32 %iv, 1 + call void @clobber(ptr %b) + %cmp = icmp slt i32 %iv.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret i32 %add +} diff --git a/llvm/test/Transforms/LICM/sink-trapping.ll b/llvm/test/Transforms/LICM/sink-trapping.ll new file mode 100644 index 0000000000000..f4d260d973987 --- /dev/null +++ b/llvm/test/Transforms/LICM/sink-trapping.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -passes=licm -verify-memoryssa -S | FileCheck %s + +declare i1 @b() + +define i32 @a(i32 %x) nounwind { +; CHECK-LABEL: define i32 @a( +; CHECK-SAME: i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[FOR_BODY_PREHEADER:.*:]] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[CMP:%.*]] = call i1 @b() +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END_LOOPEXIT:.*]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[Y:%.*]] = sdiv i32 10, [[X]] +; CHECK-NEXT: ret i32 [[Y]] +; +for.body.preheader: + %y = sdiv i32 10, %x + br label %for.body + +for.body: + %cmp = call i1 @b() + br i1 %cmp, label %for.body, 
label %for.end.loopexit + +for.end.loopexit: + ret i32 %y +} diff --git a/llvm/test/Transforms/LoopDeletion/invalidate-scev-after-hoisting.ll b/llvm/test/Transforms/LoopDeletion/invalidate-scev-after-hoisting.ll index bdd51c2b6bc53..6c19aaad03ba8 100644 --- a/llvm/test/Transforms/LoopDeletion/invalidate-scev-after-hoisting.ll +++ b/llvm/test/Transforms/LoopDeletion/invalidate-scev-after-hoisting.ll @@ -84,13 +84,13 @@ define i32 @scev_invalidation_after_deleting(ptr %src) { ; CHECK: inner.2.preheader: ; CHECK-NEXT: br label [[INNER_3_PH:%.*]] ; CHECK: inner.3.ph: +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 0 to i32 ; CHECK-NEXT: br label [[INNER_3:%.*]] ; CHECK: inner.3: ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[SRC:%.*]], align 4 ; CHECK-NEXT: br i1 false, label [[OUTER_LATCH]], label [[INNER_3]] ; CHECK: outer.latch: ; CHECK-NEXT: [[L_LCSSA:%.*]] = phi i32 [ [[L]], [[INNER_3]] ] -; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 0 to i32 ; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nsw i32 [[L_LCSSA]], [[TRUNC]] ; CHECK-NEXT: br label [[OUTER_HEADER]] ; diff --git a/llvm/test/Transforms/LoopDistribute/laa-invalidation.ll b/llvm/test/Transforms/LoopDistribute/laa-invalidation.ll index 62c5627ac2d38..4a55c0e9e11d5 100644 --- a/llvm/test/Transforms/LoopDistribute/laa-invalidation.ll +++ b/llvm/test/Transforms/LoopDistribute/laa-invalidation.ll @@ -4,11 +4,11 @@ define void @test_pr50940(ptr %A, ptr %B) { ; CHECK-LABEL: @test_pr50940( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] ; CHECK: outer.header: ; CHECK-NEXT: br i1 false, label [[OUTER_LATCH:%.*]], label [[INNER_PH:%.*]] ; CHECK: inner.ph: -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 3 ; CHECK-NEXT: br label [[INNER_LVER_CHECK:%.*]] ; CHECK: inner.lver.check: diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll 
b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll index eea22374ade30..abed18a57b90e 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll @@ -380,7 +380,6 @@ define void @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP8]], 8589934588 -; CHECK-NEXT: [[IND_END:%.*]] = add nuw nsw i64 [[N_VEC]], [[TMP4]] ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> , i32 [[ARRAYIDX5_PROMOTED]], i64 0 ; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i32, ptr [[VAR2]], i64 [[TMP4]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -396,6 +395,7 @@ define void @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP17]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[IND_END:%.*]] = add nuw nsw i64 [[N_VEC]], [[TMP4]] ; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]]) ; CHECK-NEXT: store i32 [[TMP19]], ptr [[ARRAYIDX5]], align 4, !alias.scope [[META27:![0-9]+]], !noalias [[META23]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll index 8d20a3ba8ed08..d311f547f2e51 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll @@ -43,7 +43,6 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY_PREHEADER13]], label [[VECTOR_PH:%.*]] 
; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP8]], -8 -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -64,6 +63,7 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER13]] ; CHECK: for.body.preheader14: diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll index 2dceb27165c4d..f2ae327778f4a 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll @@ -1040,7 +1040,6 @@ define void @saxpy_5(i64 %n, float %a, ptr readonly %x, ptr noalias %y) { ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[LOOP_PREHEADER11:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775806 -; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 5 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[A]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <10 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -1058,10 +1057,11 @@ define void @saxpy_5(i64 %n, float %a, ptr readonly %x, ptr noalias %y) { ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], 
!llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[N_VEC]], 5 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT]], label %[[LOOP_PREHEADER11]] ; CHECK: [[LOOP_PREHEADER11]]: -; CHECK-NEXT: [[I1_PH:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[I1_PH:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[TMP16]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0 ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[LOOP:.*]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll index a3b8736a06ec7..338d9259b635c 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll @@ -9,7 +9,6 @@ define i64 @std_find_i16_constant_offset_with_assumptions(ptr %first.coerce, i16 ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[FIRST_COERCE]], i64 2) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[FIRST_COERCE]], i64 256) ] -; CHECK-NEXT: [[COERCE_VAL_IP:%.*]] = getelementptr i8, ptr [[FIRST_COERCE]], i64 256 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -27,6 +26,7 @@ define i64 @std_find_i16_constant_offset_with_assumptions(ptr %first.coerce, i16 ; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]] ; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_SPLIT]]: +; CHECK-NEXT: [[COERCE_VAL_IP:%.*]] = 
getelementptr i8, ptr [[FIRST_COERCE]], i64 256 ; CHECK-NEXT: br i1 [[TMP2]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[RETURN:.*]] ; CHECK: [[VECTOR_EARLY_EXIT]]: ; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP0]], i1 true) diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll index 5127b7d37f0b4..7c349fb77be20 100644 --- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll +++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll @@ -18,22 +18,15 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[WHILE_BODY_PREHEADER15:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -8 -; CHECK-NEXT: [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7 -; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END7:%.*]] = getelementptr i8, ptr [[PSRCA:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRCB:%.*]], i32 [[TMP2]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRCA]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRCA:%.*]], i32 [[OFFSET_IDX]] ; CHECK-NEXT: [[OFFSET_IDX13:%.*]] = shl i32 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[OFFSET_IDX13]] +; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[OFFSET_IDX13]] ; CHECK-NEXT: [[OFFSET_IDX15:%.*]] = shl i32 [[INDEX]], 1 
-; CHECK-NEXT: [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[PSRCB]], i32 [[OFFSET_IDX15]] +; CHECK-NEXT: [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[PSRCB:%.*]], i32 [[OFFSET_IDX15]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> ; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i16>, ptr [[NEXT_GEP16]], align 2 @@ -47,6 +40,13 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7 +; CHECK-NEXT: [[TMP13:%.*]] = shl i32 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END7:%.*]] = getelementptr i8, ptr [[PSRCA]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = shl i32 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP12:%.*]] = shl i32 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRCB]], i32 [[TMP12]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[BLOCKSIZE]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER15]] ; CHECK: while.body.preheader15: diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll index dcfebe32302be..6e95b63270e6c 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll @@ -46,7 +46,6 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 124 ; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[BB12_PREHEADER11:%.*]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: -; AVX2-NEXT: 
[[N_VEC_REMAINING:%.*]] = and i64 [[TMP3]], 24 ; AVX2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775776 ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: @@ -80,6 +79,7 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; AVX2-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX2: middle.block: +; AVX2-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP3]], 24 ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; AVX2: vec.epilog.iter.check: @@ -90,8 +90,6 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; AVX2: vec.epilog.ph: ; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; AVX2-NEXT: [[N_VEC10:%.*]] = and i64 [[TMP3]], 9223372036854775800 -; AVX2-NEXT: [[TMP21:%.*]] = shl i64 [[N_VEC10]], 2 -; AVX2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP21]] ; AVX2-NEXT: br label [[BB12:%.*]] ; AVX2: vec.epilog.vector.body: ; AVX2-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[BB12_PREHEADER11]] ], [ [[INDEX_NEXT16:%.*]], [[BB12]] ] @@ -106,6 +104,8 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; AVX2-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC10]] ; AVX2-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[BB12]], !llvm.loop [[LOOP4:![0-9]+]] ; AVX2: vec.epilog.middle.block: +; AVX2-NEXT: [[TMP27:%.*]] = shl i64 [[N_VEC10]], 2 +; AVX2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP27]] ; AVX2-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC10]] ; AVX2-NEXT: br i1 [[CMP_N17]], label [[EXIT]], label [[BB12_PREHEADER1]] ; AVX2: bb12.preheader: diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll 
b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll index bfb8554e6243c..4562072b7b450 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll @@ -16,8 +16,8 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-SAME: ptr writeonly captures(none) [[X:%.*]], ptr readonly captures(none) [[Y:%.*]], double [[A:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N]], 0 -; CHECK-NEXT: br i1 [[CMP1]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_END:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br i1 [[CMP1]], label %[[ITER_CHECK:.*]], label %[[FOR_END:.*]] +; CHECK: [[ITER_CHECK]]: ; CHECK-NEXT: [[X4:%.*]] = ptrtoint ptr [[X]] to i64 ; CHECK-NEXT: [[Y5:%.*]] = ptrtoint ptr [[Y]] to i64 ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 @@ -25,12 +25,11 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[X4]], [[Y5]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128 ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[MIN_ITERS_CHECK]], i1 true, i1 [[DIFF_CHECK]] -; CHECK-NEXT: br i1 [[OR_COND]], label %[[FOR_BODY_PREHEADER9:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br i1 [[OR_COND]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: ; CHECK-NEXT: [[MIN_ITERS_CHECK6:%.*]] = icmp ult i32 [[N]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK6]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH1:.*]] -; CHECK: [[VECTOR_PH1]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 12 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK6]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483632 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> 
poison, double [[A]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer @@ -40,7 +39,7 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = fdiv fast <4 x double> splat (double 1.000000e+00), [[BROADCAST_SPLAT]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw double, ptr [[Y]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 32 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 64 @@ -65,13 +64,14 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 12 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER9]], label %[[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER]], label %[[VEC_EPILOG_PH]], !prof [[PROF10:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, 
%[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_VEC11:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483644 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT14]], <4 x double> poison, <4 x i32> zeroinitializer @@ -86,12 +86,12 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: store <4 x double> [[TMP40]], ptr [[TMP41]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX12]], 4 ; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC11]] -; CHECK-NEXT: br i1 [[TMP42]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP42]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[N_VEC11]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[CMP_N17]], label %[[FOR_END]], label %[[FOR_BODY_PREHEADER9]] -; CHECK: [[FOR_BODY_PREHEADER9]]: -; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: br i1 [[CMP_N17]], label %[[FOR_END]], label %[[FOR_BODY_PREHEADER]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[TMP43:%.*]] = sub nsw i64 [[WIDE_TRIP_COUNT]], [[INDVARS_IV_PH]] ; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP43]], 7 ; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 @@ -110,13 +110,13 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 1 ; CHECK-NEXT: 
[[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1 ; CHECK-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER]] -; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label %[[FOR_BODY_PROL_LOOPEXIT]], label %[[FOR_BODY_PROL]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label %[[FOR_BODY_PROL_LOOPEXIT]], label %[[FOR_BODY_PROL]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[FOR_BODY_PROL_LOOPEXIT]]: -; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER9]] ], [ [[INDVARS_IV_NEXT_PROL]], %[[FOR_BODY_PROL]] ] +; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_PROL]], %[[FOR_BODY_PROL]] ] ; CHECK-NEXT: [[TMP20:%.*]] = sub nsw i64 [[INDVARS_IV_PH]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: [[TMP21:%.*]] = icmp ugt i64 [[TMP20]], -8 -; CHECK-NEXT: br i1 [[TMP21]], label %[[FOR_END]], label %[[FOR_BODY_PREHEADER9_NEW:.*]] -; CHECK: [[FOR_BODY_PREHEADER9_NEW]]: +; CHECK-NEXT: br i1 [[TMP21]], label %[[FOR_END]], label %[[FOR_BODY_PREHEADER_NEW:.*]] +; CHECK: [[FOR_BODY_PREHEADER_NEW]]: ; CHECK-NEXT: [[TMP22:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP23:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP24:%.*]] = fdiv fast double 1.000000e+00, [[A]] @@ -127,7 +127,7 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: [[TMP29:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], %[[FOR_BODY_PREHEADER9_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw double, ptr [[Y]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa 
[[DOUBLE_TBAA3]] ; CHECK-NEXT: [[TMP30:%.*]] = fmul fast double [[T0]], [[TMP22]] @@ -177,7 +177,7 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: store double [[TMP37]], ptr [[ARRAYIDX2_7]], align 8, !tbaa [[DOUBLE_TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[EXITCOND_NOT_7:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_7]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT_7]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_7]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret void ; @@ -232,8 +232,9 @@ attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"=" ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]], [[META9:![0-9]+]]} ; CHECK: [[META8]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META9]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META8]], [[META9]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META12:![0-9]+]]} -; CHECK: [[META12]] = !{!"llvm.loop.unroll.disable"} -; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META8]]} +; CHECK: [[PROF10]] = !{!"branch_weights", i32 4, i32 12} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META8]], [[META9]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]]} +; CHECK: [[META13]] = !{!"llvm.loop.unroll.disable"} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META8]]} ;. From 4e868f602de33194129911be0016ad230a34b6c2 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 30 Oct 2025 14:00:55 +0000 Subject: [PATCH 196/539] Reapply "[lit] Support more ulimit options" This reverts commit 57722ddce172f569f04a50b76ccb2fc524adf8f5. This caused some MacOS test failures due to resource there having issues with RLIMIT_STACK. The underlying syscall fails with EINVAL despite the values being correct. 
For now, move this to the non Darwin test. --- llvm/utils/lit/lit/TestRunner.py | 4 ++++ llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py | 4 ++++ .../lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_okay.txt | 1 + llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py | 2 ++ llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt | 2 ++ llvm/utils/lit/tests/shtest-ulimit-nondarwin.py | 2 ++ llvm/utils/lit/tests/shtest-ulimit.py | 2 ++ 7 files changed, 17 insertions(+) diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 9fba96a1471a0..4a9b3c618e4f3 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -612,6 +612,10 @@ def executeBuiltinUlimit(cmd, shenv): shenv.ulimit["RLIMIT_AS"] = new_limit * 1024 elif cmd.args[1] == "-n": shenv.ulimit["RLIMIT_NOFILE"] = new_limit + elif cmd.args[1] == "-s": + shenv.ulimit["RLIMIT_STACK"] = new_limit * 1024 + elif cmd.args[1] == "-f": + shenv.ulimit["RLIMIT_FSIZE"] = new_limit else: raise InternalShellError( cmd, "'ulimit' does not support option: %s" % cmd.args[1] diff --git a/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py b/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py index 33d2d59ff0dbe..a9dc2595497e7 100644 --- a/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py +++ b/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py @@ -17,6 +17,10 @@ def main(argv): resource.setrlimit(resource.RLIMIT_AS, limit) elif limit_str == "RLIMIT_NOFILE": resource.setrlimit(resource.RLIMIT_NOFILE, limit) + elif limit_str == "RLIMIT_STACK": + resource.setrlimit(resource.RLIMIT_STACK, limit) + elif limit_str == "RLIMIT_FSIZE": + resource.setrlimit(resource.RLIMIT_FSIZE, limit) process_output = subprocess.run(command_args) sys.exit(process_output.returncode) diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_okay.txt b/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_okay.txt index 
dbdd0037e70a7..a5fac7b1d126d 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_okay.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_okay.txt @@ -1,4 +1,5 @@ # RUN: ulimit -v 1048576 +# RUN: ulimit -s 256 # RUN: %{python} %S/../shtest-ulimit/print_limits.py # Fail the test so that we can assert on the output. # RUN: not echo return diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py b/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py index 632f954fa8fde..c732c0429e661 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py +++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py @@ -2,3 +2,5 @@ print("RLIMIT_AS=" + str(resource.getrlimit(resource.RLIMIT_AS)[0])) print("RLIMIT_NOFILE=" + str(resource.getrlimit(resource.RLIMIT_NOFILE)[0])) +print("RLIMIT_STACK=" + str(resource.getrlimit(resource.RLIMIT_STACK)[0])) +print("RLIMIT_FSIZE=" + str(resource.getrlimit(resource.RLIMIT_FSIZE)[0])) diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt index 4edf1c303a092..d38dc44fa033d 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt @@ -1,4 +1,6 @@ # RUN: ulimit -n 50 +# RUN: ulimit -s 256 +# RUN: ulimit -f 5 # RUN: %{python} %S/print_limits.py # Fail the test so that we can assert on the output. 
# RUN: not echo return diff --git a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py index 2d96feae5b58e..022e8b5f41892 100644 --- a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py +++ b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py @@ -10,4 +10,6 @@ # CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_okay.txt ({{[^)]*}}) # CHECK: ulimit -v 1048576 +# CHECK: ulimit -s 256 # CHECK: RLIMIT_AS=1073741824 +# CHECK: RLIMIT_STACK=262144 diff --git a/llvm/utils/lit/tests/shtest-ulimit.py b/llvm/utils/lit/tests/shtest-ulimit.py index 09cd475b737c1..21e5a5e2491d1 100644 --- a/llvm/utils/lit/tests/shtest-ulimit.py +++ b/llvm/utils/lit/tests/shtest-ulimit.py @@ -19,7 +19,9 @@ # CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_okay.txt ({{[^)]*}}) # CHECK: ulimit -n 50 +# CHECK: ulimit -f 5 # CHECK: RLIMIT_NOFILE=50 +# CHECK: RLIMIT_FSIZE=5 # CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_reset.txt ({{[^)]*}}) # CHECK: RLIMIT_NOFILE=[[BASE_NOFILE_LIMIT]] From da45069cb0eff7c1245237fa072adfcc5276b8c9 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 30 Oct 2025 14:49:22 +0000 Subject: [PATCH 197/539] [lit] Remove setting stack size in ulimit_okay.txt This was supposed to be in 6ccd1e8626f331f2ec2b172c3e7e8fffee66ac95 but got left out because I forgot to save the file inside of VSCode. This was causing test failures on MacOS due to the previously mentioned failures setting ulimit that caused the patch to be reverted in the first place. 
https://lab.llvm.org/buildbot/#/builders/190/builds/29990 --- llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt index d38dc44fa033d..b1f2396b35d69 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt @@ -1,5 +1,4 @@ # RUN: ulimit -n 50 -# RUN: ulimit -s 256 # RUN: ulimit -f 5 # RUN: %{python} %S/print_limits.py # Fail the test so that we can assert on the output. From 8014bbb018ec6286754736a7d91e85e62308bdc0 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Thu, 30 Oct 2025 10:57:27 -0400 Subject: [PATCH 198/539] [LoopUnroll][NFCI] Clean up remainder followup metadata handling (#165272) Followup metadata for remainder loops is handled by two implementations, both added by 7244852557ca6: 1. `tryToUnrollLoop` in `LoopUnrollPass.cpp`. 2. `CloneLoopBlocks` in `LoopUnrollRuntime.cpp`. As far as I can tell, 2 is useless: I added `assert(!NewLoopID)` for the `NewLoopID` returned by the `makeFollowupLoopID` call, and it never fails throughout check-all for my build. Moreover, if 2 were useful, it appears it would have a bug caused by 7cd826a321d9. That commit skips adding loop metadata to a new remainder loop if the remainder loop itself is to be completely unrolled because it will then no longer be a loop. However, that commit incorrectly assumes that `UnrollRemainder` dictates complete unrolling of a remainder loop, and thus it skips adding loop metadata even if the remainder loop will be only partially unrolled. To avoid further confusion here, this patch removes 2. check-all continues to pass for my build. If 2 actually is useful, please advise so we can create a test that covers that usage. 
Near 2, this patch retains the `UnrollRemainder` guard on the `setLoopAlreadyUnrolled` call, which adds `llvm.loop.unroll.disable` to the remainder loop. That behavior exists both before and after 7cd826a321d9. The logic appears to be that remainder loop unrolling (whether complete or partial) is opt-in. That is, unless `UnrollRemainder` is true, `UnrollRuntimeLoopRemainder` skips running remainder loop unrolling, and `llvm.loop.unroll.disable` suppresses any later attempt at it. This patch also extends testing of remainder loop followup metadata to be sure remainder loop partial unrolling is handled correctly by 1. --- .../Transforms/Utils/LoopUnrollRuntime.cpp | 19 ++-------- llvm/test/Transforms/LoopUnroll/followup.ll | 35 +++++++++++++------ 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 6312831cf0ee0..7a2b8da6ffd21 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -460,25 +460,10 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, Loop *NewLoop = NewLoops[L]; assert(NewLoop && "L should have been cloned"); - MDNode *LoopID = NewLoop->getLoopID(); - - // Only add loop metadata if the loop is not going to be completely - // unrolled. - if (UnrollRemainder) - return NewLoop; - - std::optional NewLoopID = makeFollowupLoopID( - LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder}); - if (NewLoopID) { - NewLoop->setLoopID(*NewLoopID); - - // Do not setLoopAlreadyUnrolled if loop attributes have been defined - // explicitly. - return NewLoop; - } // Add unroll disable metadata to disable future unrolling for this loop. 
- NewLoop->setLoopAlreadyUnrolled(); + if (!UnrollRemainder) + NewLoop->setLoopAlreadyUnrolled(); return NewLoop; } diff --git a/llvm/test/Transforms/LoopUnroll/followup.ll b/llvm/test/Transforms/LoopUnroll/followup.ll index 051e43d52b3be..9dda76e70efac 100644 --- a/llvm/test/Transforms/LoopUnroll/followup.ll +++ b/llvm/test/Transforms/LoopUnroll/followup.ll @@ -1,9 +1,20 @@ -; RUN: opt < %s -S -passes=loop-unroll -unroll-count=2 | FileCheck %s -check-prefixes=COUNT,COMMON -; RUN: opt < %s -S -passes=loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true | FileCheck %s -check-prefixes=EPILOG,COMMON -; RUN: opt < %s -S -passes=loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=PROLOG,COMMON -; -; Check that followup-attributes are applied after LoopUnroll. +; Check that followup attributes are applied after LoopUnroll. ; +; We choose -unroll-count=3 because it produces partial unrolling of remainder +; loops. Complete unrolling would leave no remainder loop to which to copy +; followup attributes. 
+ +; DEFINE: %{unroll} = opt < %s -S -passes=loop-unroll -unroll-count=3 +; DEFINE: %{epilog} = %{unroll} -unroll-runtime -unroll-runtime-epilog=true +; DEFINE: %{prolog} = %{unroll} -unroll-runtime -unroll-runtime-epilog=false +; DEFINE: %{fc} = FileCheck %s -check-prefixes + +; RUN: %{unroll} | %{fc} COMMON,COUNT +; RUN: %{epilog} | %{fc} COMMON,EPILOG,EPILOG-NO-UNROLL +; RUN: %{prolog} | %{fc} COMMON,PROLOG,PROLOG-NO-UNROLL +; RUN: %{epilog} -unroll-remainder | %{fc} COMMON,EPILOG,EPILOG-UNROLL +; RUN: %{prolog} -unroll-remainder | %{fc} COMMON,PROLOG,PROLOG-UNROLL + target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" define i32 @test(ptr nocapture %a, i32 %n) nounwind uwtable readonly { @@ -36,15 +47,17 @@ for.end: ; preds = %for.body, %entry ; COMMON-LABEL: @test( -; COUNT: br i1 %exitcond.1, label %for.end.loopexit, label %for.body, !llvm.loop ![[LOOP:[0-9]+]] +; COUNT: br i1 %exitcond.2, label %for.end.loopexit, label %for.body, !llvm.loop ![[LOOP:[0-9]+]] ; COUNT: ![[FOLLOWUP_ALL:[0-9]+]] = !{!"FollowupAll"} ; COUNT: ![[FOLLOWUP_UNROLLED:[0-9]+]] = !{!"FollowupUnrolled"} ; COUNT: ![[LOOP]] = distinct !{![[LOOP]], ![[FOLLOWUP_ALL]], ![[FOLLOWUP_UNROLLED]]} -; EPILOG: br i1 %niter.ncmp.7, label %for.end.loopexit.unr-lcssa, label %for.body, !llvm.loop ![[LOOP_0:[0-9]+]] -; EPILOG: br i1 %epil.iter.cmp, label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !llvm.loop ![[LOOP_2:[0-9]+]] +; EPILOG: br i1 %niter.ncmp.2, label %for.end.loopexit.unr-lcssa, label %for.body, !llvm.loop ![[LOOP_0:[0-9]+]] +; EPILOG-NO-UNROLL: br i1 %epil.iter.cmp, label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !llvm.loop ![[LOOP_2:[0-9]+]] +; EPILOG-UNROLL: br i1 %epil.iter.cmp, label %for.body.epil.1, label %for.end.loopexit.epilog-lcssa +; EPILOG-UNROLL: br i1 %epil.iter.cmp.1, label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !llvm.loop 
![[LOOP_2:[0-9]+]] ; EPILOG: ![[LOOP_0]] = distinct !{![[LOOP_0]], ![[FOLLOWUP_ALL:[0-9]+]], ![[FOLLOWUP_UNROLLED:[0-9]+]]} ; EPILOG: ![[FOLLOWUP_ALL]] = !{!"FollowupAll"} @@ -53,8 +66,10 @@ for.end: ; preds = %for.body, %entry ; EPILOG: ![[FOLLOWUP_REMAINDER]] = !{!"FollowupRemainder"} -; PROLOG: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit.unr-lcssa, !llvm.loop ![[LOOP_0:[0-9]+]] -; PROLOG: br i1 %exitcond.7, label %for.end.loopexit.unr-lcssa, label %for.body, !llvm.loop ![[LOOP_2:[0-9]+]] +; PROLOG-UNROLL: br i1 %prol.iter.cmp, label %for.body.prol.1, label %for.body.prol.loopexit.unr-lcssa +; PROLOG-UNROLL: br i1 %prol.iter.cmp.1, label %for.body.prol, label %for.body.prol.loopexit.unr-lcssa, !llvm.loop ![[LOOP_0:[0-9]+]] +; PROLOG-NO-UNROLL: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit.unr-lcssa, !llvm.loop ![[LOOP_0:[0-9]+]] +; PROLOG: br i1 %exitcond.2, label %for.end.loopexit.unr-lcssa, label %for.body, !llvm.loop ![[LOOP_2:[0-9]+]] ; PROLOG: ![[LOOP_0]] = distinct !{![[LOOP_0]], ![[FOLLOWUP_ALL:[0-9]+]], ![[FOLLOWUP_REMAINDER:[0-9]+]]} ; PROLOG: ![[FOLLOWUP_ALL]] = !{!"FollowupAll"} From f296b4e198bc4174b0445991a5dfc65b65bfd740 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 30 Oct 2025 16:01:46 +0100 Subject: [PATCH 199/539] [bazel] Add missing dependency for 9d5c35408e7a38b3062667bbebb3c0953fa2fae4 --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 7156bea81d6b5..101dfb7cf68ae 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -7900,6 +7900,7 @@ cc_library( deps = [ ":AffineDialect", ":AffineToStandard", + ":Analysis", ":ArithDialect", ":ComplexDialect", ":ConversionPassIncGen", From a620b9c940e14f280148e071401e5dea9d19b6f8 Mon Sep 17 00:00:00 2001 From: 
Michael Buch Date: Thu, 30 Oct 2025 15:07:37 +0000 Subject: [PATCH 200/539] [lldb][test] Fix libc++ API tests on older Clang versions Both of these fail on our Clang-19 macOS bots. --- .../optional/TestDataFormatterLibcxxOptionalSimulator.py | 2 ++ .../TestLibcxxInternalsRecognizer.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py index 3fefe87dcad97..7463f8897901f 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/optional/TestDataFormatterLibcxxOptionalSimulator.py @@ -53,6 +53,8 @@ def _run_test(self, defines): # causing this test to fail. This was reverted in newer version of clang # with commit 52a9ba7ca. 
@skipIf(compiler="clang", compiler_version=["=", "17"]) + @skipIf(compiler="clang", compiler_version=["=", "18"]) + @skipIf(compiler="clang", compiler_version=["=", "19"]) @functools.wraps(LibcxxOptionalDataFormatterSimulatorTestCase._run_test) def test_method(self, defines=defines): LibcxxOptionalDataFormatterSimulatorTestCase._run_test(self, defines) diff --git a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py index d8a729b322fe4..2f942da604ff2 100644 --- a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py +++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py @@ -9,7 +9,7 @@ class LibCxxInternalsRecognizerTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True @add_test_categories(["libc++"]) - @skipIf(compiler="clang", compiler_version=["<", "19.0"]) + @skipIf(compiler="clang", compiler_version=["<=", "19.0"]) def test_frame_recognizer(self): """Test that implementation details of libc++ are hidden""" self.build() From 09a0a218f0ec492063a738592f0a6f5356322b62 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Thu, 30 Oct 2025 10:10:45 -0500 Subject: [PATCH 201/539] [flang] One more fix for dumping evaluate::Expr (#165730) Clang doesn't have "std::string_view" in the type list. --- flang/include/flang/Semantics/dump-expr.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flang/include/flang/Semantics/dump-expr.h b/flang/include/flang/Semantics/dump-expr.h index 2dbd4cb60be59..5a78e13b19e5d 100644 --- a/flang/include/flang/Semantics/dump-expr.h +++ b/flang/include/flang/Semantics/dump-expr.h @@ -48,10 +48,11 @@ class DumpEvaluateExpr { // "... 
[with T = xyz; std::string_view = ...]" #ifdef __clang__ std::string_view front("[T = "); + std::string_view back("]"); #else std::string_view front("[with T = "); -#endif std::string_view back("; std::string_view ="); +#endif #elif defined(_MSC_VER) #define DUMP_EXPR_SHOW_TYPE From 62ceba2443bc9753a05fa887c5df8acd074598b8 Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Thu, 30 Oct 2025 15:14:56 +0000 Subject: [PATCH 202/539] [AMDGPU][MC][NFC] Use the lit substitution to extract instruction codes in tests. (#165450) Instead of invoking sed directly. Partially reverts https://github.com/llvm/llvm-project/pull/119778 . --- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index d85ea799ed3d7..399a6441629ca 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding -comment-column=0 %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | sed -n 's#.*\(\[0x[0-9a-fx,]\{1,\}\]\)#\1#p' | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding -comment-column=0 | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding -comment-column=0 | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding 
-comment-column=0 %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | sed -n 's#.*\(\[0x[0-9a-fx,]\{1,\}\]\)#\1#p' | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding -comment-column=0 | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding -comment-column=0 | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s v_bfrev_b32_e32 v5, v1 // GFX12: v_bfrev_b32_e32 v5, v1 ; encoding: [0x01,0x71,0x0a,0x7e] From ddacbfcff7972be9ce24245213de90fe6ab3a4e1 Mon Sep 17 00:00:00 2001 From: vangthao95 Date: Thu, 30 Oct 2025 08:19:12 -0700 Subject: [PATCH 203/539] [AMDGPU][GlobalISel] Fix issue with copy_scc_vcc on gfx7 (#165355) When selecting for G_AMDGPU_COPY_SCC_VCC, we use S_CMP_LG_U64 or S_CMP_LG_U32 for wave64 and wave32 respectively. However, on gfx7 we do not have the S_CMP_LG_U64 instruction. Work around this issue by using S_OR_B64 instead. 
--- .../AMDGPU/AMDGPUInstructionSelector.cpp | 20 ++++-- .../GlobalISel/inst-select-copy-scc-vcc.ll | 66 +++++++++++++++++++ .../GlobalISel/inst-select-copy-scc-vcc.mir | 37 +++++++++++ 3 files changed, 118 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 9ce12243016f4..aed325cf627bc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -221,12 +221,22 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const { const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock *BB = I.getParent(); + Register VCCReg = I.getOperand(1).getReg(); + MachineInstr *Cmp; + + if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + unsigned CmpOpc = + STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; + Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0); + } else { + // For gfx7 and earlier, S_CMP_LG_U64 doesn't exist, so we use S_OR_B64 + // which sets SCC as a side effect. + Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst) + .addReg(VCCReg) + .addReg(VCCReg); + } - unsigned CmpOpc = - STI.isWave64() ? 
AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; - MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)) - .addReg(I.getOperand(1).getReg()) - .addImm(0); if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI)) return false; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll new file mode 100644 index 0000000000000..1a7ccf0835686 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s + +define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr addrspace(1) %out) { +; GFX7-LABEL: fcmp_uniform_select: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x9 +; GFX7-NEXT: s_load_dword s3, s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_f32_e64 s[4:5], s6, 0 +; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[4:5] +; GFX7-NEXT: s_cselect_b32 s4, 1, 0 +; GFX7-NEXT: s_and_b32 s4, s4, 1 +; GFX7-NEXT: s_cmp_lg_u32 s4, 0 +; GFX7-NEXT: s_cselect_b32 s3, s7, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: fcmp_uniform_select: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 +; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_f32_e64 s[4:5], s0, 0 +; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8-NEXT: s_and_b32 s0, s0, 1 +; GFX8-NEXT: s_cmp_lg_u32 s0, 0 +; GFX8-NEXT: s_cselect_b32 s0, s1, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_uniform_select: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_f32_e64 s0, s0, 0 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cselect_b32 s0, 1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, s0, 1 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cselect_b32 s0, s1, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_endpgm + %cmp = fcmp oeq float %a, 0.0 + %sel = select i1 %cmp, i32 %b, i32 %c + store i32 %sel, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir new file mode 100644 index 0000000000000..67cc0169af619 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir @@ -0,0 +1,37 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX7 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GF8 %s +# RUN: llc -mtriple=amdgcn 
-mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11 %s + +--- +name: test_copy_scc_vcc +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + ; GFX7-LABEL: name: test_copy_scc_vcc + ; GFX7: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GFX7-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[DEF]], [[DEF]], implicit-def $scc + ; GFX7-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $scc + ; GFX7-NEXT: $sgpr0 = COPY [[COPY]] + ; GFX7-NEXT: S_ENDPGM 0, implicit $sgpr0 + ; + ; GF8-LABEL: name: test_copy_scc_vcc + ; GF8: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GF8-NEXT: S_CMP_LG_U64 [[DEF]], 0, implicit-def $scc + ; GF8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $scc + ; GF8-NEXT: $sgpr0 = COPY [[COPY]] + ; GF8-NEXT: S_ENDPGM 0, implicit $sgpr0 + ; + ; GFX11-LABEL: name: test_copy_scc_vcc + ; GFX11: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX11-NEXT: S_CMP_LG_U32 [[DEF]], 0, implicit-def $scc + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $scc + ; GFX11-NEXT: $sgpr0 = COPY [[COPY]] + ; GFX11-NEXT: S_ENDPGM 0, implicit $sgpr0 + %0:vcc(s1) = G_IMPLICIT_DEF + %1:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC %0 + $sgpr0 = COPY %1 + S_ENDPGM 0, implicit $sgpr0 +... From 6352f970ac30f6488db70cf40ca78078c86a8bab Mon Sep 17 00:00:00 2001 From: srcarroll <50210727+srcarroll@users.noreply.github.com> Date: Thu, 30 Oct 2025 10:20:19 -0500 Subject: [PATCH 204/539] [mlir][linalg] Genericize MapOp (#162742) This PR modifies the definition of `linalg::MapOp` so that it has the same structure of `linalg::GenericOp` and all other linalg ops. Mainly, it adds an `out` bbarg for the body of the op. Although the `out` arg is never used in the body, there doesn't seem to be much benefit in specializing the op to exclude it. In fact it only makes things more complicated because it doesn't align with the `GenericOp` structure. 
For example, `linalg-generalize-named-ops` avoided converting `linalg.map` purely because it didn't have the structure to do so. Moreover, although some fusion patterns are applied explicitly to `GenericOp`, we can change them to be applied to the base `LinalgOp` which will enable fusion for any fusion-compatible linalg op, but that requires the op having a generic structure. So these changes will enable us to use existing generic transformation patterns on `MapOp` that weren't possible before. They can either be applied to `MapOp` directly or applied after converting to `GenericOp`. --- .../Dialect/Linalg/IR/LinalgStructuredOps.td | 4 -- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 37 ++++++++++++------- .../Linalg/Transforms/Generalization.cpp | 6 +-- .../BufferizableOpInterfaceImpl.cpp | 2 + mlir/test/Dialect/Linalg/canonicalize.mlir | 2 +- .../Dialect/Linalg/generalize-named-ops.mlir | 22 +++++++---- mlir/test/Dialect/Linalg/invalid.mlir | 10 ++--- .../Dialect/Linalg/one-shot-bufferize.mlir | 2 +- mlir/test/Dialect/Linalg/roundtrip.mlir | 18 ++++----- .../linalg-ops-with-patterns.mlir | 2 +- mlir/test/Dialect/Tensor/bufferize.mlir | 2 +- .../lower-to-loops-using-interface.mlir | 6 +-- 12 files changed, 63 insertions(+), 50 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index f3674c3eecfe6..ecd036d452b27 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -293,10 +293,6 @@ def MapOp : LinalgStructuredBase_Op<"map", [ // Implement functions necessary for DestinationStyleOpInterface. 
MutableOperandRange getDpsInitsMutable() { return getInitMutable(); } - SmallVector getOpOperandsMatchingBBargs() { - return getDpsInputOperands(); - } - bool payloadUsesValueFromOperand(OpOperand * opOperand) { if (isDpsInit(opOperand)) return false; return !getMatchingBlockArgument(opOperand).use_empty(); diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index cbc565b0c8cbd..3dc45edf4a23f 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -1474,6 +1474,8 @@ void MapOp::getAsmBlockArgumentNames(Region ®ion, OpAsmSetValueNameFn setNameFn) { for (Value v : getRegionInputArgs()) setNameFn(v, "in"); + for (Value v : getRegionOutputArgs()) + setNameFn(v, "init"); } void MapOp::getAsmResultNames(function_ref setNameFn) { @@ -1495,14 +1497,14 @@ void MapOp::build( if (bodyBuild) buildGenericRegion(builder, result.location, *result.regions.front(), - inputs, /*outputs=*/{}, bodyBuild); + inputs, /*outputs=*/{init}, bodyBuild); } static void addBodyWithPayloadOp(OpAsmParser &parser, OperationState &result, const OperationName &payloadOpName, const NamedAttrList &payloadOpAttrs, ArrayRef operands, - bool initFirst = false) { + bool initFirst = false, bool mapInit = true) { OpBuilder b(parser.getContext()); Region *body = result.addRegion(); Block &block = body->emplaceBlock(); @@ -1516,12 +1518,13 @@ static void addBodyWithPayloadOp(OpAsmParser &parser, OperationState &result, // If initFirst flag is enabled, we consider init as the first position of // payload operands. 
if (initFirst) { - payloadOpOperands.push_back(block.getArguments().back()); + if (mapInit) + payloadOpOperands.push_back(block.getArguments().back()); for (const auto &arg : block.getArguments().drop_back()) payloadOpOperands.push_back(arg); } else { payloadOpOperands = {block.getArguments().begin(), - block.getArguments().end()}; + block.getArguments().end() - int(!mapInit)}; } Operation *payloadOp = b.create( @@ -1553,8 +1556,8 @@ ParseResult MapOp::parse(OpAsmParser &parser, OperationState &result) { if (payloadOpName.has_value()) { if (!result.operands.empty()) addBodyWithPayloadOp(parser, result, payloadOpName.value(), - payloadOpAttrs, - ArrayRef(result.operands).drop_back()); + payloadOpAttrs, ArrayRef(result.operands), false, + false); else result.addRegion(); } else { @@ -1570,7 +1573,11 @@ ParseResult MapOp::parse(OpAsmParser &parser, OperationState &result) { return success(); } -static bool canUseShortForm(Block *body, bool initFirst = false) { +static bool canUseShortForm(Block *body, bool initFirst = false, + bool mapInit = true) { + // `intFirst == true` implies that we want to map init arg + if (initFirst && !mapInit) + return false; // Check if the body can be printed in short form. The following 4 conditions // must be satisfied: @@ -1582,7 +1589,7 @@ static bool canUseShortForm(Block *body, bool initFirst = false) { // 2) The payload op must have the same number of operands as the number of // block arguments. 
if (payload.getNumOperands() == 0 || - payload.getNumOperands() != body->getNumArguments()) + payload.getNumOperands() != body->getNumArguments() - int(!mapInit)) return false; // 3) If `initFirst` is true (e.g., for reduction ops), the init block @@ -1600,7 +1607,8 @@ static bool canUseShortForm(Block *body, bool initFirst = false) { } } else { for (const auto &[operand, bbArg] : - llvm::zip(payload.getOperands(), body->getArguments())) { + llvm::zip(payload.getOperands(), + body->getArguments().drop_back(int(!mapInit)))) { if (bbArg != operand) return false; } @@ -1632,7 +1640,8 @@ static void printShortForm(OpAsmPrinter &p, Operation *payloadOp) { void MapOp::print(OpAsmPrinter &p) { Block *mapper = getBody(); - bool useShortForm = canUseShortForm(mapper); + bool useShortForm = + canUseShortForm(mapper, /*initFirst=*/false, /*mapInit*/ false); if (useShortForm) { printShortForm(p, &mapper->getOperations().front()); } @@ -1658,11 +1667,13 @@ LogicalResult MapOp::verify() { auto *bodyBlock = getBody(); auto blockArgs = bodyBlock->getArguments(); - // Checks if the number of `inputs` match the arity of the `mapper` region. - if (getInputs().size() != blockArgs.size()) + // Checks if the number of `inputs` + `init` match the arity of the `mapper` + // region. + if (getInputs().size() + 1 != blockArgs.size()) return emitOpError() << "expects number of operands to match the arity of " "mapper, but got: " - << getInputs().size() << " and " << blockArgs.size(); + << getInputs().size() + 1 << " and " + << blockArgs.size(); // The parameters of mapper should all match the element type of inputs. 
for (const auto &[bbArgType, inputArg] : diff --git a/mlir/lib/Dialect/Linalg/Transforms/Generalization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Generalization.cpp index 3e31393fd51ed..75bb1757a55f5 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Generalization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Generalization.cpp @@ -31,10 +31,8 @@ using namespace mlir; using namespace mlir::linalg; static LogicalResult generalizeNamedOpPrecondition(LinalgOp linalgOp) { - // Bailout if `linalgOp` is already a generic or a linalg.map. We cannot - // trivially generalize a `linalg.map`, as it does not use the output as - // region arguments in the block. - if (isa(linalgOp) || isa(linalgOp)) + // Bailout if `linalgOp` is already a generic. + if (isa(linalgOp)) return failure(); // Check if the operation has exactly one region. if (linalgOp->getNumRegions() != 1) { diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp index bce964e47a3be..c607ece418dff 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp @@ -579,6 +579,7 @@ static Value lowerGenerateLikeOpBody(RewriterBase &rewriter, Location loc, linalg::MapOp::create(rewriter, loc, tensorType, /*inputs=*/ValueRange(), /*init=*/tensorDestination); Block &linalgBody = linalgOp.getMapper().emplaceBlock(); + linalgBody.addArgument(tensorType.getElementType(), loc); // Create linalg::IndexOps. rewriter.setInsertionPointToStart(&linalgBody); @@ -1068,6 +1069,7 @@ struct SplatOpInterface /*inputs=*/ValueRange(), /*init=*/*tensorAlloc); Block &linalgBody = linalgOp.getMapper().emplaceBlock(); + linalgBody.addArgument(tensorType.getElementType(), loc); // Create linalg::IndexOps. 
rewriter.setInsertionPointToStart(&linalgBody); diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir index 26d2d98572f47..f4020ede4854e 100644 --- a/mlir/test/Dialect/Linalg/canonicalize.mlir +++ b/mlir/test/Dialect/Linalg/canonicalize.mlir @@ -1423,7 +1423,7 @@ func.func @transpose_buffer(%input: memref, func.func @recursive_effect(%arg : tensor<1xf32>) { %init = arith.constant dense<0.0> : tensor<1xf32> %mapped = linalg.map ins(%arg:tensor<1xf32>) outs(%init :tensor<1xf32>) - (%in : f32) { + (%in : f32, %out: f32) { vector.print %in : f32 linalg.yield %in : f32 } diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir index ae07b1b82228c..dcdd6c8db4b21 100644 --- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir +++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir @@ -386,18 +386,24 @@ func.func @generalize_batch_reduce_gemm_bf16(%lhs: memref<7x8x9xbf16>, %rhs: mem // ----- -// CHECK-LABEL: generalize_linalg_map -func.func @generalize_linalg_map(%arg0: memref<1x8x8x8xf32>) { +func.func @generalize_linalg_map(%arg0: memref<1x8x8x8xf32>, %arg1: memref<1x8x8x8xf32>, %arg2: memref<1x8x8x8xf32>) { %cst = arith.constant 0.000000e+00 : f32 - // CHECK: linalg.map - // CHECK-NOT: linalg.generic - linalg.map outs(%arg0 : memref<1x8x8x8xf32>) - () { - linalg.yield %cst : f32 - } + linalg.map {arith.addf} ins(%arg0, %arg1: memref<1x8x8x8xf32>, memref<1x8x8x8xf32>) outs(%arg2 : memref<1x8x8x8xf32>) return } +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +// CHECK: @generalize_linalg_map + +// CHECK: linalg.generic +// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP0]], #[[MAP0]]] +// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel"]} +// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<1x8x8x8xf32>, memref<1x8x8x8xf32>) outs(%{{.+}} : memref<1x8x8x8xf32> +// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: 
f32, %[[BBARG2:.+]]: f32) +// CHECK: %[[ADD:.+]] = arith.addf %[[BBARG0]], %[[BBARG1]] : f32 +// CHECK: linalg.yield %[[ADD]] : f32 + // ----- func.func @generalize_add(%lhs: memref<7x14x21xf32>, %rhs: memref<7x14x21xf32>, diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir index 40bf4d19d6b91..fabc8e610612d 100644 --- a/mlir/test/Dialect/Linalg/invalid.mlir +++ b/mlir/test/Dialect/Linalg/invalid.mlir @@ -681,7 +681,7 @@ func.func @map_binary_wrong_yield_operands( %add = linalg.map ins(%lhs, %rhs : tensor<64xf32>, tensor<64xf32>) outs(%init:tensor<64xf32>) - (%lhs_elem: f32, %rhs_elem: f32) { + (%lhs_elem: f32, %rhs_elem: f32, %out: f32) { %0 = arith.addf %lhs_elem, %rhs_elem: f32 // expected-error @+1{{'linalg.yield' op expected number of yield values (2) to match the number of inits / outs operands of the enclosing LinalgOp (1)}} linalg.yield %0, %0: f32, f32 @@ -694,11 +694,11 @@ func.func @map_binary_wrong_yield_operands( func.func @map_input_mapper_arity_mismatch( %lhs: tensor<64xf32>, %rhs: tensor<64xf32>, %init: tensor<64xf32>) -> tensor<64xf32> { - // expected-error@+1{{'linalg.map' op expects number of operands to match the arity of mapper, but got: 2 and 3}} + // expected-error@+1{{'linalg.map' op expects number of operands to match the arity of mapper, but got: 3 and 4}} %add = linalg.map ins(%lhs, %rhs : tensor<64xf32>, tensor<64xf32>) outs(%init:tensor<64xf32>) - (%lhs_elem: f32, %rhs_elem: f32, %extra_elem: f32) { + (%lhs_elem: f32, %rhs_elem: f32, %out: f32, %extra_elem: f32) { %0 = arith.addf %lhs_elem, %rhs_elem: f32 linalg.yield %0: f32 } @@ -714,7 +714,7 @@ func.func @map_input_mapper_type_mismatch( %add = linalg.map ins(%lhs, %rhs : tensor<64xf32>, tensor<64xf32>) outs(%init:tensor<64xf32>) - (%lhs_elem: f64, %rhs_elem: f64) { + (%lhs_elem: f64, %rhs_elem: f64, %out: f32) { %0 = arith.addf %lhs_elem, %rhs_elem: f64 linalg.yield %0: f64 } @@ -730,7 +730,7 @@ func.func @map_input_output_shape_mismatch( %add = 
linalg.map ins(%lhs, %rhs : tensor<64x64xf32>, tensor<64x64xf32>) outs(%init:tensor<32xf32>) - (%lhs_elem: f32, %rhs_elem: f32) { + (%lhs_elem: f32, %rhs_elem: f32, %out: f32) { %0 = arith.addf %lhs_elem, %rhs_elem: f32 linalg.yield %0: f32 } diff --git a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir index 1df15e85bac17..85cc1ffc2029e 100644 --- a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir +++ b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir @@ -339,7 +339,7 @@ func.func @map_binary(%lhs: tensor<64xf32>, %rhs: tensor<64xf32>, %add = linalg.map ins(%lhs, %rhs: tensor<64xf32>, tensor<64xf32>) outs(%init:tensor<64xf32>) - (%lhs_elem: f32, %rhs_elem: f32) { + (%lhs_elem: f32, %rhs_elem: f32, %out: f32) { %0 = arith.addf %lhs_elem, %rhs_elem: f32 linalg.yield %0: f32 } diff --git a/mlir/test/Dialect/Linalg/roundtrip.mlir b/mlir/test/Dialect/Linalg/roundtrip.mlir index 563013d4083af..74928920c695a 100644 --- a/mlir/test/Dialect/Linalg/roundtrip.mlir +++ b/mlir/test/Dialect/Linalg/roundtrip.mlir @@ -341,7 +341,7 @@ func.func @mixed_parallel_reduced_results(%arg0 : tensor, func.func @map_no_inputs(%init: tensor<64xf32>) -> tensor<64xf32> { %add = linalg.map outs(%init:tensor<64xf32>) - () { + (%out: f32) { %0 = arith.constant 0.0: f32 linalg.yield %0: f32 } @@ -349,7 +349,7 @@ func.func @map_no_inputs(%init: tensor<64xf32>) -> tensor<64xf32> { } // CHECK-LABEL: func @map_no_inputs // CHECK: linalg.map outs -// CHECK-NEXT: () { +// CHECK-NEXT: (%[[OUT:.*]]: f32) { // CHECK-NEXT: arith.constant // CHECK-NEXT: linalg.yield // CHECK-NEXT: } @@ -361,7 +361,7 @@ func.func @map_binary(%lhs: tensor<64xf32>, %rhs: tensor<64xf32>, %add = linalg.map ins(%lhs, %rhs: tensor<64xf32>, tensor<64xf32>) outs(%init:tensor<64xf32>) - (%lhs_elem: f32, %rhs_elem: f32) { + (%lhs_elem: f32, %rhs_elem: f32, %out: f32) { %0 = arith.addf %lhs_elem, %rhs_elem: f32 linalg.yield %0: f32 } @@ -378,7 +378,7 @@ func.func 
@map_binary_memref(%lhs: memref<64xf32>, %rhs: memref<64xf32>, linalg.map ins(%lhs, %rhs: memref<64xf32>, memref<64xf32>) outs(%init:memref<64xf32>) - (%lhs_elem: f32, %rhs_elem: f32) { + (%lhs_elem: f32, %rhs_elem: f32, %out: f32) { %0 = arith.addf %lhs_elem, %rhs_elem: f32 linalg.yield %0: f32 } @@ -393,7 +393,7 @@ func.func @map_unary(%input: tensor<64xf32>, %init: tensor<64xf32>) -> tensor<64 %abs = linalg.map ins(%input:tensor<64xf32>) outs(%init:tensor<64xf32>) - (%input_elem: f32) { + (%input_elem: f32, %out: f32) { %0 = math.absf %input_elem: f32 linalg.yield %0: f32 } @@ -408,7 +408,7 @@ func.func @map_unary_memref(%input: memref<64xf32>, %init: memref<64xf32>) { linalg.map ins(%input:memref<64xf32>) outs(%init:memref<64xf32>) - (%input_elem: f32) { + (%input_elem: f32, %out: f32) { %0 = math.absf %input_elem: f32 linalg.yield %0: f32 } @@ -604,7 +604,7 @@ func.func @map_arith_with_attr(%lhs: tensor<64xf32>, %rhs: tensor<64xf32>, %add = linalg.map ins(%lhs, %rhs: tensor<64xf32>, tensor<64xf32>) outs(%init:tensor<64xf32>) - (%lhs_elem: f32, %rhs_elem: f32) { + (%lhs_elem: f32, %rhs_elem: f32, %out: f32) { %0 = arith.addf %lhs_elem, %rhs_elem fastmath : f32 linalg.yield %0: f32 } @@ -622,7 +622,7 @@ func.func @map_arith_with_attr(%lhs: tensor<64xf32>, %rhs: tensor<64xf32>, func.func @map_not_short_form_compatible(%lhs: tensor<1x32xf32>, %rhs: tensor<1x32xf32>, %init: tensor<1x32xf32>) -> tensor<1x32xf32> { %mapped = linalg.map ins(%lhs, %rhs : tensor<1x32xf32>, tensor<1x32xf32>) outs(%init : tensor<1x32xf32>) - (%in_1: f32, %in_2: f32) { + (%in_1: f32, %in_2: f32, %out: f32) { %1 = arith.maximumf %in_1, %in_2 : f32 linalg.yield %in_1 : f32 } @@ -634,7 +634,7 @@ func.func @map_not_short_form_compatible(%lhs: tensor<1x32xf32>, %rhs: tensor<1x // CHECK-NOT: linalg.map { arith.maximumf } ins(%[[LHS]] : tensor<1x32xf32> // CHECK: linalg.map ins(%[[LHS]], %[[RHS]] : tensor<1x32xf32>, tensor<1x32xf32>) // CHECK-SAME: outs(%[[INIT]] : tensor<1x32xf32>) -// 
CHECK-NEXT: (%[[IN1:.*]]: f32, %[[IN2:.*]]: f32) { +// CHECK-NEXT: (%[[IN1:.*]]: f32, %[[IN2:.*]]: f32, %[[OUT:.*]]: f32) { // CHECK-NEXT: %[[MAX_RESULT:.*]] = arith.maximumf %[[IN1]], %[[IN2]] : f32 // CHECK-NEXT: linalg.yield %[[IN1]] : f32 // CHECK-NEXT: } diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir index 93a03369be239..aa2c1da4b6274 100644 --- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir +++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir @@ -356,7 +356,7 @@ func.func @vectorize_map(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) { linalg.map ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%arg2 : memref<64xf32>) - (%in: f32, %in_0: f32) { + (%in: f32, %in_0: f32, %out: f32) { %0 = arith.addf %in, %in_0 : f32 linalg.yield %0 : f32 } diff --git a/mlir/test/Dialect/Tensor/bufferize.mlir b/mlir/test/Dialect/Tensor/bufferize.mlir index 296ca02564e35..5eb2360a29b8f 100644 --- a/mlir/test/Dialect/Tensor/bufferize.mlir +++ b/mlir/test/Dialect/Tensor/bufferize.mlir @@ -728,7 +728,7 @@ func.func @tensor.concat_dynamic_nonconcat_dim(%f: tensor, %g: tensor // CHECK: %[[ALLOC_T:.*]] = bufferization.to_tensor %[[ALLOC]] // CHECK: %[[MAPPED:.*]] = linalg.map outs(%[[ALLOC_T]] : tensor) -// CHECK: () { +// CHECK: (%[[INIT:.*]]: f32) { // CHECK: linalg.yield %[[F]] : f32 // CHECK: } // CHECK: return %[[MAPPED]] : tensor diff --git a/mlir/test/Interfaces/TilingInterface/lower-to-loops-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/lower-to-loops-using-interface.mlir index 8cbee3cbb758b..aa8882d21698c 100644 --- a/mlir/test/Interfaces/TilingInterface/lower-to-loops-using-interface.mlir +++ b/mlir/test/Interfaces/TilingInterface/lower-to-loops-using-interface.mlir @@ -257,10 +257,10 @@ module attributes {transform.with_named_sequence} { // ----- func.func @map(%lhs: memref<64xf32>, - 
%rhs: memref<64xf32>, %out: memref<64xf32>) { + %rhs: memref<64xf32>, %init: memref<64xf32>) { linalg.map ins(%lhs, %rhs : memref<64xf32>, memref<64xf32>) - outs(%out : memref<64xf32>) - (%in: f32, %in_0: f32) { + outs(%init : memref<64xf32>) + (%in: f32, %in_0: f32, %out: f32) { %0 = arith.addf %in, %in_0 : f32 linalg.yield %0 : f32 } From 12a5af188de5d2d98436157e423644ccbb8c2671 Mon Sep 17 00:00:00 2001 From: google-yfyang Date: Thu, 30 Oct 2025 11:27:54 -0400 Subject: [PATCH 205/539] [bazel][mlir] Port #164978: [mlir][gpu] Loose the condition to convert scf.parallel to gpu.launch (#165721) From 3cac3a563664fce4a4c9387bc64c5a41fc288d0e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 30 Oct 2025 08:42:38 -0700 Subject: [PATCH 206/539] [RISCV] Support P extension ABSW instruction. (#165047) --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 1 + llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 14 +++++++++++++- llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 5 +++++ llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp | 1 + llvm/test/CodeGen/RISCV/rv64p.ll | 6 ++---- 5 files changed, 22 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 9a6afa1cd4ea2..b25a05400fe31 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3995,6 +3995,7 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits, case RISCV::CTZW: case RISCV::CPOPW: case RISCV::SLLI_UW: + case RISCV::ABSW: case RISCV::FMV_W_X: case RISCV::FCVT_H_W: case RISCV::FCVT_H_W_INX: diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 1c930acd9c4a0..56881f71934c4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -433,6 +433,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.hasStdExtP() || (Subtarget.hasVendorXCValu() && 
!Subtarget.is64Bit())) { setOperationAction(ISD::ABS, XLenVT, Legal); + if (Subtarget.is64Bit()) + setOperationAction(ISD::ABS, MVT::i32, Custom); } else if (Subtarget.hasShortForwardBranchOpt()) { // We can use PseudoCCSUB to implement ABS. setOperationAction(ISD::ABS, XLenVT, Legal); @@ -14816,8 +14818,16 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && "Unexpected custom legalisation"); + if (Subtarget.hasStdExtP()) { + SDValue Src = + DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0)); + SDValue Abs = DAG.getNode(RISCVISD::ABSW, DL, MVT::i64, Src); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Abs)); + return; + } + if (Subtarget.hasStdExtZbb()) { - // Emit a special ABSW node that will be expanded to NEGW+MAX at isel. + // Emit a special node that will be expanded to NEGW+MAX at isel. // This allows us to remember that the result is sign extended. Expanding // to NEGW+MAX here requires a Freeze which breaks ComputeNumSignBits. 
SDValue Src = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, @@ -20290,6 +20300,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, break; } + case RISCVISD::ABSW: case RISCVISD::CLZW: case RISCVISD::CTZW: { // Only the lower 32 bits of the first operand are read @@ -21862,6 +21873,7 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( case RISCVISD::REMUW: case RISCVISD::ROLW: case RISCVISD::RORW: + case RISCVISD::ABSW: case RISCVISD::FCVT_W_RV64: case RISCVISD::FCVT_WU_RV64: case RISCVISD::STRICT_FCVT_W_RV64: diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index cc085bb6c9fd7..4cbbba3aa68cb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -1461,5 +1461,10 @@ let Predicates = [HasStdExtP, IsRV32] in { // Codegen patterns //===----------------------------------------------------------------------===// +def riscv_absw : RVSDNode<"ABSW", SDTIntUnaryOp>; + let Predicates = [HasStdExtP] in def : PatGpr; + +let Predicates = [HasStdExtP, IsRV64] in +def : PatGpr; diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp index d08115b72977f..ea98cdb4a1e67 100644 --- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp +++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp @@ -172,6 +172,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, case RISCV::CTZW: case RISCV::CPOPW: case RISCV::SLLI_UW: + case RISCV::ABSW: case RISCV::FMV_W_X: case RISCV::FCVT_H_W: case RISCV::FCVT_H_W_INX: diff --git a/llvm/test/CodeGen/RISCV/rv64p.ll b/llvm/test/CodeGen/RISCV/rv64p.ll index cb07f945a582a..f937f44f13320 100644 --- a/llvm/test/CodeGen/RISCV/rv64p.ll +++ b/llvm/test/CodeGen/RISCV/rv64p.ll @@ -297,8 +297,7 @@ declare i32 @llvm.abs.i32(i32, i1 immarg) define i32 @abs_i32(i32 %x) { ; CHECK-LABEL: abs_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: sext.w a0, a0 -; CHECK-NEXT: abs a0, a0 +; CHECK-NEXT: absw a0, a0 ; CHECK-NEXT: ret %abs = 
tail call i32 @llvm.abs.i32(i32 %x, i1 true) ret i32 %abs @@ -307,8 +306,7 @@ define i32 @abs_i32(i32 %x) { define signext i32 @abs_i32_sext(i32 signext %x) { ; CHECK-LABEL: abs_i32_sext: ; CHECK: # %bb.0: -; CHECK-NEXT: abs a0, a0 -; CHECK-NEXT: sext.w a0, a0 +; CHECK-NEXT: absw a0, a0 ; CHECK-NEXT: ret %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true) ret i32 %abs From e11033b7c2d880d467caa4cea3022f3a3856ef83 Mon Sep 17 00:00:00 2001 From: Anshil Gandhi <95053726+gandhi56@users.noreply.github.com> Date: Thu, 30 Oct 2025 11:45:02 -0400 Subject: [PATCH 207/539] [AMDGPU] Add regbankselect rules for G_ADD/SUB and variants (#159860) Add legalization rules for G_ADD, G_UADDO, G_UADDE and their SUB counterparts. --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 15 +- .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 1 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 14 +- .../AMDGPU/AMDGPURegBankLegalizeRules.h | 3 +- llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll | 612 ++++++++++++++++++ .../AMDGPU/GlobalISel/regbankselect-add.mir | 524 +++++++++++++++ .../GlobalISel/regbankselect-add.s16.mir | 19 +- .../GlobalISel/regbankselect-add.v2s16.mir | 24 +- .../AMDGPU/GlobalISel/regbankselect-sext.mir | 8 + .../AMDGPU/GlobalISel/regbankselect-sub.mir | 479 +++++++++++++- .../AMDGPU/GlobalISel/regbankselect-zext.mir | 8 + llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll | 535 +++++++++++++++ 12 files changed, 2218 insertions(+), 24 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.mir create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 540756653dd22..b84c30ecaac0b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -500,6 +500,16 @@ void 
RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) { MI.eraseFromParent(); } +void RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) { + auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg()); + auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg()); + auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo}); + auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi}); + B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), + {ResLo.getReg(0), ResHi.getReg(0)}); + MI.eraseFromParent(); +} + static bool isSignedBFE(MachineInstr &MI) { if (GIntrinsic *GI = dyn_cast(&MI)) return (GI->is(Intrinsic::amdgcn_sbfe)); @@ -804,6 +814,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, } break; } + case UnpackAExt: + return lowerUnpackAExt(MI); case WidenMMOToS32: return widenMMOToS32(cast(MI)); } @@ -1120,7 +1132,8 @@ void RegBankLegalizeHelper::applyMappingDst( assert(RB == SgprRB); Register NewDst = MRI.createVirtualRegister(SgprRB_S32); Op.setReg(NewDst); - B.buildTrunc(Reg, NewDst); + if (!MRI.use_empty(Reg)) + B.buildTrunc(Reg, NewDst); break; } case InvalidMapping: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index d937815bf4714..ad3ff1d374ec1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -124,6 +124,7 @@ class RegBankLegalizeHelper { void lowerSplitTo32Select(MachineInstr &MI); void lowerSplitTo32SExtInReg(MachineInstr &MI); void lowerUnpackMinMax(MachineInstr &MI); + void lowerUnpackAExt(MachineInstr &MI); }; } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index a67b12a22589c..01abd358ff595 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -470,7 +470,19 @@ 
RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}) .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) - .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackAExt}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}); + + addRulesForGOpcs({G_UADDO, G_USUBO}, Standard) + .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}}) + .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}}); + + addRulesForGOpcs({G_UADDE, G_USUBE}, Standard) + .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}}) + .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}}); addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 93e0efda77fdd..030bd75f8cd10 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -223,7 +223,8 @@ enum LoweringMethodID { UniCstExt, SplitLoad, WidenLoad, - WidenMMOToS32 + WidenMMOToS32, + UnpackAExt }; enum FastRulesTypes { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll new file mode 100644 index 0000000000000..e11720011af10 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll @@ -0,0 +1,612 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < 
%s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX12 %s + +define i16 @s_add_i16(i16 inreg %a, i16 inreg %b) { +; GFX7-LABEL: s_add_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s16, s16, s17 +; GFX7-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_add_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: s_add_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_i32 s16, s16, s17 +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_add_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_add_i32 s16, s16, s17 +; GFX10-NEXT: v_mov_b32_e32 v0, s16 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_add_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: s_add_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c = add i16 %a, %b + ret i16 %c +} + +define 
i16 @v_add_i16(i16 %a, i16 %b) { +; GFX7-LABEL: v_add_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_add_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_add_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_add_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_add_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_add_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c = add i16 %a, %b + ret i16 %c +} + +define i32 @s_add_i32(i32 inreg %a, i32 inreg %b) { +; GFX7-LABEL: s_add_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s16, s16, s17 +; GFX7-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_add_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: s_add_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_i32 s16, s16, s17 +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_add_i32: +; GFX10: ; %bb.0: +; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_add_i32 s16, s16, s17 +; GFX10-NEXT: v_mov_b32_e32 v0, s16 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_add_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: s_add_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c = add i32 %a, %b + ret i32 %c +} + +define i32 @v_add_i32(i32 %a, i32 %b) { +; GFX7-LABEL: v_add_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_add_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_add_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_add_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_add_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_add_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; 
GFX12-NEXT: s_setpc_b64 s[30:31] + %c = add i32 %a, %b + ret i32 %c +} + +define <2 x i16> @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) { +; GFX7-LABEL: s_add_v2i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s16, s16, s18 +; GFX7-NEXT: s_add_i32 s17, s17, s19 +; GFX7-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_add_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s4, s16, 16 +; GFX9-NEXT: s_lshr_b32 s5, s17, 16 +; GFX9-NEXT: s_add_i32 s16, s16, s17 +; GFX9-NEXT: s_add_i32 s4, s4, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: s_add_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s4, s16, 16 +; GFX8-NEXT: s_lshr_b32 s5, s17, 16 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_add_i32 s16, s16, s17 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s16 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_or_b32 s4, s5, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_add_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s4, s16, 16 +; GFX10-NEXT: s_lshr_b32 s5, s17, 16 +; GFX10-NEXT: s_add_i32 s16, s16, s17 +; GFX10-NEXT: s_add_i32 s4, s4, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_add_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-NEXT: s_lshr_b32 s3, s1, 16 +; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: s_add_i32 s2, s2, s3 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 
s[30:31] +; +; GFX12-LABEL: s_add_v2i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-NEXT: s_lshr_b32 s3, s1, 16 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s2, s2, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c = add <2 x i16> %a, %b + ret <2 x i16> %c +} + +define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) { +; GFX7-LABEL: v_add_v2i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_add_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_add_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v2, v0, v1 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_add_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_add_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_add_v2i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; 
GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c = add <2 x i16> %a, %b + ret <2 x i16> %c +} + +define i64 @s_add_i64(i64 inreg %a, i64 inreg %b) { +; GFX7-LABEL: s_add_i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s16, s18 +; GFX7-NEXT: s_addc_u32 s5, s17, s19 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_add_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s4, s16, s18 +; GFX9-NEXT: s_addc_u32 s5, s17, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: s_add_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s4, s16, s18 +; GFX8-NEXT: s_addc_u32 s5, s17, s19 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_add_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s4, s16, s18 +; GFX10-NEXT: s_addc_u32 s5, s17, s19 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_add_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s0, s0, s2 +; GFX11-NEXT: s_addc_u32 s1, s1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: s_add_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: 
s_setpc_b64 s[30:31] + %c = add i64 %a, %b + ret i64 %c +} + +define i64 @v_add_i64(i64 %a, i64 %b) { +; GFX7-LABEL: v_add_i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_add_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_add_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_add_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_add_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_add_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c = add i64 %a, %b + ret i64 %c +} + +define void @s_uaddo_uadde(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, ptr addrspace(1) %carry) { +; GFX7-LABEL: s_uaddo_uadde: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s16, s18 +; GFX7-NEXT: s_addc_u32 
s5, s17, s19 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_cselect_b32 s8, 1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_store_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_uaddo_uadde: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s4, s16, s18 +; GFX9-NEXT: s_addc_u32 s5, s17, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: s_uaddo_uadde: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s4, s16, s18 +; GFX8-NEXT: s_addc_u32 s5, s17, s19 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_cselect_b32 s6, 1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: flat_store_dword v[2:3], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_uaddo_uadde: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s4, s16, s18 +; GFX10-NEXT: s_addc_u32 s5, s17, s19 +; GFX10-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v6, s6 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX10-NEXT: global_store_dword v[2:3], v6, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_uaddo_uadde: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s0, s0, s2 +; GFX11-NEXT: s_addc_u32 s1, s1, s3 +; GFX11-NEXT: s_cselect_b32 s2, 1, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX11-NEXT: v_mov_b32_e32 v6, s2 +; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off +; GFX11-NEXT: global_store_b32 v[2:3], v6, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: s_uaddo_uadde: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_u32 s0, s0, s2 +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s3 +; GFX12-NEXT: s_cselect_b32 s2, 1, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off +; GFX12-NEXT: global_store_b32 v[2:3], v6, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %uaddo = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) + %add = extractvalue {i64, i1} %uaddo, 0 + %of = extractvalue {i64, i1} %uaddo, 1 + %of32 = select i1 %of, i32 1, i32 0 + store i64 %add, ptr addrspace(1) %res + store i32 %of32, ptr addrspace(1) %carry + ret void +} + +define void @v_uaddo_uadde(i64 %a, i64 %b, ptr addrspace(1) %res, ptr addrspace(1) %carry) { +; GFX7-LABEL: v_uaddo_uadde: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dword v2, v[6:7], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uaddo_uadde: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_store_dword v[6:7], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_uaddo_uadde: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: flat_store_dword v[6:7], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uaddo_uadde: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX10-NEXT: global_store_dword v[6:7], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddo_uadde: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_store_b32 v[6:7], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_uaddo_uadde: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX12-NEXT: 
global_store_b64 v[4:5], v[0:1], off +; GFX12-NEXT: global_store_b32 v[6:7], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %uaddo = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) + %add = extractvalue {i64, i1} %uaddo, 0 + %of = extractvalue {i64, i1} %uaddo, 1 + %of32 = select i1 %of, i32 1, i32 0 + store i64 %add, ptr addrspace(1) %res + store i32 %of32, ptr addrspace(1) %carry + ret void +} + +declare {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.mir new file mode 100644 index 0000000000000..097372a957461 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.mir @@ -0,0 +1,524 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - | FileCheck %s +--- +name: add_s16_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; CHECK-LABEL: name: add_s16_ss + ; CHECK: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s16) + ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC1]](s16) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[ANYEXT]], [[ANYEXT1]] + ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[ADD]](s32) + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s16) = G_AND [[TRUNC2]], [[TRUNC2]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(s16) = G_ADD %2, %3 + %5:_(s16) = G_AND %4, %4 +... 
+ +--- +name: add_s16_sv +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; CHECK-LABEL: name: add_s16_sv + ; CHECK: liveins: $sgpr0, $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[TRUNC]](s16) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s16) = G_ADD [[COPY2]], [[TRUNC1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[ADD]], [[ADD]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(s16) = G_ADD %2, %3 + %5:_(s16) = G_AND %4, %4 +... + +--- +name: add_s16_vs +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; CHECK-LABEL: name: add_s16_vs + ; CHECK: liveins: $sgpr0, $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[TRUNC1]](s16) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s16) = G_ADD [[TRUNC]], [[COPY2]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[ADD]], [[ADD]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $sgpr0 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(s16) = G_ADD %2, %3 + %5:_(s16) = G_AND %4, %4 +... 
+ +--- +name: add_s16_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: add_s16_vv + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s16) = G_ADD [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[ADD]], [[ADD]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(s16) = G_ADD %2, %3 + %5:_(s16) = G_AND %4, %4 +... + +--- +name: add_s32_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; CHECK-LABEL: name: add_s32_ss + ; CHECK: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ADD]], [[ADD]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = G_ADD %0, %1 + %3:_(s32) = G_AND %2, %2 +... + +--- +name: add_s32_sv +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; CHECK-LABEL: name: add_s32_sv + ; CHECK: liveins: $sgpr0, $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[ADD]], [[ADD]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = G_ADD %0, %1 + %3:_(s32) = G_AND %2, %2 +... 
+ +--- +name: add_s32_vs +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; CHECK-LABEL: name: add_s32_vs + ; CHECK: liveins: $sgpr0, $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY]], [[COPY2]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[ADD]], [[ADD]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $sgpr0 + %2:_(s32) = G_ADD %0, %1 + %3:_(s32) = G_AND %2, %2 +... + +--- +name: add_s32_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: add_s32_vv + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[ADD]], [[ADD]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_ADD %0, %1 + %3:_(s32) = G_AND %2, %2 +... + +--- +name: add_s64_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; CHECK-LABEL: name: add_s64_ss + ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s64) = G_ADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s64) = G_AND [[ADD]], [[ADD]] + %0:_(s64) = COPY $sgpr0_sgpr1 + %1:_(s64) = COPY $sgpr2_sgpr3 + %2:_(s64) = G_ADD %0, %1 + %3:_(s64) = G_CONSTANT i64 255 + %4:_(s64) = G_AND %2, %2 +... 
+ +--- +name: add_s64_sv +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; CHECK-LABEL: name: add_s64_sv + ; CHECK: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY [[COPY]](s64) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s64) = G_ADD [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ADD]](s64) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ADD]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]] + ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32) + %0:_(s64) = COPY $sgpr0_sgpr1 + %1:_(s64) = COPY $vgpr0_vgpr1 + %2:_(s64) = G_ADD %0, %1 + %3:_(s64) = G_AND %2, %2 +... 
+ +--- +name: add_s64_vs +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; CHECK-LABEL: name: add_s64_vs + ; CHECK: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY [[COPY1]](s64) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s64) = G_ADD [[COPY]], [[COPY2]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ADD]](s64) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ADD]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]] + ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $sgpr0_sgpr1 + %2:_(s64) = G_ADD %0, %1 + %3:_(s64) = G_AND %2, %2 +... + +--- +name: add_s64_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-LABEL: name: add_s64_vv + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s64) = G_ADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ADD]](s64) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ADD]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]] + ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s64) = G_ADD %0, %1 + %3:_(s64) = G_AND %2, %2 +... 
+ +--- +name: uaddo_s32_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; CHECK-LABEL: name: uaddo_s32_ss + ; CHECK: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[UADDO:%[0-9]+]]:sgpr(s32), [[UADDO1:%[0-9]+]]:sgpr(s32) = G_UADDO [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[UADDO1]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C]], [[C1]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[SELECT]], [[UADDO]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32), %3:_(s1) = G_UADDO %0, %1 + %4:_(s32) = G_ZEXT %3 + %5:_(s32) = G_AND %4, %2 +... + +--- +name: uaddo_s32_sv +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr1 + ; CHECK-LABEL: name: uaddo_s32_sv + ; CHECK: liveins: $sgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[UADDO:%[0-9]+]]:vgpr(s32), [[UADDO1:%[0-9]+]]:vcc(s1) = G_UADDO [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[UADDO1]](s1), [[C]], [[C1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UADDO]], [[SELECT]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32), %3:_(s1) = G_UADDO %0, %1 + %4:_(s32) = G_ZEXT %3 + %5:_(s32) = G_AND %2, %4 +... 
+ +--- +name: uaddo_s32_vs +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $sgpr1 + ; CHECK-LABEL: name: uaddo_s32_vs + ; CHECK: liveins: $vgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[UADDO:%[0-9]+]]:vgpr(s32), [[UADDO1:%[0-9]+]]:vcc(s1) = G_UADDO [[COPY]], [[COPY2]] + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[UADDO1]](s1), [[C]], [[C1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UADDO]], [[SELECT]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32), %3:_(s1) = G_UADDO %0, %1 + %4:_(s32) = G_ZEXT %3 + %5:_(s32) = G_AND %2, %4 +... + +--- +name: uaddo_s32_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: uaddo_s32_vv + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[UADDO:%[0-9]+]]:vgpr(s32), [[UADDO1:%[0-9]+]]:vcc(s1) = G_UADDO [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[UADDO1]](s1), [[C]], [[C1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UADDO]], [[SELECT]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32), %3:_(s1) = G_UADDO %0, %1 + %4:_(s32) = G_ZEXT %3 + %5:_(s32) = G_AND %2, %4 +... 
+ +--- +name: uadde_s32_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + ; CHECK-LABEL: name: uadde_s32_ss + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY2]], [[C]] + ; CHECK-NEXT: [[UADDE:%[0-9]+]]:sgpr(s32), [[UADDE1:%[0-9]+]]:sgpr(s32) = G_UADDE [[COPY]], [[COPY1]], [[AND]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[UADDE1]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND1]](s32), [[C]], [[C1]] + ; CHECK-NEXT: [[AND2:%[0-9]+]]:sgpr(s32) = G_AND [[UADDE]], [[SELECT]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s1) = G_TRUNC %2 + %4:_(s32), %5:_(s1) = G_UADDE %0, %1, %3 + %6:_(s32) = G_ZEXT %5 + %7:_(s32) = G_AND %4, %6 +... 
+ +--- +name: uadde_s32_sv +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr1, $sgpr2 + ; CHECK-LABEL: name: uadde_s32_sv + ; CHECK: liveins: $sgpr0, $vgpr1, $sgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_AMDGPU_COPY_VCC_SCC [[COPY2]](s32) + ; CHECK-NEXT: [[UADDE:%[0-9]+]]:vgpr(s32), [[UADDE1:%[0-9]+]]:vcc(s1) = G_UADDE [[COPY3]], [[COPY1]], [[AMDGPU_COPY_VCC_SCC]] + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[UADDE1]](s1), [[C]], [[C1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UADDE]], [[SELECT]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s1) = G_TRUNC %2 + %4:_(s32), %5:_(s1) = G_UADDE %0, %1, %3 + %6:_(s32) = G_ZEXT %5 + %7:_(s32) = G_AND %4, %6 +... 
+ +--- +name: uadde_s32_vs +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $sgpr1, $sgpr2 + ; CHECK-LABEL: name: uadde_s32_vs + ; CHECK: liveins: $vgpr0, $sgpr1, $sgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_AMDGPU_COPY_VCC_SCC [[COPY2]](s32) + ; CHECK-NEXT: [[UADDE:%[0-9]+]]:vgpr(s32), [[UADDE1:%[0-9]+]]:vcc(s1) = G_UADDE [[COPY]], [[COPY3]], [[AMDGPU_COPY_VCC_SCC]] + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[UADDE1]](s1), [[C]], [[C1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UADDE]], [[SELECT]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s1) = G_TRUNC %2 + %4:_(s32), %5:_(s1) = G_UADDE %0, %1, %3 + %6:_(s32) = G_ZEXT %5 + %7:_(s32) = G_AND %4, %6 +... 
+ +--- +name: uadde_s32_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-LABEL: name: uadde_s32_vv + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY2]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), [[C1]] + ; CHECK-NEXT: [[UADDE:%[0-9]+]]:vgpr(s32), [[UADDE1:%[0-9]+]]:vcc(s1) = G_UADDE [[COPY]], [[COPY1]], [[ICMP]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[UADDE1]](s1), [[C]], [[C1]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UADDE]], [[SELECT]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s1) = G_TRUNC %2 + %4:_(s32), %5:_(s1) = G_UADDE %0, %1, %3 + %6:_(s32) = G_ZEXT %5 + %7:_(s32) = G_AND %4, %6 +... 
+ +--- +name: uadde_s32_ss_scc_use +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + ; CHECK-LABEL: name: uadde_s32_ss_scc_use + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY2]], [[C]] + ; CHECK-NEXT: [[UADDE:%[0-9]+]]:sgpr(s32), [[UADDE1:%[0-9]+]]:sgpr(s32) = G_UADDE [[COPY]], [[COPY1]], [[AND]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[UADDE1]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND1]](s32), [[C]], [[C1]] + ; CHECK-NEXT: [[AND2:%[0-9]+]]:sgpr(s32) = G_AND [[UADDE]], [[SELECT]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s1) = G_TRUNC %2 + %4:_(s32), %5:_(s1) = G_UADDE %0, %1, %3 + %6:_(s32) = G_ZEXT %5 + %8:_(s32) = G_AND %4, %6 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s16.mir index 54ee69fcb2204..30c958fcb192a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s16.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - | FileCheck %s --- name: add_s16_ss legalized: true @@ -19,13 +18,13 @@ body: | ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC1]](s16) ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[ANYEXT]], [[ANYEXT1]] ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[ADD]](s32) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[TRUNC2]](s16) + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s16) = G_AND [[TRUNC2]], [[TRUNC2]] %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s16) = G_TRUNC %0 %3:_(s16) = G_TRUNC %1 %4:_(s16) = G_ADD %2, %3 - S_ENDPGM 0, implicit %4 + %5:_(s16) = G_AND %4, %4 ... --- @@ -44,13 +43,13 @@ body: | ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[TRUNC]](s16) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s16) = G_ADD [[COPY2]], [[TRUNC1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[ADD]](s16) + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[ADD]], [[ADD]] %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $vgpr0 %2:_(s16) = G_TRUNC %0 %3:_(s16) = G_TRUNC %1 %4:_(s16) = G_ADD %2, %3 - S_ENDPGM 0, implicit %4 + %5:_(s16) = G_AND %4, %4 ... 
--- @@ -69,13 +68,13 @@ body: | ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[TRUNC1]](s16) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s16) = G_ADD [[TRUNC]], [[COPY2]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[ADD]](s16) + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[ADD]], [[ADD]] %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $sgpr0 %2:_(s16) = G_TRUNC %0 %3:_(s16) = G_TRUNC %1 %4:_(s16) = G_ADD %2, %3 - S_ENDPGM 0, implicit %4 + %5:_(s16) = G_AND %4, %4 ... --- @@ -93,11 +92,11 @@ body: | ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s16) = G_ADD [[TRUNC]], [[TRUNC1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[ADD]](s16) + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[ADD]], [[ADD]] %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %0 %3:_(s16) = G_TRUNC %1 %4:_(s16) = G_ADD %2, %3 - S_ENDPGM 0, implicit %4 + %5:_(s16) = G_AND %4, %4 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.v2s16.mir index 97018fac13a87..01eb39111b0ab 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.v2s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.v2s16.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - | FileCheck %s --- name: add_v2s16_ss @@ -18,16 +17,19 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[BITCAST]], [[BITCAST1]] ; CHECK-NEXT: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[LSHR]], [[LSHR1]] ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ADD]](s32), [[ADD1]](s32) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s16) = G_CONSTANT i16 255 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16) + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(<2 x s16>) = G_AND [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR]] %0:_(<2 x s16>) = COPY $sgpr0 %1:_(<2 x s16>) = COPY $sgpr1 %2:_(<2 x s16>) = G_ADD %0, %1 - S_ENDPGM 0, implicit 
%2 + %3:_(s16) = G_CONSTANT i16 255 + %4:_(<2 x s16>) = G_BUILD_VECTOR %3, %3 + %5:_(<2 x s16>) = G_AND %2, %4 ... --- @@ -44,11 +46,11 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY]](<2 x s16>) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(<2 x s16>) = G_ADD [[COPY2]], [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[ADD]](<2 x s16>) + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(<2 x s16>) = G_AND [[ADD]], [[ADD]] %0:_(<2 x s16>) = COPY $sgpr0 %1:_(<2 x s16>) = COPY $vgpr0 %2:_(<2 x s16>) = G_ADD %0, %1 - S_ENDPGM 0, implicit %2 + %3:_(<2 x s16>) = G_AND %2, %2 ... --- @@ -65,9 +67,11 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY1]](<2 x s16>) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(<2 x s16>) = G_ADD [[COPY]], [[COPY2]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(<2 x s16>) = G_AND [[ADD]], [[ADD]] %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $sgpr0 %2:_(<2 x s16>) = G_ADD %0, %1 + %3:_(<2 x s16>) = G_AND %2, %2 ... --- @@ -83,9 +87,9 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(<2 x s16>) = G_ADD [[COPY]], [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[ADD]](<2 x s16>) + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(<2 x s16>) = G_AND [[ADD]], [[ADD]] %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_ADD %0, %1 - S_ENDPGM 0, implicit %2 + %3:_(<2 x s16>) = G_AND %2, %2 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir index 7378c9366ec36..e0e783e7a62f9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext.mir @@ -77,10 +77,14 @@ body: | ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C1]], [[C2]] ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s16) = G_CONSTANT i16 255 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s16) = G_AND [[TRUNC]], [[C3]] %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s1) = G_ICMP intpred(eq), %0, %1 %3:_(s16) = G_SEXT %2 + %4:_(s16) = G_CONSTANT i16 255 + %5:_(s16) = G_AND %3, %4 ... --- @@ -215,9 +219,13 @@ body: | ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C1]], [[C2]] ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s16) = G_CONSTANT i16 255 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s16) = G_AND [[TRUNC]], [[C3]] %0:_(s32) = COPY $sgpr0 %1:_(s1) = G_TRUNC %0 %2:_(s16) = G_SEXT %1 + %3:_(s16) = G_CONSTANT i16 255 + %4:_(s16) = G_AND %2, %3 ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sub.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sub.mir index b0199d3ad5cd1..e3c01c0e7fcb4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sub.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sub.mir @@ -1,5 +1,107 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- +name: sub_s16_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; CHECK-LABEL: name: sub_s16_ss + ; CHECK: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s16) + ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC1]](s16) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:sgpr(s32) = G_SUB [[ANYEXT]], [[ANYEXT1]] + ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SUB]](s32) + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s16) = G_AND [[TRUNC2]], [[TRUNC2]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(s16) = G_SUB %2, %3 + %6:_(s16) = G_AND %4, %4 +... 
+ +--- +name: sub_s16_sv +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; CHECK-LABEL: name: sub_s16_sv + ; CHECK: liveins: $sgpr0, $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[TRUNC]](s16) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s16) = G_SUB [[COPY2]], [[TRUNC1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[SUB]], [[SUB]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(s16) = G_SUB %2, %3 + %6:_(s16) = G_AND %4, %4 +... + +--- +name: sub_s16_vs +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; CHECK-LABEL: name: sub_s16_vs + ; CHECK: liveins: $sgpr0, $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[TRUNC1]](s16) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s16) = G_SUB [[TRUNC]], [[COPY2]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[SUB]], [[SUB]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $sgpr0 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(s16) = G_SUB %2, %3 + %6:_(s16) = G_AND %4, %4 +... 
+ +--- +name: sub_s16_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: sub_s16_vv + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s16) = G_SUB [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s16) = G_AND [[SUB]], [[SUB]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(s16) = G_SUB %2, %3 + %6:_(s16) = G_AND %4, %4 +... --- name: sub_s32_ss @@ -14,9 +116,11 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; CHECK-NEXT: [[SUB:%[0-9]+]]:sgpr(s32) = G_SUB [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[SUB]], [[SUB]] %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s32) = G_SUB %0, %1 + %4:_(s32) = G_AND %2, %2 ... --- @@ -33,9 +137,11 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s32) = G_SUB [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[SUB]], [[SUB]] %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $vgpr0 %2:_(s32) = G_SUB %0, %1 + %4:_(s32) = G_AND %2, %2 ... --- @@ -52,9 +158,11 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s32) = G_SUB [[COPY]], [[COPY2]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[SUB]], [[SUB]] %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $sgpr0 %2:_(s32) = G_SUB %0, %1 + %4:_(s32) = G_AND %2, %2 ... 
--- @@ -70,7 +178,376 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s32) = G_SUB [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[SUB]], [[SUB]] %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_SUB %0, %1 + %4:_(s32) = G_AND %2, %2 +... + +--- +name: sub_v2s16_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; CHECK-LABEL: name: sub_v2s16_ss + ; CHECK: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:sgpr(s32) = G_SUB [[BITCAST]], [[BITCAST1]] + ; CHECK-NEXT: [[SUB1:%[0-9]+]]:sgpr(s32) = G_SUB [[LSHR]], [[LSHR1]] + ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SUB]](s32), [[SUB1]](s32) + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(<2 x s16>) = G_AND [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC]] + %0:_(<2 x s16>) = COPY $sgpr0 + %1:_(<2 x s16>) = COPY $sgpr1 + %2:_(<2 x s16>) = G_SUB %0, %1 + %5:_(<2 x s16>) = G_AND %2, %2 +... 
+ +--- +name: sub_v2s16_sv +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; CHECK-LABEL: name: sub_v2s16_sv + ; CHECK: liveins: $sgpr0, $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY]](<2 x s16>) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(<2 x s16>) = G_SUB [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(<2 x s16>) = G_AND [[SUB]], [[SUB]] + %0:_(<2 x s16>) = COPY $sgpr0 + %1:_(<2 x s16>) = COPY $vgpr0 + %2:_(<2 x s16>) = G_SUB %0, %1 + %5:_(<2 x s16>) = G_AND %2, %2 +... + +--- +name: sub_v2s16_vs +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; CHECK-LABEL: name: sub_v2s16_vs + ; CHECK: liveins: $sgpr0, $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY1]](<2 x s16>) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(<2 x s16>) = G_SUB [[COPY]], [[COPY2]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(<2 x s16>) = G_AND [[SUB]], [[SUB]] + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(<2 x s16>) = COPY $sgpr0 + %2:_(<2 x s16>) = G_SUB %0, %1 + %5:_(<2 x s16>) = G_AND %2, %2 +... + +--- +name: sub_v2s16_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: sub_v2s16_vv + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 + ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(<2 x s16>) = G_SUB [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(<2 x s16>) = G_AND [[SUB]], [[SUB]] + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(<2 x s16>) = COPY $vgpr1 + %2:_(<2 x s16>) = G_SUB %0, %1 + %5:_(<2 x s16>) = G_AND %2, %2 +... 
+ +--- +name: sub_s64_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr0_sgpr1 + ; CHECK-LABEL: name: sub_s64_ss + ; CHECK: liveins: $sgpr0_sgpr1, $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[SUB:%[0-9]+]]:sgpr(s64) = G_SUB [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s64) = G_AND [[SUB]], [[SUB]] + %0:_(s64) = COPY $sgpr0_sgpr1 + %1:_(s64) = COPY $sgpr0_sgpr1 + %2:_(s64) = G_SUB %0, %1 + %4:_(s64) = G_AND %2, %2 +... + +--- +name: sub_s64_sv +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; CHECK-LABEL: name: sub_s64_sv + ; CHECK: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY [[COPY]](s64) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s64) = G_SUB [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[SUB]](s64) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[SUB]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]] + ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32) + %0:_(s64) = COPY $sgpr0_sgpr1 + %1:_(s64) = COPY $vgpr0_vgpr1 + %2:_(s64) = G_SUB %0, %1 + %4:_(s64) = G_AND %2, %2 +... 
+ +--- +name: sub_s64_vs +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; CHECK-LABEL: name: sub_s64_vs + ; CHECK: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY [[COPY1]](s64) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s64) = G_SUB [[COPY]], [[COPY2]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[SUB]](s64) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[SUB]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]] + ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $sgpr0_sgpr1 + %2:_(s64) = G_SUB %0, %1 + %4:_(s64) = G_AND %2, %2 +... + +--- +name: sub_s64_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-LABEL: name: sub_s64_vv + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: [[SUB:%[0-9]+]]:vgpr(s64) = G_SUB [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[SUB]](s64) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[SUB]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]] + ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s64) = G_SUB %0, %1 + %4:_(s64) = G_AND %2, %2 +... 
+ +--- +name: usubo_s32_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; CHECK-LABEL: name: usubo_s32_ss + ; CHECK: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[USUBO:%[0-9]+]]:sgpr(s32), [[USUBO1:%[0-9]+]]:sgpr(s32) = G_USUBO [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[USUBO]], [[USUBO]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32), %3:_(s1) = G_USUBO %0, %1 + %5:_(s32) = G_AND %2, %2 +... + +--- +name: usubo_s32_sv +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr1 + ; CHECK-LABEL: name: usubo_s32_sv + ; CHECK: liveins: $sgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[USUBO:%[0-9]+]]:vgpr(s32), [[USUBO1:%[0-9]+]]:vcc(s1) = G_USUBO [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[USUBO]], [[USUBO]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32), %3:_(s1) = G_USUBO %0, %1 + %5:_(s32) = G_AND %2, %2 +... + +--- +name: usubo_s32_vs +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $sgpr1 + ; CHECK-LABEL: name: usubo_s32_vs + ; CHECK: liveins: $vgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[USUBO:%[0-9]+]]:vgpr(s32), [[USUBO1:%[0-9]+]]:vcc(s1) = G_USUBO [[COPY]], [[COPY2]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[USUBO]], [[USUBO]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32), %3:_(s1) = G_USUBO %0, %1 + %5:_(s32) = G_AND %2, %2 +... 
+ +--- +name: usubo_s32_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: usubo_s32_vv + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[USUBO:%[0-9]+]]:vgpr(s32), [[USUBO1:%[0-9]+]]:vcc(s1) = G_USUBO [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[USUBO]], [[USUBO]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32), %3:_(s1) = G_USUBO %0, %1 + %5:_(s32) = G_AND %2, %2 +... + +--- +name: usube_s32_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + ; CHECK-LABEL: name: usube_s32_ss + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY2]], [[C]] + ; CHECK-NEXT: [[USUBE:%[0-9]+]]:sgpr(s32), [[USUBE1:%[0-9]+]]:sgpr(s32) = G_USUBE [[COPY]], [[COPY1]], [[AND]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[USUBE]], [[USUBE]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s1) = G_TRUNC %2 + %4:_(s32), %5:_(s1) = G_USUBE %0, %1, %3 + %7:_(s32) = G_AND %4, %4 +... 
+ +--- +name: usube_s32_sv +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr1, $sgpr2 + ; CHECK-LABEL: name: usube_s32_sv + ; CHECK: liveins: $sgpr0, $vgpr1, $sgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_AMDGPU_COPY_VCC_SCC [[COPY2]](s32) + ; CHECK-NEXT: [[USUBE:%[0-9]+]]:vgpr(s32), [[USUBE1:%[0-9]+]]:vcc(s1) = G_USUBE [[COPY3]], [[COPY1]], [[AMDGPU_COPY_VCC_SCC]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[USUBE]], [[USUBE]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s1) = G_TRUNC %2 + %4:_(s32), %5:_(s1) = G_USUBE %0, %1, %3 + %7:_(s32) = G_AND %4, %4 +... + +--- +name: usube_s32_vs +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $sgpr1, $sgpr2 + ; CHECK-LABEL: name: usube_s32_vs + ; CHECK: liveins: $vgpr0, $sgpr1, $sgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_AMDGPU_COPY_VCC_SCC [[COPY2]](s32) + ; CHECK-NEXT: [[USUBE:%[0-9]+]]:vgpr(s32), [[USUBE1:%[0-9]+]]:vcc(s1) = G_USUBE [[COPY]], [[COPY3]], [[AMDGPU_COPY_VCC_SCC]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[USUBE]], [[USUBE]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s1) = G_TRUNC %2 + %4:_(s32), %5:_(s1) = G_USUBE %0, %1, %3 + %7:_(s32) = G_AND %4, %4 +... 
+ +--- +name: usube_s32_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-LABEL: name: usube_s32_vv + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY2]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), [[C1]] + ; CHECK-NEXT: [[USUBE:%[0-9]+]]:vgpr(s32), [[USUBE1:%[0-9]+]]:vcc(s1) = G_USUBE [[COPY]], [[COPY1]], [[ICMP]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[USUBE]], [[USUBE]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s1) = G_TRUNC %2 + %4:_(s32), %5:_(s1) = G_USUBE %0, %1, %3 + %7:_(s32) = G_AND %4, %4 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir index 088c20a3137f7..d4baa5fb864fc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zext.mir @@ -73,10 +73,14 @@ body: | ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C]], [[C1]] ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32) + ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s16) = G_CONSTANT i16 255 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s16) = G_AND [[TRUNC]], [[C2]] %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s1) = G_ICMP intpred(eq), %0, %1 %3:_(s16) = G_ZEXT %2 + %4:_(s16) = G_CONSTANT i16 255 + %5:_(s16) = G_AND %3, %4 ... 
--- @@ -209,9 +213,13 @@ body: | ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C]], [[C1]] ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[SELECT]](s32) + ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s16) = G_CONSTANT i16 255 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s16) = G_AND [[TRUNC]], [[C2]] %0:_(s32) = COPY $sgpr0 %1:_(s1) = G_TRUNC %0 %2:_(s16) = G_ZEXT %1 + %3:_(s16) = G_CONSTANT i16 255 + %4:_(s16) = G_AND %2, %3 ... --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll new file mode 100644 index 0000000000000..8b5958daac168 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll @@ -0,0 +1,535 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX12 %s + +define i16 @s_sub_i16(i16 inreg %a, i16 inreg %b) { +; GFX7-LABEL: s_sub_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_sub_i32 s4, s16, s17 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_sub_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_sub_i32 s4, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: s_sub_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_sub_i32 s4, s16, s17 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_sub_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_sub_i32 s4, s16, s17 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_sub_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_sub_i32 s0, s0, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: s_sub_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_sub_co_i32 s0, s0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c = sub i16 %a, %b + ret i16 %c +} + +define i16 @v_sub_i16(i16 %a, i16 %b) { +; GFX7-LABEL: v_sub_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sub_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_sub_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_sub_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_sub_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u16 v0.l, v0.l, v1.l +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_sub_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_sub_nc_u16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c = sub i16 %a, %b + ret i16 %c +} + +define i32 @s_sub_i32(i32 inreg %a, i32 inreg %b) { +; GFX7-LABEL: s_sub_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_sub_i32 s4, s16, s17 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_sub_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_sub_i32 s4, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: s_sub_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_sub_i32 s4, s16, s17 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_sub_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_sub_i32 s4, s16, s17 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_sub_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_sub_i32 s0, s0, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: s_sub_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_sub_co_i32 s0, s0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c = sub i32 %a, %b + ret i32 %c +} + +define i32 @v_sub_i32(i32 %a, i32 %b) { +; 
GFX7-LABEL: v_sub_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sub_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_sub_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_sub_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_sub_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_sub_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c = sub i32 %a, %b + ret i32 %c +} + +; TODO: Add test for s_sub_v2i16. Instruction selector currently fails +; to handle G_UNMERGE_VALUES. 
+ +define <2 x i16> @v_sub_v2i16(<2 x i16> %a, <2 x i16> %b) { +; GFX7-LABEL: v_sub_v2i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sub_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_sub_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u16_e32 v2, v0, v1 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_sub_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_sub_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_sub_v2i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c = sub <2 x i16> %a, %b + ret <2 x i16> %c +} + +define i64 @s_sub_i64(i64 inreg %a, i64 inreg %b) { +; GFX7-LABEL: s_sub_i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_sub_u32 s4, s16, s18 +; GFX7-NEXT: s_subb_u32 s5, s17, s19 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_sub_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_sub_u32 s4, s16, s18 +; GFX9-NEXT: s_subb_u32 s5, 
s17, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: s_sub_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_sub_u32 s4, s16, s18 +; GFX8-NEXT: s_subb_u32 s5, s17, s19 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_sub_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_sub_u32 s4, s16, s18 +; GFX10-NEXT: s_subb_u32 s5, s17, s19 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_sub_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_sub_u32 s0, s0, s2 +; GFX11-NEXT: s_subb_u32 s1, s1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: s_sub_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c = sub i64 %a, %b + ret i64 %c +} + +define i64 @v_sub_i64(i64 %a, i64 %b) { +; GFX7-LABEL: v_sub_i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sub_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_sub_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_sub_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_sub_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_sub_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c = sub i64 %a, %b + ret i64 %c +} + +define void @s_usubo_usube(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, ptr addrspace(1) %carry) { +; GFX7-LABEL: s_usubo_usube: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_sub_u32 s4, s16, s18 +; GFX7-NEXT: s_subb_u32 s5, s17, s19 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_cselect_b32 s8, 1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_store_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_usubo_usube: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_sub_u32 s4, s16, s18 +; GFX9-NEXT: s_subb_u32 s5, s17, s19 +; GFX9-NEXT: 
v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: s_usubo_usube: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_sub_u32 s4, s16, s18 +; GFX8-NEXT: s_subb_u32 s5, s17, s19 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_cselect_b32 s6, 1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: flat_store_dword v[2:3], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: s_usubo_usube: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_sub_u32 s4, s16, s18 +; GFX10-NEXT: s_subb_u32 s5, s17, s19 +; GFX10-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v6, s6 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX10-NEXT: global_store_dword v[2:3], v6, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_usubo_usube: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_sub_u32 s0, s0, s2 +; GFX11-NEXT: s_subb_u32 s1, s1, s3 +; GFX11-NEXT: s_cselect_b32 s2, 1, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX11-NEXT: v_mov_b32_e32 v6, s2 +; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off +; GFX11-NEXT: global_store_b32 v[2:3], v6, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: s_usubo_usube: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_sub_co_u32 s0, s0, s2 +; GFX12-NEXT: 
s_sub_co_ci_u32 s1, s1, s3 +; GFX12-NEXT: s_cselect_b32 s2, 1, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off +; GFX12-NEXT: global_store_b32 v[2:3], v6, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %usubo = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b) + %sub = extractvalue {i64, i1} %usubo, 0 + %of = extractvalue {i64, i1} %usubo, 1 + %of32 = select i1 %of, i32 1, i32 0 + store i64 %sub, ptr addrspace(1) %res + store i32 %of32, ptr addrspace(1) %carry + ret void +} + +define void @v_usubo_usube(i64 %a, i64 %b, ptr addrspace(1) %res, ptr addrspace(1) %carry) { +; GFX7-LABEL: v_usubo_usube: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dword v2, v[6:7], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_usubo_usube: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_store_dword v[6:7], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_usubo_usube: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: 
flat_store_dword v[6:7], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_usubo_usube: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX10-NEXT: global_store_dword v[6:7], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_usubo_usube: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_store_b32 v[6:7], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_usubo_usube: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX12-NEXT: global_store_b32 v[6:7], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %usubo = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b) + %sub = extractvalue {i64, i1} %usubo, 0 + %of = extractvalue {i64, i1} %usubo, 1 + %of32 = select i1 %of, i32 1, i32 0 + store i64 %sub, ptr addrspace(1) %res + store i32 %of32, ptr addrspace(1) %carry + ret void +} From de14700cfce9fb6e0bf93b34bd42bb39e17c7114 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 30 Oct 2025 08:50:33 -0700 Subject: [PATCH 208/539] [libc++] Fix localization failures on macOS 15.4 (#138744) 
This patch reverts e15025dd and 88e15b781 which were temporary measures until we had figured out the underlying issues. It turns out that recent OSes updated localization data, removing the need for several Apple-specific workarounds in the tests. Fixes #135385 --- .../ios.base.cons/dtor.uninitialized.pass.cpp | 6 +-- .../fstreams/filebuf.virtuals/setbuf.pass.cpp | 6 +-- .../istream.unformatted/sync.pass.cpp | 6 +-- .../locale.collate.byname/compare.pass.cpp | 16 ------ .../get_long_double_fr_FR.pass.cpp | 5 -- .../get_long_double_ru_RU.pass.cpp | 3 -- .../get_long_double_zh_CN.pass.cpp | 27 +++++----- .../put_long_double_fr_FR.pass.cpp | 5 -- .../put_long_double_ru_RU.pass.cpp | 3 -- .../put_long_double_zh_CN.pass.cpp | 43 ++++++++-------- .../curr_symbol.pass.cpp | 15 +----- .../grouping.pass.cpp | 5 -- .../neg_format.pass.cpp | 35 ++----------- .../pos_format.pass.cpp | 10 +--- .../facet.num.get.members/get_double.pass.cpp | 6 +-- .../facet.num.get.members/get_float.pass.cpp | 6 +-- .../get_long_double.pass.cpp | 6 +-- .../locale.numpunct.byname/grouping.pass.cpp | 7 +-- .../thousands_sep.pass.cpp | 5 +- .../time.duration.nonmember/ostream.pass.cpp | 10 ---- .../time/time.syn/formatter.duration.pass.cpp | 51 ------------------- .../time.syn/formatter.file_time.pass.cpp | 19 ------- .../time/time.syn/formatter.hh_mm_ss.pass.cpp | 35 ------------- .../time.syn/formatter.local_time.pass.cpp | 19 ------- .../time/time.syn/formatter.sys_time.pass.cpp | 19 ------- libcxx/test/support/locale_helpers.h | 12 +++++ libcxxabi/test/uncaught_exception.pass.cpp | 6 +-- 27 files changed, 68 insertions(+), 318 deletions(-) diff --git a/libcxx/test/libcxx/input.output/iostreams.base/ios.base/ios.base.cons/dtor.uninitialized.pass.cpp b/libcxx/test/libcxx/input.output/iostreams.base/ios.base/ios.base.cons/dtor.uninitialized.pass.cpp index f17c1483c4a99..16d66e3be14ee 100644 --- a/libcxx/test/libcxx/input.output/iostreams.base/ios.base/ios.base.cons/dtor.uninitialized.pass.cpp +++ 
b/libcxx/test/libcxx/input.output/iostreams.base/ios.base/ios.base.cons/dtor.uninitialized.pass.cpp @@ -6,14 +6,12 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // UNSUPPORTED: no-exceptions // The fix for issue 57964 requires an updated dylib due to explicit // instantiations. That means Apple backdeployment targets remain broken. -// XFAIL: using-built-library-before-llvm-19 +// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added +// XFAIL: using-built-library-before-llvm-19 && !darwin // diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp index 9d14abcedd423..00aa97a45cc24 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp @@ -6,16 +6,14 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // // basic_streambuf* setbuf(char_type* s, streamsize n) override; // This test requires the fix to https://llvm.org/PR60509 in the dylib, // which landed in 5afb937d8a30445642ccaf33866ee4cdd0713222. 
-// XFAIL: using-built-library-before-llvm-19 +// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added +// XFAIL: using-built-library-before-llvm-19 && !darwin #include #include diff --git a/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/sync.pass.cpp b/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/sync.pass.cpp index 3b685950d36a6..b04d2c07ebb1c 100644 --- a/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/sync.pass.cpp +++ b/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/sync.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // // int sync(); @@ -16,7 +13,8 @@ // The fix for bug 51497 and bug 51499 require and updated dylib due to // explicit instantiations. That means Apple backdeployment targets remain // broken. -// XFAIL: using-built-library-before-llvm-19 +// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added +// XFAIL: using-built-library-before-llvm-19 && !darwin #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.collate/locale.collate.byname/compare.pass.cpp b/libcxx/test/std/localization/locale.categories/category.collate/locale.collate.byname/compare.pass.cpp index 4905ed40f4a24..8ae6bc2d3ba66 100644 --- a/libcxx/test/std/localization/locale.categories/category.collate/locale.collate.byname/compare.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.collate/locale.collate.byname/compare.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // Bionic has minimal locale support, investigate this later. 
// XFAIL: LIBCXX-ANDROID-FIXME @@ -56,14 +53,7 @@ int main(int, char**) ASSERT_COMPARE(std::string, "AAA", "BBB", -1); ASSERT_COMPARE(std::string, "bbb", "aaa", 1); ASSERT_COMPARE(std::string, "ccc", "ccc", 0); - -#if defined(__APPLE__) - // Apple's default collation is case-sensitive - ASSERT_COMPARE(std::string, "aaaaaaA", "BaaaaaA", 1); -#else - // Glibc, Windows, and FreeBSD's default collation is case-insensitive ASSERT_COMPARE(std::string, "aaaaaaA", "BaaaaaA", -1); -#endif } #ifndef TEST_HAS_NO_WIDE_CHARACTERS { @@ -73,13 +63,7 @@ int main(int, char**) ASSERT_COMPARE(std::wstring, L"AAA", L"BBB", -1); ASSERT_COMPARE(std::wstring, L"bbb", L"aaa", 1); ASSERT_COMPARE(std::wstring, L"ccc", L"ccc", 0); -#if defined(__APPLE__) - // Apple's default collation is case-sensitive - ASSERT_COMPARE(std::wstring, L"aaaaaaA", L"BaaaaaA", 1); -#else - // Glibc, Windows, and FreeBSD's default collation is case-insensitive ASSERT_COMPARE(std::wstring, L"aaaaaaA", L"BaaaaaA", -1); -#endif } #endif } diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp index ea6b07934510a..c9ed59f3cb9aa 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp @@ -6,11 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - -// XFAIL: darwin - // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd diff --git 
a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp index f98758d086de1..371cf0e90c8d3 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp index 6980b7ae77db0..c86df7e6b53bf 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd @@ -158,7 +155,7 @@ int main(int, char**) std::noshowbase(ios); } { // negative one, showbase -#ifdef _AIX +#if defined(_AIX) || defined(__APPLE__) std::string v = "-" + currency_symbol + "0.01"; #else std::string v = currency_symbol + "-0.01"; @@ -172,7 +169,7 @@ int main(int, char**) assert(ex == -1); 
} { // negative one, showbase -#ifdef _AIX +#if defined(_AIX) || defined(__APPLE__) std::string v = "-" + currency_symbol + "0.01"; #else std::string v = currency_symbol + "-0.01"; @@ -212,7 +209,7 @@ int main(int, char**) std::noshowbase(ios); } { // negative, showbase -#ifdef _AIX +#if defined(_AIX) || defined(__APPLE__) std::string v = "-" + currency_symbol + "1,234,567.89"; #else std::string v = currency_symbol + "-1,234,567.89"; @@ -333,7 +330,7 @@ int main(int, char**) std::noshowbase(ios); } { // negative one, showbase -#if defined(TEST_HAS_GLIBC) || defined(_AIX) +#if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__) std::string v = "-" + currency_name + "0.01"; #else std::string v = currency_name + "-0.01"; @@ -348,7 +345,7 @@ int main(int, char**) assert(ex == -1); } { // negative one, showbase -#if defined(TEST_HAS_GLIBC) || defined(_AIX) +#if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__) std::string v = "-" + currency_name + "0.01"; #else std::string v = currency_name + "-0.01"; @@ -389,7 +386,7 @@ int main(int, char**) std::noshowbase(ios); } { // negative, showbase -#if defined(TEST_HAS_GLIBC) || defined(_AIX) +#if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__) std::string v = "-" + currency_name + "1,234,567.89"; #else std::string v = currency_name + "-1,234,567.89"; @@ -518,7 +515,7 @@ int main(int, char**) std::noshowbase(ios); } { // negative one, showbase -# ifdef _AIX +# if defined(_AIX) || defined(__APPLE__) std::wstring v = L"-" + w_currency_symbol + L"0.01"; # else std::wstring v = w_currency_symbol + L"-0.01"; @@ -532,7 +529,7 @@ int main(int, char**) assert(ex == -1); } { // negative one, showbase -# ifdef _AIX +# if defined(_AIX) || defined(__APPLE__) std::wstring v = L"-" + w_currency_symbol + L"0.01"; # else std::wstring v = w_currency_symbol + L"-0.01"; @@ -572,7 +569,7 @@ int main(int, char**) std::noshowbase(ios); } { // negative, showbase -# ifdef _AIX +# if defined(_AIX) || 
defined(__APPLE__) std::wstring v = L"-" + w_currency_symbol + L"1,234,567.89"; # else std::wstring v = w_currency_symbol + L"-1,234,567.89"; @@ -693,7 +690,7 @@ int main(int, char**) std::noshowbase(ios); } { // negative one, showbase -# if defined(TEST_HAS_GLIBC) || defined(_AIX) +# if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__) std::wstring v = L"-" + w_currency_name + L"0.01"; # else std::wstring v = w_currency_name + L"-0.01"; @@ -707,7 +704,7 @@ int main(int, char**) assert(ex == -1); } { // negative one, showbase -# if defined(TEST_HAS_GLIBC) || defined(_AIX) +# if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__) std::wstring v = L"-" + w_currency_name + L"0.01"; # else std::wstring v = w_currency_name + L"-0.01"; @@ -747,7 +744,7 @@ int main(int, char**) std::noshowbase(ios); } { // negative, showbase -# if defined(TEST_HAS_GLIBC) || defined(_AIX) +# if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__) std::wstring v = L"-" + w_currency_name + L"1,234,567.89"; # else std::wstring v = w_currency_name + L"-1,234,567.89"; diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp index 14745996b9fd1..f9d7998b07ff4 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp @@ -6,11 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - -// XFAIL: darwin - // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd diff --git 
a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp index 0455e5949c44a..be1e397488468 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp index 68640fabb73b0..25046a8417083 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd @@ -122,7 +119,7 @@ int main(int, char**) char str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), false, ios, '*', v); std::string ex(str, base(iter)); -#ifdef _AIX +#if defined(_AIX) || defined(__APPLE__) assert(ex == "-" + currency_symbol + "0.01"); #else assert(ex == 
currency_symbol + "-0.01"); @@ -142,7 +139,7 @@ int main(int, char**) char str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), false, ios, '*', v); std::string ex(str, base(iter)); -#ifdef _AIX +#if defined(_AIX) || defined(__APPLE__) assert(ex == "-" + currency_symbol + "1,234,567.89"); #else assert(ex == currency_symbol + "-1,234,567.89"); @@ -156,7 +153,7 @@ int main(int, char**) char str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), false, ios, ' ', v); std::string ex(str, base(iter)); -#ifdef _AIX +#if defined(_AIX) || defined(__APPLE__) assert(ex == "-" + currency_symbol + "1,234,567.89" + currency_symbol_padding); #else assert(ex == currency_symbol + "-1,234,567.89" + currency_symbol_padding); @@ -171,7 +168,7 @@ int main(int, char**) char str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), false, ios, ' ', v); std::string ex(str, base(iter)); -#ifdef _AIX +#if defined(_AIX) || defined(__APPLE__) assert(ex == "-" + currency_symbol + currency_symbol_padding + "1,234,567.89"); #else assert(ex == currency_symbol + "-" + currency_symbol_padding + "1,234,567.89"); @@ -186,7 +183,7 @@ int main(int, char**) char str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), false, ios, ' ', v); std::string ex(str, base(iter)); -#ifdef _AIX +#if defined(_AIX) || defined(__APPLE__) assert(ex == currency_symbol_padding + "-" + currency_symbol + "1,234,567.89"); #else assert(ex == currency_symbol_padding + currency_symbol + "-1,234,567.89"); @@ -239,7 +236,7 @@ int main(int, char**) char str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), true, ios, '*', v); std::string ex(str, base(iter)); -#if defined(TEST_HAS_GLIBC) || defined(_AIX) +#if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__) assert(ex == "-" + currency_name + "0.01"); #else assert(ex == currency_name + "-0.01"); @@ -259,7 +256,7 @@ int main(int, char**) char str[100]; cpp17_output_iterator iter = 
f.put(cpp17_output_iterator(str), true, ios, '*', v); std::string ex(str, base(iter)); -#if defined(TEST_HAS_GLIBC) || defined(_AIX) +#if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__) assert(ex == "-" + currency_name + "1,234,567.89"); #else assert(ex == currency_name + "-1,234,567.89"); @@ -273,7 +270,7 @@ int main(int, char**) char str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), true, ios, ' ', v); std::string ex(str, base(iter)); -#if defined(TEST_HAS_GLIBC) || defined(_AIX) +#if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__) assert(ex == "-" + currency_name + "1,234,567.89" + currency_name_padding); #else assert(ex == currency_name + "-1,234,567.89" + currency_name_padding); @@ -288,7 +285,7 @@ int main(int, char**) char str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), true, ios, ' ', v); std::string ex(str, base(iter)); -#if defined(TEST_HAS_GLIBC) || defined(_AIX) +#if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__) assert(ex == "-" + currency_name + currency_name_padding + "1,234,567.89"); #else assert(ex == currency_name + "-" + currency_name_padding + "1,234,567.89"); @@ -303,7 +300,7 @@ int main(int, char**) char str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), true, ios, ' ', v); std::string ex(str, base(iter)); -#if defined(TEST_HAS_GLIBC) || defined(_AIX) +#if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__) assert(ex == currency_name_padding + "-" + currency_name + "1,234,567.89"); #else assert(ex == currency_name_padding + currency_name + "-1,234,567.89"); @@ -366,7 +363,7 @@ int main(int, char**) wchar_t str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), false, ios, '*', v); std::wstring ex(str, base(iter)); -# ifdef _AIX +# if defined(_AIX) || defined(__APPLE__) assert(ex == L"-" + currency_symbol + L"0.01"); # else assert(ex == currency_symbol + L"-0.01"); @@ -386,7 +383,7 @@ int main(int, 
char**) wchar_t str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), false, ios, '*', v); std::wstring ex(str, base(iter)); -# ifdef _AIX +# if defined(_AIX) || defined(__APPLE__) assert(ex == L"-" + currency_symbol + L"1,234,567.89"); # else assert(ex == currency_symbol + L"-1,234,567.89"); @@ -400,7 +397,7 @@ int main(int, char**) wchar_t str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), false, ios, ' ', v); std::wstring ex(str, base(iter)); -# ifdef _AIX +# if defined(_AIX) || defined(__APPLE__) assert(ex == L"-" + currency_symbol + L"1,234,567.89 "); # else assert(ex == currency_symbol + L"-1,234,567.89 "); @@ -415,7 +412,7 @@ int main(int, char**) wchar_t str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), false, ios, ' ', v); std::wstring ex(str, base(iter)); -# ifdef _AIX +# if defined(_AIX) || defined(__APPLE__) assert(ex == L"-" + currency_symbol + L" 1,234,567.89"); # else assert(ex == currency_symbol + L"- 1,234,567.89"); @@ -430,7 +427,7 @@ int main(int, char**) wchar_t str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), false, ios, ' ', v); std::wstring ex(str, base(iter)); -# ifdef _AIX +# if defined(_AIX) || defined(__APPLE__) assert(ex == L" -" + currency_symbol + L"1,234,567.89"); # else assert(ex == L" " + currency_symbol + L"-1,234,567.89"); @@ -483,7 +480,7 @@ int main(int, char**) wchar_t str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), true, ios, '*', v); std::wstring ex(str, base(iter)); -# if defined(TEST_HAS_GLIBC) || defined(_AIX) +# if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__) assert(ex == L"-" + currency_name + L"0.01"); #else assert(ex == currency_name + L"-0.01"); @@ -503,7 +500,7 @@ int main(int, char**) wchar_t str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), true, ios, '*', v); std::wstring ex(str, base(iter)); -# if defined(TEST_HAS_GLIBC) || defined(_AIX) +# if 
defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__) assert(ex == L"-" + currency_name + L"1,234,567.89"); #else assert(ex == currency_name + L"-1,234,567.89"); @@ -517,7 +514,7 @@ int main(int, char**) wchar_t str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), true, ios, ' ', v); std::wstring ex(str, base(iter)); -# if defined(TEST_HAS_GLIBC) || defined(_AIX) +# if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__) assert(ex == L"-" + currency_name + L"1,234,567.89" + currency_name_padding); #else assert(ex == currency_name + L"-1,234,567.89" + currency_name_padding); @@ -532,7 +529,7 @@ int main(int, char**) wchar_t str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), true, ios, ' ', v); std::wstring ex(str, base(iter)); -# if defined(TEST_HAS_GLIBC) || defined(_AIX) +# if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__) assert(ex == L"-" + currency_name + currency_name_padding + L"1,234,567.89"); #else assert(ex == currency_name + L"-" + currency_name_padding + L"1,234,567.89"); @@ -547,7 +544,7 @@ int main(int, char**) wchar_t str[100]; cpp17_output_iterator iter = f.put(cpp17_output_iterator(str), true, ios, ' ', v); std::wstring ex(str, base(iter)); -# if defined(TEST_HAS_GLIBC) || defined(_AIX) +# if defined(TEST_HAS_GLIBC) || defined(_AIX) || defined(__APPLE__) assert(ex == currency_name_padding + L"-" + currency_name + L"1,234,567.89"); #else assert(ex == currency_name_padding + currency_name + L"-1,234,567.89"); diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/curr_symbol.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/curr_symbol.pass.cpp index 9c1253d47acd2..e7f0f29e87742 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/curr_symbol.pass.cpp +++ 
b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/curr_symbol.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd @@ -117,11 +114,7 @@ int main(int, char**) { Fnf f(LOCALE_fr_FR_UTF_8, 1); -#ifdef __APPLE__ - assert(f.curr_symbol() == " Eu"); -#else assert(f.curr_symbol() == " \u20ac"); -#endif } { Fnt f(LOCALE_fr_FR_UTF_8, 1); @@ -130,11 +123,7 @@ int main(int, char**) #ifndef TEST_HAS_NO_WIDE_CHARACTERS { Fwf f(LOCALE_fr_FR_UTF_8, 1); -#ifdef __APPLE__ - assert(f.curr_symbol() == L" Eu"); -#else assert(f.curr_symbol() == L" \u20ac"); -#endif } { Fwt f(LOCALE_fr_FR_UTF_8, 1); @@ -164,7 +153,7 @@ int main(int, char**) { Fnf f(LOCALE_zh_CN_UTF_8, 1); -#ifdef _WIN32 +#if defined(_WIN32) || defined(__APPLE__) assert(f.curr_symbol() == "\xC2\xA5"); // \u00A5 #else assert(f.curr_symbol() == "\xEF\xBF\xA5"); // \uFFE5 @@ -177,7 +166,7 @@ int main(int, char**) #ifndef TEST_HAS_NO_WIDE_CHARACTERS { Fwf f(LOCALE_zh_CN_UTF_8, 1); -#ifdef _WIN32 +#if defined(_WIN32) || defined(__APPLE__) assert(f.curr_symbol() == L"\u00A5"); #else assert(f.curr_symbol() == L"\uFFE5"); diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/grouping.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/grouping.pass.cpp index 630b2739c88a8..90dc6c4d7a2ab 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/grouping.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/grouping.pass.cpp @@ -6,11 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - -// XFAIL: darwin -// // 
NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/neg_format.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/neg_format.pass.cpp index a3e3d853524b5..e9528147dfe62 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/neg_format.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/neg_format.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd @@ -82,14 +79,6 @@ void assert_sign_symbol_none_value(std::money_base::pattern p) assert(p.field[3] == std::money_base::value); } -void assert_value_none_symbol_sign(std::money_base::pattern p) -{ - assert(p.field[0] == std::money_base::value); - assert(p.field[1] == std::money_base::none); - assert(p.field[2] == std::money_base::symbol); - assert(p.field[3] == std::money_base::sign); -} - void assert_sign_value_none_symbol(std::money_base::pattern p) { assert(p.field[0] == std::money_base::sign); @@ -149,39 +138,23 @@ int main(int, char**) { Fnf f(LOCALE_fr_FR_UTF_8, 1); std::money_base::pattern p = f.neg_format(); -#ifdef __APPLE__ - assert_value_none_symbol_sign(p); -#else assert_sign_value_none_symbol(p); -#endif } { Fnt f(LOCALE_fr_FR_UTF_8, 1); std::money_base::pattern p = f.neg_format(); -#ifdef __APPLE__ - assert_value_none_symbol_sign(p); -#else assert_sign_value_none_symbol(p); -#endif } #ifndef TEST_HAS_NO_WIDE_CHARACTERS { Fwf f(LOCALE_fr_FR_UTF_8, 1); std::money_base::pattern p = f.neg_format(); -#ifdef __APPLE__ - assert_value_none_symbol_sign(p); -#else assert_sign_value_none_symbol(p); -#endif } { Fwt f(LOCALE_fr_FR_UTF_8, 1); std::money_base::pattern p = 
f.neg_format(); -#ifdef __APPLE__ - assert_value_none_symbol_sign(p); -#else assert_sign_value_none_symbol(p); -#endif } #endif // TEST_HAS_NO_WIDE_CHARACTERS @@ -211,7 +184,7 @@ int main(int, char**) { Fnf f(LOCALE_zh_CN_UTF_8, 1); std::money_base::pattern p = f.neg_format(); -#ifdef _AIX +#if defined(_AIX) || defined(__APPLE__) assert_sign_symbol_none_value(p); #else assert_symbol_sign_none_value(p); @@ -220,7 +193,7 @@ int main(int, char**) { Fnt f(LOCALE_zh_CN_UTF_8, 1); std::money_base::pattern p = f.neg_format(); -#if defined(_WIN32) || defined(__APPLE__) +#if defined(_WIN32) assert_symbol_sign_none_value(p); #else assert_sign_symbol_none_value(p); @@ -230,7 +203,7 @@ int main(int, char**) { Fwf f(LOCALE_zh_CN_UTF_8, 1); std::money_base::pattern p = f.neg_format(); -#ifdef _AIX +#if defined(_AIX) || defined(__APPLE__) assert_sign_symbol_none_value(p); #else assert_symbol_sign_none_value(p); @@ -239,7 +212,7 @@ int main(int, char**) { Fwt f(LOCALE_zh_CN_UTF_8, 1); std::money_base::pattern p = f.neg_format(); -#if defined(_WIN32) || defined(__APPLE__) +#if defined(_WIN32) assert_symbol_sign_none_value(p); #else assert_sign_symbol_none_value(p); diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/pos_format.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/pos_format.pass.cpp index 671620a0c2f92..11832a7d89278 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/pos_format.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/pos_format.pass.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// + // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd @@ -79,14 +79,6 @@ void assert_sign_symbol_none_value(std::money_base::pattern p) 
assert(p.field[3] == std::money_base::value); } -void assert_value_none_symbol_sign(std::money_base::pattern p) -{ - assert(p.field[0] == std::money_base::value); - assert(p.field[1] == std::money_base::none); - assert(p.field[2] == std::money_base::symbol); - assert(p.field[3] == std::money_base::sign); -} - void assert_sign_value_none_symbol(std::money_base::pattern p) { assert(p.field[0] == std::money_base::sign); diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp index 612d3738a373f..31682fea43bc4 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp @@ -6,12 +6,10 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed behavior of // FP parsing. This requires 3e15c97fa3812993bdc319827a5c6d867b765ae8 in the dylib. 
-// XFAIL: using-built-library-before-llvm-19 +// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added +// XFAIL: using-built-library-before-llvm-19 && !darwin // diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp index 58bc9e5abef87..57eedc8633be3 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp @@ -6,12 +6,10 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed behavior of // FP parsing. This requires 3e15c97fa3812993bdc319827a5c6d867b765ae8 in the dylib. 
-// XFAIL: using-built-library-before-llvm-19 +// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added +// XFAIL: using-built-library-before-llvm-19 && !darwin // diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp index bf8bb651d6bce..8324ee317014d 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp @@ -6,12 +6,10 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed behavior of // FP parsing. This requires 3e15c97fa3812993bdc319827a5c6d867b765ae8 in the dylib. 
-// XFAIL: using-built-library-before-llvm-19 +// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added +// XFAIL: using-built-library-before-llvm-19 && !darwin // diff --git a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/grouping.pass.cpp b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/grouping.pass.cpp index a87c5e0ace28a..11ec75469c704 100644 --- a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/grouping.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/grouping.pass.cpp @@ -5,10 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// - -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - +// // NetBSD does not support LC_NUMERIC at the moment // XFAIL: netbsd @@ -63,7 +60,7 @@ int main(int, char**) } { std::locale l(LOCALE_fr_FR_UTF_8); -#if defined(TEST_HAS_GLIBC) || defined(_WIN32) || defined(_AIX) +#if defined(TEST_HAS_GLIBC) || defined(_WIN32) || defined(_AIX) || defined(__APPLE__) const char* const group = "\3"; #else const char* const group = "\x7f"; diff --git a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp index ef39e8aa7b685..53f2c8554f3d7 100644 --- a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // NetBSD does not support LC_NUMERIC at the moment // 
XFAIL: netbsd @@ -69,7 +66,7 @@ int main(int, char**) // The below tests work around GLIBC's use of U202F as LC_NUMERIC thousands_sep. std::locale l(LOCALE_fr_FR_UTF_8); { -#if defined(_CS_GNU_LIBC_VERSION) || defined(_WIN32) || defined(_AIX) +#if defined(_CS_GNU_LIBC_VERSION) || defined(_WIN32) || defined(_AIX) || defined(__APPLE__) const char sep = ' '; #else const char sep = ','; diff --git a/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp b/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp index 4e84db9a84d78..97ac04275b0b6 100644 --- a/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp +++ b/libcxx/test/std/time/time.duration/time.duration.nonmember/ostream.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-localization // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME @@ -83,17 +80,10 @@ static void test_values() { assert(stream_c_locale(1'000.123456s) == SV("1000.1235s")); if constexpr (std::same_as) { -#if defined(__APPLE__) - assert(stream_fr_FR_locale(-1'000'000s) == SV("-1000000s")); - assert(stream_fr_FR_locale(1'000'000s) == SV("1000000s")); - assert(stream_fr_FR_locale(-1'000.123456s) == SV("-1000,1235s")); - assert(stream_fr_FR_locale(1'000.123456s) == SV("1000,1235s")); -#else assert(stream_fr_FR_locale(-1'000'000s) == SV("-1 000 000s")); assert(stream_fr_FR_locale(1'000'000s) == SV("1 000 000s")); assert(stream_fr_FR_locale(-1'000.123456s) == SV("-1 000,1235s")); assert(stream_fr_FR_locale(1'000.123456s) == SV("1 000,1235s")); -#endif } else { #ifndef TEST_HAS_NO_WIDE_CHARACTERS assert(stream_fr_FR_locale(-1'000'000s) == L"-1" FR_THOU_SEP "000" FR_THOU_SEP "000s"); diff --git a/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp b/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp index 
973bce8f81d41..f1f7debed2464 100644 --- a/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-localization // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME @@ -408,19 +405,11 @@ static void test_valid_positive_integral_values() { "%OM='00'\t" "%S='00'\t" "%OS='00'\t" -# if defined(__APPLE__) - "%p='AM'\t" -# else "%p='午前'\t" -# endif "%R='00:00'\t" "%T='00:00:00'\t" # if defined(__APPLE__) || defined(__FreeBSD__) -# if defined(__APPLE__) - "%r='12:00:00 AM'\t" -# else "%r='12:00:00 午前'\t" -# endif "%X='00時00分00秒'\t" "%EX='00時00分00秒'\t" # elif defined(_WIN32) @@ -448,19 +437,11 @@ static void test_valid_positive_integral_values() { "%OM='59'\t" "%S='59'\t" "%OS='59'\t" -# if defined(__APPLE__) - "%p='AM'\t" -# else "%p='午前'\t" -# endif "%R='11:59'\t" "%T='11:59:59'\t" # if defined(__APPLE__) || defined(__FreeBSD__) -# if defined(__APPLE__) - "%r='11:59:59 AM'\t" -# else "%r='11:59:59 午前'\t" -# endif "%X='11時59分59秒'\t" "%EX='11時59分59秒'\t" # elif defined(_WIN32) @@ -488,19 +469,11 @@ static void test_valid_positive_integral_values() { "%OM='00'\t" "%S='00'\t" "%OS='00'\t" -# if defined(__APPLE__) - "%p='PM'\t" -# else "%p='午後'\t" -# endif "%R='12:00'\t" "%T='12:00:00'\t" # if defined(__APPLE__) || defined(__FreeBSD__) -# if defined(__APPLE__) - "%r='12:00:00 PM'\t" -# else "%r='12:00:00 午後'\t" -# endif "%X='12時00分00秒'\t" "%EX='12時00分00秒'\t" # else @@ -528,19 +501,11 @@ static void test_valid_positive_integral_values() { "%OM='59'\t" "%S='59'\t" "%OS='59'\t" -# if defined(__APPLE__) - "%p='PM'\t" -# else "%p='午後'\t" -# endif "%R='23:59'\t" "%T='23:59:59'\t" # if defined(__APPLE__) || defined(__FreeBSD__) -# if defined(__APPLE__) - "%r='11:59:59 PM'\t" -# else "%r='11:59:59 午後'\t" -# endif 
"%X='23時59分59秒'\t" "%EX='23時59分59秒'\t" # else @@ -568,19 +533,11 @@ static void test_valid_positive_integral_values() { "%OM='00'\t" "%S='00'\t" "%OS='00'\t" -# if defined(__APPLE__) - "%p='AM'\t" -# else "%p='午前'\t" -# endif "%R='00:00'\t" "%T='00:00:00'\t" # if defined(__APPLE__) || defined(__FreeBSD__) -# if defined(__APPLE__) - "%r='12:00:00 AM'\t" -# else "%r='12:00:00 午前'\t" -# endif "%X='00時00分00秒'\t" "%EX='00時00分00秒'\t" # elif defined(_WIN32) @@ -835,19 +792,11 @@ static void test_valid_negative_integral_values() { "%OM='59'\t" "%S='59'\t" "%OS='59'\t" -# if defined(__APPLE__) - "%p='PM'\t" -# else "%p='午後'\t" -# endif "%R='23:59'\t" "%T='23:59:59'\t" # if defined(__APPLE__) || defined(__FreeBSD__) -# if defined(__APPLE__) - "%r='11:59:59 PM'\t" -# else "%r='11:59:59 午後'\t" -# endif "%X='23時59分59秒'\t" "%EX='23時59分59秒'\t" # elif defined(_WIN32) diff --git a/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp index 28a972b19dcef..e258c4161eda4 100644 --- a/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-localization // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME @@ -695,19 +692,11 @@ static void test_valid_values_time() { "%OM='00'\t" "%S='00'\t" "%OS='00'\t" -# if defined(__APPLE__) - "%p='AM'\t" -# else "%p='午前'\t" -# endif "%R='00:00'\t" "%T='00:00:00'\t" # if defined(__APPLE__) || defined(__FreeBSD__) -# if defined(__APPLE__) - "%r='12:00:00 AM'\t" -# else "%r='12:00:00 午前'\t" -# endif "%X='00時00分00秒'\t" "%EX='00時00分00秒'\t" # elif defined(_WIN32) @@ -732,19 +721,11 @@ static void test_valid_values_time() { "%OM='31'\t" "%S='30.123'\t" "%OS='30.123'\t" -# if defined(__APPLE__) - "%p='PM'\t" -# else 
"%p='午後'\t" -# endif "%R='23:31'\t" "%T='23:31:30.123'\t" # if defined(__APPLE__) || defined(__FreeBSD__) -# if defined(__APPLE__) - "%r='11:31:30 PM'\t" -# else "%r='11:31:30 午後'\t" -# endif "%X='23時31分30秒'\t" "%EX='23時31分30秒'\t" # elif defined(_WIN32) diff --git a/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp b/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp index 82d9b4c7540a7..bbd9c074bef24 100644 --- a/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-localization // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME @@ -302,19 +299,11 @@ static void test_valid_values() { "%OM='00'\t" "%S='00'\t" "%OS='00'\t" -# if defined(__APPLE__) - "%p='AM'\t" -# else "%p='午前'\t" -# endif "%R='00:00'\t" "%T='00:00:00'\t" # if defined(__APPLE__) || defined(__FreeBSD__) -# if defined(__APPLE__) - "%r='12:00:00 AM'\t" -# else "%r='12:00:00 午前'\t" -# endif "%X='00時00分00秒'\t" "%EX='00時00分00秒'\t" # elif defined(_WIN32) @@ -339,19 +328,11 @@ static void test_valid_values() { "%OM='31'\t" "%S='30.123'\t" "%OS='30.123'\t" -# if defined(__APPLE__) - "%p='PM'\t" -# else "%p='午後'\t" -# endif "%R='23:31'\t" "%T='23:31:30.123'\t" # if defined(__APPLE__) || defined(__FreeBSD__) -# if defined(__APPLE__) - "%r='11:31:30 PM'\t" -# else "%r='11:31:30 午後'\t" -# endif "%X='23時31分30秒'\t" "%EX='23時31分30秒'\t" # elif defined(_WIN32) @@ -376,19 +357,11 @@ static void test_valid_values() { "%OM='02'\t" "%S='01.123456789012'\t" "%OS='01.123456789012'\t" -# if defined(__APPLE__) - "%p='AM'\t" -# else "%p='午前'\t" -# endif "%R='03:02'\t" "%T='03:02:01.123456789012'\t" # if defined(__APPLE__) || defined(__FreeBSD__) -# if defined(__APPLE__) - "%r='03:02:01 AM'\t" -# else "%r='03:02:01 午前'\t" -# endif 
"%X='03時02分01秒'\t" "%EX='03時02分01秒'\t" # elif defined(_WIN32) @@ -413,19 +386,11 @@ static void test_valid_values() { "%OM='01'\t" "%S='01'\t" "%OS='01'\t" -# if defined(__APPLE__) - "%p='AM'\t" -# else "%p='午前'\t" -# endif "%R='01:01'\t" "%T='01:01:01'\t" # if defined(__APPLE__) || defined(__FreeBSD__) -# if defined(__APPLE__) - "%r='01:01:01 AM'\t" -# else "%r='01:01:01 午前'\t" -# endif "%X='01時01分01秒'\t" "%EX='01時01分01秒'\t" # elif defined(_WIN32) diff --git a/libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp index bd23337ccb318..ce3af8ec199ae 100644 --- a/libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-localization // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME @@ -694,19 +691,11 @@ static void test_valid_values_time() { "%OM='00'\t" "%S='00'\t" "%OS='00'\t" -# if defined(__APPLE__) - "%p='AM'\t" -# else "%p='午前'\t" -# endif "%R='00:00'\t" "%T='00:00:00'\t" # if defined(__APPLE__) || defined(__FreeBSD__) -# if defined(__APPLE__) - "%r='12:00:00 AM'\t" -# else "%r='12:00:00 午前'\t" -# endif "%X='00時00分00秒'\t" "%EX='00時00分00秒'\t" # elif defined(_WIN32) @@ -731,19 +720,11 @@ static void test_valid_values_time() { "%OM='31'\t" "%S='30.123'\t" "%OS='30.123'\t" -# if defined(__APPLE__) - "%p='PM'\t" -# else "%p='午後'\t" -# endif "%R='23:31'\t" "%T='23:31:30.123'\t" # if defined(__APPLE__) || defined(__FreeBSD__) -# if defined(__APPLE__) - "%r='11:31:30 PM'\t" -# else "%r='11:31:30 午後'\t" -# endif "%X='23時31分30秒'\t" "%EX='23時31分30秒'\t" # elif defined(_WIN32) diff --git a/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp index 
9c9c8e0de1e93..9238f3daf1f81 100644 --- a/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-localization // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME @@ -691,19 +688,11 @@ static void test_valid_values_time() { "%OM='00'\t" "%S='00'\t" "%OS='00'\t" -# if defined(__APPLE__) - "%p='AM'\t" -# else "%p='午前'\t" -# endif "%R='00:00'\t" "%T='00:00:00'\t" # if defined(__APPLE__) || defined(__FreeBSD__) -# if defined(__APPLE__) - "%r='12:00:00 AM'\t" -# else "%r='12:00:00 午前'\t" -# endif "%X='00時00分00秒'\t" "%EX='00時00分00秒'\t" # elif defined(_WIN32) @@ -728,19 +717,11 @@ static void test_valid_values_time() { "%OM='31'\t" "%S='30.123'\t" "%OS='30.123'\t" -# if defined(__APPLE__) - "%p='PM'\t" -# else "%p='午後'\t" -# endif "%R='23:31'\t" "%T='23:31:30.123'\t" # if defined(__APPLE__) || defined(__FreeBSD__) -# if defined(__APPLE__) - "%r='11:31:30 PM'\t" -# else "%r='11:31:30 午後'\t" -# endif "%X='23時31分30秒'\t" "%EX='23時31分30秒'\t" # elif defined(_WIN32) diff --git a/libcxx/test/support/locale_helpers.h b/libcxx/test/support/locale_helpers.h index 946c2fed0f3a5..3cec7397e3d7e 100644 --- a/libcxx/test/support/locale_helpers.h +++ b/libcxx/test/support/locale_helpers.h @@ -73,6 +73,12 @@ MultiStringType currency_symbol_ru_RU() { return MKSTR("\u20BD"); // U+20BD RUBLE SIGN #elif defined(_WIN32) || defined(__FreeBSD__) || defined(_AIX) return MKSTR("\u20BD"); // U+20BD RUBLE SIGN +#elif defined(__APPLE__) + if (__builtin_available(macOS 15.4, *)) { + return MKSTR("\u20BD"); // U+20BD RUBLE SIGN + } else { + return MKSTR("\u0440\u0443\u0431."); + } #else return MKSTR("\u0440\u0443\u0431."); #endif @@ -81,6 +87,12 @@ MultiStringType currency_symbol_ru_RU() { MultiStringType currency_symbol_zh_CN() { 
#if defined(_WIN32) return MKSTR("\u00A5"); // U+00A5 YEN SIGN +#elif defined(__APPLE__) + if (__builtin_available(macOS 15.4, *)) { + return MKSTR("\u00A5"); // U+00A5 YEN SIGN + } else { + return MKSTR("\uFFE5"); // U+FFE5 FULLWIDTH YEN SIGN + } #else return MKSTR("\uFFE5"); // U+FFE5 FULLWIDTH YEN SIGN #endif diff --git a/libcxxabi/test/uncaught_exception.pass.cpp b/libcxxabi/test/uncaught_exception.pass.cpp index 8e8468c43240d..e97732006e110 100644 --- a/libcxxabi/test/uncaught_exception.pass.cpp +++ b/libcxxabi/test/uncaught_exception.pass.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -// TODO(mordante) Investigate -// UNSUPPORTED: apple-clang - // UNSUPPORTED: no-exceptions // This tests that libc++abi still provides __cxa_uncaught_exception() for @@ -18,7 +15,8 @@ // to undefined symbols when linking against a libc++ that re-exports the symbols, // but running against a libc++ that doesn't. Fortunately, usage of __cxa_uncaught_exception() // in the wild seems to be close to non-existent. 
-// XFAIL: using-built-library-before-llvm-19 +// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added +// XFAIL: using-built-library-before-llvm-19 && !darwin #include #include From aaa86c5109f50d8780b4c4a17820ef957f3b9d0e Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Thu, 30 Oct 2025 09:03:45 -0700 Subject: [PATCH 209/539] [InstrProf] Remove deprecated -debug-info-correlate flag (#165289) --- .../Instrumentation/PGOInstrumentation.h | 2 -- llvm/lib/Frontend/Driver/CodeGenOptions.cpp | 4 +--- .../Instrumentation/InstrProfiling.cpp | 20 +++++-------------- .../Instrumentation/PGOInstrumentation.cpp | 2 +- 4 files changed, 7 insertions(+), 21 deletions(-) diff --git a/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h b/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h index ced446dacb6cc..9dcd4b53a0dbe 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h +++ b/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h @@ -26,8 +26,6 @@ namespace llvm { -LLVM_ABI extern cl::opt DebugInfoCorrelate; - class Function; class Instruction; class Module; diff --git a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp index df884908845d2..b546e816419e3 100644 --- a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp +++ b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp @@ -12,7 +12,6 @@ #include "llvm/TargetParser/Triple.h" namespace llvm { -extern llvm::cl::opt DebugInfoCorrelate; extern llvm::cl::opt ProfileCorrelate; } // namespace llvm @@ -64,8 +63,7 @@ TargetLibraryInfoImpl *createTLII(const llvm::Triple &TargetTriple, } std::string getDefaultProfileGenName() { - return llvm::DebugInfoCorrelate || - llvm::ProfileCorrelate != InstrProfCorrelator::NONE + return llvm::ProfileCorrelate != InstrProfCorrelator::NONE ? 
"default_%m.proflite" : "default_%m.profraw"; } diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 7795cce9d9d3c..b5548d4f24a2f 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -69,14 +69,6 @@ namespace llvm { // Command line option to enable vtable value profiling. Defined in // ProfileData/InstrProf.cpp: -enable-vtable-value-profiling= extern cl::opt EnableVTableValueProfiling; -// TODO: Remove -debug-info-correlate in next LLVM release, in favor of -// -profile-correlate=debug-info. -cl::opt DebugInfoCorrelate( - "debug-info-correlate", - cl::desc("Use debug info to correlate profiles. (Deprecated, use " - "-profile-correlate=debug-info)"), - cl::init(false)); - LLVM_ABI cl::opt ProfileCorrelate( "profile-correlate", cl::desc("Use debug info or binary file to correlate profiles."), @@ -1047,7 +1039,7 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { // in lightweight mode. We need to move the value profile pointer to the // Counter struct to get this working. 
assert( - !DebugInfoCorrelate && ProfileCorrelate == InstrProfCorrelator::NONE && + ProfileCorrelate == InstrProfCorrelator::NONE && "Value profiling is not yet supported with lightweight instrumentation"); GlobalVariable *Name = Ind->getName(); auto It = ProfileDataMap.find(Name); @@ -1504,7 +1496,7 @@ static inline Constant *getVTableAddrForProfData(GlobalVariable *GV) { } void InstrLowerer::getOrCreateVTableProfData(GlobalVariable *GV) { - assert(!DebugInfoCorrelate && + assert(ProfileCorrelate != InstrProfCorrelator::DEBUG_INFO && "Value profiling is not supported with lightweight instrumentation"); if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage()) return; @@ -1584,8 +1576,7 @@ GlobalVariable *InstrLowerer::setupProfileSection(InstrProfInstBase *Inc, // Use internal rather than private linkage so the counter variable shows up // in the symbol table when using debug info for correlation. - if ((DebugInfoCorrelate || - ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) && + if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO && TT.isOSBinFormatMachO() && Linkage == GlobalValue::PrivateLinkage) Linkage = GlobalValue::InternalLinkage; @@ -1691,8 +1682,7 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) { auto *CounterPtr = setupProfileSection(Inc, IPSK_cnts); PD.RegionCounters = CounterPtr; - if (DebugInfoCorrelate || - ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) { + if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) { LLVMContext &Ctx = M.getContext(); Function *Fn = Inc->getParent()->getParent(); if (auto *SP = Fn->getSubprogram()) { @@ -1737,7 +1727,7 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) { void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { // When debug information is correlated to profile data, a data variable // is not needed. 
- if (DebugInfoCorrelate || ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) + if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) return; GlobalVariable *NamePtr = Inc->getName(); diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 71736cfa4d89a..af53fa0bae468 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -456,7 +456,7 @@ createIRLevelProfileFlagVar(Module &M, ProfileVersion |= VARIANT_MASK_INSTR_ENTRY; if (PGOInstrumentLoopEntries) ProfileVersion |= VARIANT_MASK_INSTR_LOOP_ENTRIES; - if (DebugInfoCorrelate || ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) + if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) ProfileVersion |= VARIANT_MASK_DBG_CORRELATE; if (PGOFunctionEntryCoverage) ProfileVersion |= From 4fd1eae7f4781d4a1f7f2f4aea83ca14407333e4 Mon Sep 17 00:00:00 2001 From: Ryotaro Kasuga Date: Fri, 31 Oct 2025 01:06:30 +0900 Subject: [PATCH 210/539] [DA] Check nsw when extracting a constant operand of SCEVMul (#164408) Given a `SCEVMulExpr` such as `5 * %m`, `gcdMIVtest` in DA assumes the value as a multiple of 5 in a mathematical sense. However, this is not necessarily true if `5 * %m` overflows, especially because an odd number has an inverse modulo `2^64`. Such incorrect assumptions can lead to invalid analysis results. This patch stops unconditionally extracting a constant operand from `SCEVMulExpr`. Instead, it only allows this when the `SCEVMulExpr` has the `nsw` flag. 
--- llvm/lib/Analysis/DependenceAnalysis.cpp | 22 +++++++++++-------- llvm/test/Analysis/DependenceAnalysis/GCD.ll | 6 ++--- .../DependenceAnalysis/SymbolicSIV.ll | 4 ++-- .../compute-absolute-value.ll | 2 +- .../DependenceAnalysis/gcd-miv-overflow.ll | 15 +++++-------- 5 files changed, 25 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 84ee8c0bf3e18..11d829492a10e 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -2854,14 +2854,18 @@ bool DependenceInfo::testMIV(const SCEV *Src, const SCEV *Dst, banerjeeMIVtest(Src, Dst, Loops, Result); } -// Given a product, e.g., 10*X*Y, returns the first constant operand, -// in this case 10. If there is no constant part, returns std::nullopt. -static std::optional getConstantPart(const SCEV *Expr) { +/// Given a SCEVMulExpr, returns its first operand if its first operand is a +/// constant and the product doesn't overflow in a signed sense. Otherwise, +/// returns std::nullopt. For example, given (10 * X * Y), it returns 10. +/// Notably, if it doesn't have nsw, the multiplication may overflow, and if +/// so, it may not a multiple of 10. +static std::optional getConstanCoefficient(const SCEV *Expr) { if (const auto *Constant = dyn_cast(Expr)) return Constant->getAPInt(); if (const auto *Product = dyn_cast(Expr)) if (const auto *Constant = dyn_cast(Product->getOperand(0))) - return Constant->getAPInt(); + if (Product->hasNoSignedWrap()) + return Constant->getAPInt(); return std::nullopt; } @@ -2887,7 +2891,7 @@ bool DependenceInfo::accumulateCoefficientsGCD(const SCEV *Expr, if (AddRec->getLoop() == CurLoop) { CurLoopCoeff = Step; } else { - std::optional ConstCoeff = getConstantPart(Step); + std::optional ConstCoeff = getConstanCoefficient(Step); // If the coefficient is the product of a constant and other stuff, we can // use the constant in the GCD computation. 
@@ -2940,7 +2944,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, const SCEV *Coeff = AddRec->getStepRecurrence(*SE); // If the coefficient is the product of a constant and other stuff, // we can use the constant in the GCD computation. - std::optional ConstCoeff = getConstantPart(Coeff); + std::optional ConstCoeff = getConstanCoefficient(Coeff); if (!ConstCoeff) return false; RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs()); @@ -2958,7 +2962,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, const SCEV *Coeff = AddRec->getStepRecurrence(*SE); // If the coefficient is the product of a constant and other stuff, // we can use the constant in the GCD computation. - std::optional ConstCoeff = getConstantPart(Coeff); + std::optional ConstCoeff = getConstanCoefficient(Coeff); if (!ConstCoeff) return false; RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs()); @@ -2979,7 +2983,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, } else if (const SCEVMulExpr *Product = dyn_cast(Operand)) { // Search for constant operand to participate in GCD; // If none found; return false. - std::optional ConstOp = getConstantPart(Product); + std::optional ConstOp = getConstanCoefficient(Product); if (!ConstOp) return false; ExtraGCD = APIntOps::GreatestCommonDivisor(ExtraGCD, ConstOp->abs()); @@ -3032,7 +3036,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, Delta = SE->getMinusSCEV(SrcCoeff, DstCoeff); // If the coefficient is the product of a constant and other stuff, // we can use the constant in the GCD computation. - std::optional ConstCoeff = getConstantPart(Delta); + std::optional ConstCoeff = getConstanCoefficient(Delta); if (!ConstCoeff) // The difference of the two coefficients might not be a product // or constant, in which case we give up on this direction. 
diff --git a/llvm/test/Analysis/DependenceAnalysis/GCD.ll b/llvm/test/Analysis/DependenceAnalysis/GCD.ll index 03343e7a98211..cb14d189afe4c 100644 --- a/llvm/test/Analysis/DependenceAnalysis/GCD.ll +++ b/llvm/test/Analysis/DependenceAnalysis/GCD.ll @@ -254,7 +254,7 @@ define void @gcd4(ptr %A, ptr %B, i64 %M, i64 %N) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 ; CHECK-NEXT: da analyze - output [* *]! ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx16, align 4 -; CHECK-NEXT: da analyze - none! +; CHECK-NEXT: da analyze - flow [* *|<]! ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.11, align 4 ; CHECK-NEXT: da analyze - confused! ; CHECK-NEXT: Src: %0 = load i32, ptr %arrayidx16, align 4 --> Dst: %0 = load i32, ptr %arrayidx16, align 4 @@ -322,7 +322,7 @@ define void @gcd5(ptr %A, ptr %B, i64 %M, i64 %N) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 ; CHECK-NEXT: da analyze - output [* *]! ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx16, align 4 -; CHECK-NEXT: da analyze - flow [<> *]! +; CHECK-NEXT: da analyze - flow [* *|<]! ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.11, align 4 ; CHECK-NEXT: da analyze - confused! ; CHECK-NEXT: Src: %0 = load i32, ptr %arrayidx16, align 4 --> Dst: %0 = load i32, ptr %arrayidx16, align 4 @@ -390,7 +390,7 @@ define void @gcd6(i64 %n, ptr %A, ptr %B) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx5, align 4 --> Dst: store i32 %conv, ptr %arrayidx5, align 4 ; CHECK-NEXT: da analyze - output [* *]! ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx5, align 4 --> Dst: %2 = load i32, ptr %arrayidx9, align 4 -; CHECK-NEXT: da analyze - none! 
+; CHECK-NEXT: da analyze - flow [* *|<]! ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx5, align 4 --> Dst: store i32 %2, ptr %B.addr.12, align 4 ; CHECK-NEXT: da analyze - confused! ; CHECK-NEXT: Src: %2 = load i32, ptr %arrayidx9, align 4 --> Dst: %2 = load i32, ptr %arrayidx9, align 4 diff --git a/llvm/test/Analysis/DependenceAnalysis/SymbolicSIV.ll b/llvm/test/Analysis/DependenceAnalysis/SymbolicSIV.ll index cdfaec76fa892..73a415baef4c4 100644 --- a/llvm/test/Analysis/DependenceAnalysis/SymbolicSIV.ll +++ b/llvm/test/Analysis/DependenceAnalysis/SymbolicSIV.ll @@ -384,7 +384,7 @@ define void @symbolicsiv6(ptr %A, ptr %B, i64 %n, i64 %N, i64 %M) nounwind uwtab ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 ; CHECK-NEXT: da analyze - none! ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx7, align 4 -; CHECK-NEXT: da analyze - none! +; CHECK-NEXT: da analyze - flow [*|<]! ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 ; CHECK-NEXT: da analyze - confused! ; CHECK-NEXT: Src: %0 = load i32, ptr %arrayidx7, align 4 --> Dst: %0 = load i32, ptr %arrayidx7, align 4 @@ -440,7 +440,7 @@ define void @symbolicsiv7(ptr %A, ptr %B, i64 %n, i64 %N, i64 %M) nounwind uwtab ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 ; CHECK-NEXT: da analyze - none! ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %1 = load i32, ptr %arrayidx6, align 4 -; CHECK-NEXT: da analyze - flow [<>]! +; CHECK-NEXT: da analyze - flow [*|<]! ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %1, ptr %B.addr.02, align 4 ; CHECK-NEXT: da analyze - confused! 
; CHECK-NEXT: Src: %1 = load i32, ptr %arrayidx6, align 4 --> Dst: %1 = load i32, ptr %arrayidx6, align 4 diff --git a/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll b/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll index 64fad37ab699a..783150af2cd13 100644 --- a/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll +++ b/llvm/test/Analysis/DependenceAnalysis/compute-absolute-value.ll @@ -18,7 +18,7 @@ define void @unknown_sign(ptr %a, i64 %k) { ; CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.0, align 1 ; CHECK-NEXT: da analyze - none! ; CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 2, ptr %idx.1, align 1 -; CHECK-NEXT: da analyze - output [<>]! +; CHECK-NEXT: da analyze - output [*|<]! ; CHECK-NEXT: Src: store i8 2, ptr %idx.1, align 1 --> Dst: store i8 2, ptr %idx.1, align 1 ; CHECK-NEXT: da analyze - none! ; diff --git a/llvm/test/Analysis/DependenceAnalysis/gcd-miv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/gcd-miv-overflow.ll index 43f66dd7d0974..9169ac323d834 100644 --- a/llvm/test/Analysis/DependenceAnalysis/gcd-miv-overflow.ll +++ b/llvm/test/Analysis/DependenceAnalysis/gcd-miv-overflow.ll @@ -13,23 +13,20 @@ ; offset1 += 3; ; } ; -; FIXME: DependenceAnalysis currently detects no dependency between the two -; stores, but it does exist. E.g., consider `m` is 12297829382473034411, which -; is a modular multiplicative inverse of 3 under modulo 2^64. Then `offset0` is -; effectively `i + 4`, so accesses will be as follows: +; Dependency exists between the two stores. E.g., consider `m` is +; 12297829382473034411, which is a modular multiplicative inverse of 3 under +; modulo 2^64. Then `offset0` is effectively `i + 4`, so accesses will be as +; follows: ; ; - A[offset0] : A[4], A[5], A[6], ... ; - A[offset1] : A[0], A[3], A[6], ... ; -; The root cause is that DA interprets `3*m` in non-modular arithmetic, which -; isn't necessarily true due to overflow. 
-; define void @gcdmiv_coef_ovfl(ptr %A, i64 %m) { ; CHECK-ALL-LABEL: 'gcdmiv_coef_ovfl' ; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 ; CHECK-ALL-NEXT: da analyze - none! ; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 -; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: da analyze - output [*|<]! ; CHECK-ALL-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 ; CHECK-ALL-NEXT: da analyze - none! ; @@ -37,7 +34,7 @@ define void @gcdmiv_coef_ovfl(ptr %A, i64 %m) { ; CHECK-GCD-MIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 ; CHECK-GCD-MIV-NEXT: da analyze - consistent output [*]! ; CHECK-GCD-MIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 -; CHECK-GCD-MIV-NEXT: da analyze - none! +; CHECK-GCD-MIV-NEXT: da analyze - consistent output [*|<]! ; CHECK-GCD-MIV-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 ; CHECK-GCD-MIV-NEXT: da analyze - consistent output [*]! ; From c8c4f91619cf263968fd44de17d063d90e6fd145 Mon Sep 17 00:00:00 2001 From: Ebin-McW Date: Thu, 30 Oct 2025 21:52:42 +0530 Subject: [PATCH 211/539] [Flang] Solved issue with inline compiler directive (#143699) Issue was with pointer passing. Fixes #139297 --- flang/lib/Parser/prescan.cpp | 2 +- flang/test/Parser/inline-directives.f90 | 29 +++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 flang/test/Parser/inline-directives.f90 diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index 4739da0676fa9..fd69404f313d3 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -557,7 +557,7 @@ bool Prescanner::MustSkipToEndOfLine() const { return true; // skip over ignored columns in right margin (73:80) } else if (*at_ == '!' 
&& !inCharLiteral_ && (!inFixedForm_ || tabInCurrentLine_ || column_ != 6)) { - return !IsCompilerDirectiveSentinel(at_); + return !IsCompilerDirectiveSentinel(at_ + 1); } else { return false; } diff --git a/flang/test/Parser/inline-directives.f90 b/flang/test/Parser/inline-directives.f90 new file mode 100644 index 0000000000000..24d4f95759a6e --- /dev/null +++ b/flang/test/Parser/inline-directives.f90 @@ -0,0 +1,29 @@ +! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s + +! Test that checks whether compiler directives can be inlined without mistaking it as comment. + +module m +contains +#define MACRO(X) subroutine func1(X); real(2) :: X; !dir$ ignore_tkr(d) X; end subroutine func1; +MACRO(foo) + +!CHECK: SUBROUTINE func1 (foo) +!CHECK: !DIR$ IGNORE_TKR (d) foo +!CHECK: END SUBROUTINE func1 + + subroutine func2(foo) + real(2) :: foo; !dir$ ignore_tkr(d) foo; + end subroutine func2 + +!CHECK: SUBROUTINE func2 (foo) +!CHECK: !DIR$ IGNORE_TKR (d) foo +!CHECK: END SUBROUTINE func2 + + subroutine func3(foo) + real(2) :: foo; !dir$ ignore_tkr(d) foo; end subroutine func3; + +!CHECK: SUBROUTINE func3 (foo) +!CHECK: !DIR$ IGNORE_TKR (d) foo +!CHECK: END SUBROUTINE func3 + +end module From 401096f624dc0764d36dc06ebdaaaea6782111cf Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 30 Oct 2025 16:21:24 +0000 Subject: [PATCH 212/539] [lldb][AArch64][test] Require SVE for some Linux tests These tests had only ever been run on SVE or SVE+SME systems. While investigating #138717 I found they failed on an SME only system. This happens because before the first stop we try to initialise SVE registers while outside of streaming mode. Which causes a SIGILL. To fix this, require SVE to be present. I could go in and make these work on SME only, but it's more complex and I will be adding SME only specific tests in future anyway. 
--- .../aarch64_dynamic_regset/TestArm64DynamicRegsets.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lldb/test/API/commands/register/register/aarch64_dynamic_regset/TestArm64DynamicRegsets.py b/lldb/test/API/commands/register/register/aarch64_dynamic_regset/TestArm64DynamicRegsets.py index eb121ecbfdbaf..a985ebbced719 100644 --- a/lldb/test/API/commands/register/register/aarch64_dynamic_regset/TestArm64DynamicRegsets.py +++ b/lldb/test/API/commands/register/register/aarch64_dynamic_regset/TestArm64DynamicRegsets.py @@ -97,6 +97,9 @@ def setup_register_config_test(self, run_args=None): @skipIf(oslist=no_match(["linux"])) def test_aarch64_dynamic_regset_config(self): """Test AArch64 Dynamic Register sets configuration.""" + if not self.isAArch64SVE(): + self.skipTest("SVE must be present") + register_sets = self.setup_register_config_test() for registerSet in register_sets: @@ -259,6 +262,8 @@ def write_to_enable_za_test(self, has_zt0, write_za_first): def test_aarch64_dynamic_regset_config_sme_write_za_to_enable(self): """Test that ZA and ZT0 (if present) shows as 0s when disabled and can be enabled by writing to ZA.""" + if not self.isAArch64SVE(): + self.skipTest("SVE must be present.") if not self.isAArch64SME(): self.skipTest("SME must be present.") @@ -270,6 +275,8 @@ def test_aarch64_dynamic_regset_config_sme_write_za_to_enable(self): def test_aarch64_dynamic_regset_config_sme_write_zt0_to_enable(self): """Test that ZA and ZT0 (if present) shows as 0s when disabled and can be enabled by writing to ZT0.""" + if not self.isAArch64SVE(): + self.skipTest("SVE must be present.") if not self.isAArch64SME(): self.skipTest("SME must be present.") if not self.isAArch64SME2(): From 2429def821b6535aab12894205a79656790ed01b Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 30 Oct 2025 09:27:47 -0700 Subject: [PATCH 213/539] [lit] Add support for setting limits to unlimited This is used by a couple compiler-rt tests. 
Reviewers: petrhosek, ilovepi Reviewed By: ilovepi Pull Request: https://github.com/llvm/llvm-project/pull/165123 --- llvm/utils/lit/lit/TestRunner.py | 19 +++++++++++++++---- .../Inputs/shtest-ulimit/ulimit_unlimited.txt | 6 ++++++ llvm/utils/lit/tests/shtest-ulimit.py | 8 +++++++- 3 files changed, 28 insertions(+), 5 deletions(-) create mode 100644 llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_unlimited.txt diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 4a9b3c618e4f3..76beebd757a75 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -600,20 +600,31 @@ def executeBuiltinUmask(cmd, shenv): def executeBuiltinUlimit(cmd, shenv): """executeBuiltinUlimit - Change the current limits.""" - if os.name != "posix": + try: + # Try importing the resource module (available on POSIX systems) and + # emit an error where it does not exist (e.g., Windows). + import resource + except ImportError: raise InternalShellError(cmd, "'ulimit' not supported on this system") if len(cmd.args) != 3: raise InternalShellError(cmd, "'ulimit' requires two arguments") try: - new_limit = int(cmd.args[2]) + if cmd.args[2] == "unlimited": + new_limit = resource.RLIM_INFINITY + else: + new_limit = int(cmd.args[2]) except ValueError as err: raise InternalShellError(cmd, "Error: 'ulimit': %s" % str(err)) if cmd.args[1] == "-v": - shenv.ulimit["RLIMIT_AS"] = new_limit * 1024 + if new_limit != resource.RLIM_INFINITY: + new_limit = new_limit * 1024 + shenv.ulimit["RLIMIT_AS"] = new_limit elif cmd.args[1] == "-n": shenv.ulimit["RLIMIT_NOFILE"] = new_limit elif cmd.args[1] == "-s": - shenv.ulimit["RLIMIT_STACK"] = new_limit * 1024 + if new_limit != resource.RLIM_INFINITY: + new_limit = new_limit * 1024 + shenv.ulimit["RLIMIT_STACK"] = new_limit elif cmd.args[1] == "-f": shenv.ulimit["RLIMIT_FSIZE"] = new_limit else: diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_unlimited.txt 
b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_unlimited.txt new file mode 100644 index 0000000000000..b8aa3d5071712 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_unlimited.txt @@ -0,0 +1,6 @@ +# RUN: ulimit -f 5 +# RUN: %{python} %S/print_limits.py +# RUN: ulimit -f unlimited +# RUN: %{python} %S/print_limits.py +# Fail the test so that we can assert on the output. +# RUN: not echo return diff --git a/llvm/utils/lit/tests/shtest-ulimit.py b/llvm/utils/lit/tests/shtest-ulimit.py index 21e5a5e2491d1..e15e190920308 100644 --- a/llvm/utils/lit/tests/shtest-ulimit.py +++ b/llvm/utils/lit/tests/shtest-ulimit.py @@ -11,7 +11,7 @@ # RUN: not %{lit} -a -v %{inputs}/shtest-ulimit --order=lexical \ # RUN: | FileCheck -DBASE_NOFILE_LIMIT=%{readfile:%t.nofile_limit} %s -# CHECK: -- Testing: 3 tests{{.*}} +# CHECK: -- Testing: 4 tests{{.*}} # CHECK-LABEL: FAIL: shtest-ulimit :: ulimit-bad-arg.txt ({{[^)]*}}) # CHECK: ulimit -n @@ -25,3 +25,9 @@ # CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_reset.txt ({{[^)]*}}) # CHECK: RLIMIT_NOFILE=[[BASE_NOFILE_LIMIT]] + +# CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_unlimited.txt ({{[^)]*}}) +# CHECK: ulimit -f 5 +# CHECK: RLIMIT_FSIZE=5 +# CHECK: ulimit -f unlimited +# CHECK: RLIMIT_FSIZE=-1 From 6c75eb63ff1c8cb21520663acba9b27f9cbeba16 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 30 Oct 2025 16:30:59 +0000 Subject: [PATCH 214/539] [lldb][DWARF] Support DW_AT_bit_size on type tags (#165686) One (DWARF-spec compliant) exmample is: https://github.com/llvm/llvm-project/pull/164372, where we attach a `DW_AT_bit_size` to `_BitInt` types that can't be exactly described by a byte-size. This patch adds support for `DW_AT_bit_size` to `DWARFASTParserClang` when parsing type tags. Note, we don't use this bit-size yet, but will do so in follow-up patches. 
--- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 4 + .../SymbolFile/DWARF/DWARFASTParserClang.h | 1 + .../DWARF/DWARFASTParserClangTests.cpp | 90 +++++++++++++++++++ 3 files changed, 95 insertions(+) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 36bc17680f3fa..c049829f37219 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -450,6 +450,10 @@ ParsedDWARFTypeAttributes::ParsedDWARFTypeAttributes(const DWARFDIE &die) { byte_size = form_value.Unsigned(); break; + case DW_AT_bit_size: + data_bit_size = form_value.Unsigned(); + break; + case DW_AT_alignment: alignment = form_value.Unsigned(); break; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h index da58f4c146226..f5f707129d67d 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h @@ -574,6 +574,7 @@ struct ParsedDWARFTypeAttributes { lldb_private::plugin::dwarf::DWARFFormValue type; lldb::LanguageType class_language = lldb::eLanguageTypeUnknown; std::optional byte_size; + std::optional data_bit_size; std::optional alignment; size_t calling_convention = llvm::dwarf::DW_CC_normal; uint32_t bit_stride = 0; diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp index 1abce6999874e..064ed6d1d3e58 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp @@ -1651,3 +1651,93 @@ TEST_F(DWARFASTParserClangTests, TestObjectPointer_IndexEncoding) { EXPECT_EQ(param_die, ast_parser.GetObjectParameter(sub2, context_die)); } } + +TEST_F(DWARFASTParserClangTests, TestTypeBitSize) { + // Tests that we correctly parse 
DW_AT_bit_size of a DW_AT_base_type. + + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - _BitInt(2) + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Attribute: DW_AT_bit_size + Form: DW_FORM_data1 + + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ('_BitInt(2)') + + - AbbrCode: 0x2 + Values: + - Value: 0x0 + - Value: 0x05 + - Value: 0x01 + - Value: 0x02 +... 
+)"; + + YAMLModuleTester t(yamldata); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto holder = std::make_unique("ast"); + auto &ast_ctx = *holder->GetAST(); + DWARFASTParserClangStub ast_parser(ast_ctx); + + auto type_die = cu_die.GetFirstChild(); + ASSERT_TRUE(type_die.IsValid()); + ASSERT_EQ(type_die.Tag(), DW_TAG_base_type); + + ParsedDWARFTypeAttributes attrs(type_die); + EXPECT_EQ(attrs.byte_size.value_or(0), 1U); + EXPECT_EQ(attrs.data_bit_size.value_or(0), 2U); + + SymbolContext sc; + auto type_sp = + ast_parser.ParseTypeFromDWARF(sc, type_die, /*type_is_new_ptr=*/nullptr); + ASSERT_NE(type_sp, nullptr); + + EXPECT_EQ(llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), + 1U); +} From 8696cb40743820b520b76e445d90b5ebaa9174b7 Mon Sep 17 00:00:00 2001 From: Erik Enikeev Date: Thu, 30 Oct 2025 19:36:55 +0300 Subject: [PATCH 215/539] [ARM] Mark function calls as possibly changing FPSCR (#160699) This patch does the same changes as D143001 for AArch64. This PR is part of the work on adding strict FP support in ARM, which was previously discussed in #137101. 
--- llvm/lib/Target/ARM/ARMISelLowering.cpp | 5 +++++ llvm/lib/Target/ARM/ARMISelLowering.h | 2 ++ llvm/test/CodeGen/ARM/strict-fp-func.ll | 13 +++++++++++++ 3 files changed, 20 insertions(+) create mode 100644 llvm/test/CodeGen/ARM/strict-fp-func.ll diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index a4d3d62e9f487..6b0653457cbaf 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -22109,6 +22109,11 @@ bool ARMTargetLowering::isComplexDeinterleavingOperationSupported( ScalarTy->isIntegerTy(32)); } +ArrayRef ARMTargetLowering::getRoundingControlRegisters() const { + static const MCPhysReg RCRegs[] = {ARM::FPSCR_RM}; + return RCRegs; +} + Value *ARMTargetLowering::createComplexDeinterleavingIR( IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 357d2c5d2fad1..bf3438b0d8803 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -1009,6 +1009,8 @@ class VectorType; bool isUnsupportedFloatingType(EVT VT) const; + ArrayRef getRoundingControlRegisters() const override; + SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal, SDValue ARMcc, SDValue Flags, SelectionDAG &DAG) const; SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, diff --git a/llvm/test/CodeGen/ARM/strict-fp-func.ll b/llvm/test/CodeGen/ARM/strict-fp-func.ll new file mode 100644 index 0000000000000..39bb2b46bdac5 --- /dev/null +++ b/llvm/test/CodeGen/ARM/strict-fp-func.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple arm-none-eabi -stop-after=finalize-isel %s -o - | FileCheck %s + +define float @func_02(float %x, float %y) strictfp nounwind { + %call = call float @func_01(float %x) strictfp + %res = call float @llvm.experimental.constrained.fadd.f32(float %call, float 
%y, metadata !"round.dynamic", metadata !"fpexcept.ignore") strictfp + ret float %res +} +; CHECK-LABEL: name: func_02 +; CHECK: BL @func_01, {{.*}}, implicit-def $fpscr_rm + + +declare float @func_01(float) +declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) From 08648966de9890d3e8fb65b0d634f4566f365d0e Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Thu, 30 Oct 2025 09:42:12 -0700 Subject: [PATCH 216/539] [profile] Use correct flag in InstrProf test (#165738) The `--debug-info-correlate` flag was removed in https://github.com/llvm/llvm-project/pull/165289, but I must have forgotten this test. Replace with `--profile-correlate=debug-info` to fix. --- .../profile/Linux/instrprof-debug-info-correlate-warnings.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate-warnings.c b/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate-warnings.c index 5069c6340b64f..25022f241a6d2 100644 --- a/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate-warnings.c +++ b/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate-warnings.c @@ -1,6 +1,6 @@ // Disable full debug info and verify that we get warnings during merging -// RUN: %clang_pgogen -o %t -gline-tables-only -mllvm --debug-info-correlate -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp +// RUN: %clang_pgogen -o %t -gline-tables-only -mllvm --profile-correlate=debug-info -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp // RUN: env LLVM_PROFILE_FILE=%t.proflite %run %t // RUN: llvm-profdata merge -o %t.profdata --debug-info=%t %t.proflite --max-debug-info-correlation-warnings=2 2>&1 >/dev/null | FileCheck %s --check-prefixes=CHECK,LIMIT --implicit-check-not=warning // RUN: llvm-profdata merge -o %t.profdata --debug-info=%t 
%t.proflite --max-debug-info-correlation-warnings=0 2>&1 >/dev/null | FileCheck %s --check-prefixes=CHECK,NOLIMIT --implicit-check-not=warning From e7cbb2f87bac8ea4da19aad4635e5b55fd9be8c8 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Thu, 30 Oct 2025 09:44:25 -0700 Subject: [PATCH 217/539] [AMDGPU] Support bfloat comparison for ballot intrinsic (#165495) We do not have native instructions for direct bfloat comparisons. However, we can expand bfloat to float, and do float comparison instead. TODO: handle bfloat comparison for ballot intrinsic on global isel path. Fixes: SWDEV-563403 --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +++++++-- .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 21 +++++++++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 12 +++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b34ab2a7e08e5..8bb28084159e8 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7035,9 +7035,15 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SDLoc SL(N); if (Src.getOpcode() == ISD::SETCC) { + SDValue Op0 = Src.getOperand(0); + SDValue Op1 = Src.getOperand(1); + // Need to expand bfloat to float for comparison (setcc). + if (Op0.getValueType() == MVT::bf16) { + Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0); + Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1); + } // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...) 
- return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0), - Src.getOperand(1), Src.getOperand(2)); + return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2)); } if (const ConstantSDNode *Arg = dyn_cast(Src)) { // (ballot 0) -> 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll index aa591d28eb346..c1f3a12dba578 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll @@ -591,3 +591,24 @@ exit: store i32 %ballot, ptr addrspace(1) %out ret void } + +define amdgpu_cs i32 @compare_bfloats(bfloat %x, bfloat %y) { +; GFX10-LABEL: compare_bfloats: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_cmp_gt_f32_e64 s0, v0, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: compare_bfloats: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-NEXT: v_cmp_gt_f32_e64 s0, v1, v2 +; GFX11-NEXT: ; return to shader part epilog + %cmp = fcmp ogt bfloat %x, %y + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp) + ret i32 %ballot +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll index 30c2c260a3274..827a01ff33d02 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll @@ -557,3 +557,15 @@ exit: store i64 %ballot, ptr addrspace(1) %out ret void } + +define amdgpu_cs i64 @compare_bfloats(bfloat %x, bfloat %y) { +; CHECK-LABEL: compare_bfloats: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CHECK-NEXT: v_cmp_gt_f32_e64 s[0:1], v0, v1 +; CHECK-NEXT: ; return to shader part epilog + %cmp = fcmp ogt bfloat %x, %y + %ballot 
= call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) + ret i64 %ballot +} From 60bd3f1c2724cea55a6bce1b17eb45400fa98639 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 12:32:28 -0700 Subject: [PATCH 218/539] [MLIR] Apply clang-tidy fixes for bugprone-argument-comment in TestTransformDialectExtension.cpp (NFC) --- .../lib/Dialect/Transform/TestTransformDialectExtension.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp index 496f18bc49fad..61db9d2b44461 100644 --- a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp +++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp @@ -797,7 +797,7 @@ DiagnosedSilenceableFailure mlir::test::TestProduceInvalidIR::applyToOne( // Provide some IR that does not verify. rewriter.setInsertionPointToStart(&target->getRegion(0).front()); TestDummyPayloadOp::create(rewriter, target->getLoc(), TypeRange(), - ValueRange(), /*failToVerify=*/true); + ValueRange(), /*fail_to_verify=*/true); return DiagnosedSilenceableFailure::success(); } From cec25e6ddb530c6cde04baf3fdd49dc165d3c7ae Mon Sep 17 00:00:00 2001 From: Rana Pratap Reddy <109514914+ranapratap55@users.noreply.github.com> Date: Thu, 30 Oct 2025 22:20:28 +0530 Subject: [PATCH 219/539] [AMDGPU][Clang] Support for type inferring extended image builtins for AMDGPU (#164358) Introduces the builtins for extended image insts for amdgcn. 
--- clang/include/clang/Basic/Builtins.def | 1 + clang/include/clang/Basic/BuiltinsAMDGPU.def | 41 + clang/lib/AST/ASTContext.cpp | 5 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 81 +- clang/lib/Sema/SemaAMDGPU.cpp | 43 +- clang/test/CodeGen/builtins-extended-image.c | 1528 +++++++++++++++++ ...iltins-extended-image-param-gfx1100-err.cl | 227 +++ ...uiltins-extended-image-param-gfx942-err.cl | 227 +++ 8 files changed, 2150 insertions(+), 3 deletions(-) create mode 100644 clang/test/CodeGen/builtins-extended-image.c create mode 100644 clang/test/SemaOpenCL/builtins-extended-image-param-gfx1100-err.cl create mode 100644 clang/test/SemaOpenCL/builtins-extended-image-param-gfx942-err.cl diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def index b856ad145824d..3a5b72e20afab 100644 --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -43,6 +43,7 @@ // SJ -> sigjmp_buf // K -> ucontext_t // p -> pid_t +// e -> _Float16 for HIP/C++ and __fp16 for OpenCL // . -> "...". This may only occur at the end of the function list. 
// // Types may be prefixed with the following modifiers: diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index f265d82efee75..36cb527a9c806 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -967,6 +967,47 @@ TARGET_BUILTIN(__builtin_amdgcn_image_sample_3d_v4f32_f32, "V4fifffQtV4ibii", "n TARGET_BUILTIN(__builtin_amdgcn_image_sample_3d_v4f16_f32, "V4hifffQtV4ibii", "nc", "image-insts") TARGET_BUILTIN(__builtin_amdgcn_image_sample_cube_v4f32_f32, "V4fifffQtV4ibii", "nc", "image-insts") TARGET_BUILTIN(__builtin_amdgcn_image_sample_cube_v4f16_f32, "V4hifffQtV4ibii", "nc", "image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1d_v4f32_f32, "V4fifQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1d_v4f16_f32, "V4eifQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1darray_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32, "V4eiffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_f32_f32, "fiffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_v4f16_f32, "V4eiffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_f32_f32, "fifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_3d_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts") 
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_3d_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_cube_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_cube_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1d_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1d_v4f16_f32, "V4eiffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1darray_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1darray_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_f32_f32, "fifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_f32_f32, "fiffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_3d_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_3d_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_cube_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_cube_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1d_v4f32_f32, 
"V4fifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1d_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1darray_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1darray_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_f32_f32, "fiffffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_v4f32_f32, "V4fiffffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_v4f16_f32, "V4eiffffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_f32_f32, "fifffffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_v4f32_f32, "V4fifffffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_v4f16_f32, "V4eifffffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_3d_v4f32_f32, "V4fifffffffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_3d_v4f16_f32, "V4eifffffffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts") #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 687cd46773f43..2669f62456711 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -12403,6 +12403,11 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context, // Read the base type. switch (*Str++) { default: llvm_unreachable("Unknown builtin type letter!"); + case 'e': + assert(HowLong == 0 && !Signed && !Unsigned && + "Bad modifiers used with 'e'!"); + Type = Context.getLangOpts().OpenCL ? 
Context.HalfTy : Context.Float16Ty; + break; case 'x': assert(HowLong == 0 && !Signed && !Unsigned && "Bad modifiers used with 'x'!"); diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index f49a5af2c9587..9eab70955b6b9 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -647,8 +647,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_ballot_w64: { llvm::Type *ResultType = ConvertType(E->getType()); llvm::Value *Src = EmitScalarExpr(E->getArg(0)); - Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType }); - return Builder.CreateCall(F, { Src }); + Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {ResultType}); + return Builder.CreateCall(F, {Src}); } case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w32: case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w64: { @@ -1139,6 +1139,83 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_image_sample_cube_v4f16_f32: return emitAMDGCNImageOverloadedReturnType( *this, E, Intrinsic::amdgcn_image_sample_cube, false); + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1d_v4f32_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1d_v4f16_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_lz_1d, false); + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1d_v4f32_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1d_v4f16_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_l_1d, false); + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1d_v4f32_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1d_v4f16_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_d_1d, false); + case 
clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_v4f32_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_v4f16_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_f32_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_lz_2d, false); + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_v4f32_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_v4f16_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_f32_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_l_2d, false); + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_v4f32_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_v4f16_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_f32_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_d_2d, false); + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_3d_v4f32_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_3d_v4f16_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_lz_3d, false); + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_3d_v4f32_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_3d_v4f16_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_l_3d, false); + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_3d_v4f32_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_3d_v4f16_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_d_3d, false); + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_cube_v4f32_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_cube_v4f16_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_lz_cube, false); + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_cube_v4f32_f32: + case 
clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_cube_v4f16_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_l_cube, false); + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1darray_v4f32_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_lz_1darray, false); + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1darray_v4f32_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_1darray_v4f16_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_l_1darray, false); + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1darray_v4f32_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_1darray_v4f16_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_d_1darray, false); + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_v4f32_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_f32_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_lz_2darray, false); + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_v4f32_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_v4f16_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_f32_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_l_2darray, false); + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_v4f32_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_v4f16_f32: + case clang::AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_f32_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_sample_d_2darray, false); + case 
clang::AMDGPU::BI__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32: + return emitAMDGCNImageOverloadedReturnType( + *this, E, Intrinsic::amdgcn_image_gather4_lz_2d, false); case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4: case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4: { llvm::FixedVectorType *VT = FixedVectorType::get(Builder.getInt32Ty(), 8); diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index e32f4376a5ebf..139c4abc040df 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -153,7 +153,48 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_image_sample_3d_v4f32_f32: case AMDGPU::BI__builtin_amdgcn_image_sample_3d_v4f16_f32: case AMDGPU::BI__builtin_amdgcn_image_sample_cube_v4f32_f32: - case AMDGPU::BI__builtin_amdgcn_image_sample_cube_v4f16_f32: { + case AMDGPU::BI__builtin_amdgcn_image_sample_cube_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_lz_1d_v4f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_lz_1d_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_lz_1darray_v4f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_v4f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_lz_2d_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_v4f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_lz_3d_v4f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_lz_3d_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_lz_cube_v4f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_lz_cube_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_l_1d_v4f32_f32: + case 
AMDGPU::BI__builtin_amdgcn_image_sample_l_1d_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_l_1darray_v4f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_l_1darray_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_l_2d_v4f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_v4f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_l_2darray_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_l_3d_v4f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_l_3d_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_l_cube_v4f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_l_cube_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_d_1d_v4f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_d_1d_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_d_1darray_v4f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_d_1darray_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_v4f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_d_2d_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_v4f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_d_2darray_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_d_3d_v4f32_f32: + case AMDGPU::BI__builtin_amdgcn_image_sample_d_3d_v4f16_f32: + case AMDGPU::BI__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32: { StringRef FeatureList( getASTContext().BuiltinInfo.getRequiredFeatures(BuiltinID)); if (!Builtin::evaluateRequiredTargetFeatures(FeatureList, diff --git a/clang/test/CodeGen/builtins-extended-image.c b/clang/test/CodeGen/builtins-extended-image.c new file mode 100644 index 0000000000000..0dbf81dabd77b --- /dev/null 
+++ b/clang/test/CodeGen/builtins-extended-image.c @@ -0,0 +1,1528 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1100 -target-feature +extended-image-insts %s -emit-llvm -o - | FileCheck %s + +typedef int int4 __attribute__((ext_vector_type(4))); +typedef float float4 __attribute__((ext_vector_type(4))); +typedef _Float16 half4 __attribute__((ext_vector_type(4))); + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_gather4_lz_2d_v4f32_f32_r( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], 
align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP4]] +// +float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_r(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(1, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_gather4_lz_2d_v4f32_f32_g( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: 
[[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32.v8i32.v4i32(i32 2, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP4]] +// +float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_g(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(2, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_gather4_lz_2d_v4f32_f32_b( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: 
[[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32.v8i32.v4i32(i32 4, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP4]] +// +float4 
test_amdgcn_image_gather4_lz_2d_v4f32_f32_b(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(4, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_gather4_lz_2d_v4f32_f32_a( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] 
= load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32.v8i32.v4i32(i32 8, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP4]] +// +float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_a(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(8, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_lz_1d_v4f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: 
[[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP2]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// +float4 test_amdgcn_image_sample_lz_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_1d_v4f32_f32(100, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_l_1d_v4f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: 
[[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP4]] +// +float4 test_amdgcn_image_sample_l_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_1d_v4f32_f32(100, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_d_1d_v4f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr 
[[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32 +// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> 
@llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP5]] +// +float4 test_amdgcn_image_sample_d_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_1d_v4f32_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_lz_2d_v4f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: 
store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP4]] +// +float4 test_amdgcn_image_sample_lz_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2d_v4f32_f32(100, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_l_2d_v4f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32 +// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32.v8i32.v4i32(i32 10, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP5]] +// +float4 test_amdgcn_image_sample_l_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2d_v4f32_f32(10, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_d_2d_v4f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x 
float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP6]], align 32 +// CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr 
[[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP7]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP8]] +// +float4 test_amdgcn_image_sample_d_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2d_v4f32_f32(100, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_lz_3d_v4f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// 
CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32 +// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.3d.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP5]] +// +float4 test_amdgcn_image_sample_lz_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_3d_v4f32_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_l_3d_v4f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) 
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32 +// CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.3d.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP6]] +// +float4 test_amdgcn_image_sample_l_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_3d_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 120, 
110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_d_3d_v4f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: 
[[TMP4:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 +// CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], float [[TMP6]], float [[TMP7]], float [[TMP8]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP10]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP11]] +// +float4 test_amdgcn_image_sample_d_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_3d_v4f32_f32(1, f32, f32, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_lz_cube_v4f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: 
[[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32 +// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.cube.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP5]] +// +float4 test_amdgcn_image_sample_lz_cube_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_cube_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_l_cube_v4f32_f32( +// 
CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load 
<8 x i32>, ptr [[TMP4]], align 32 +// CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.cube.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP6]] +// +float4 test_amdgcn_image_sample_l_cube_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_cube_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_lz_1darray_v4f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> 
[[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1darray.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP4]] +// +float4 test_amdgcn_image_sample_lz_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_1darray_v4f32_f32(1, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_l_1darray_v4f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: 
[[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32 +// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.1darray.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP5]] +// +float4 test_amdgcn_image_sample_l_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_1darray_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> 
@test_amdgcn_image_sample_d_1darray_v4f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], 
align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32 +// CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1darray.v4f32.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP6]] +// +float4 test_amdgcn_image_sample_d_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_1darray_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_lz_2darray_v4f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32 +// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2darray.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP5]] +// +float4 test_amdgcn_image_sample_lz_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2darray_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_l_2darray_v4f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: 
[[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32 +// CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.2darray.v4f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP6]] +// +float4 test_amdgcn_image_sample_l_2darray_v4f32_f32(float4 v4f32, float f32, int i32, 
__amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2darray_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_d_2darray_v4f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: 
[[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP7]], align 32 +// CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2darray.v4f32.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], float [[TMP6]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP8]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x float> [[TMP9]] +// +float4 test_amdgcn_image_sample_d_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2darray_v4f32_f32(1, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_1d_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.1d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP2]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP3]] +// +half4 test_amdgcn_image_sample_lz_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_1d_v4f16_f32(100, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_1d_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x half>, 
align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.1d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP4]] +// +half4 
test_amdgcn_image_sample_l_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_1d_v4f16_f32(100, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_d_1d_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load 
float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32 +// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.1d.v4f16.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP5]] +// +half4 test_amdgcn_image_sample_d_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_1d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_2d_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: 
[[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.2d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP4]] +// +half4 test_amdgcn_image_sample_lz_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2d_v4f16_f32(100, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_2d_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, 
addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32 +// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.2d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP5]] +// +half4 test_amdgcn_image_sample_l_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return 
__builtin_amdgcn_image_sample_l_2d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_d_2d_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// 
CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP6]], align 32 +// CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.2d.v4f16.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP7]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP8]] +// +half4 test_amdgcn_image_sample_d_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2d_v4f16_f32(100, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_3d_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32 +// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.3d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP5]] +// +half4 test_amdgcn_image_sample_lz_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_3d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_3d_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32 +// CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = call <4 x half> 
@llvm.amdgcn.image.sample.l.3d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP6]] +// +half4 test_amdgcn_image_sample_l_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_3d_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_d_3d_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// 
CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 +// CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP11:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.3d.v4f16.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], float [[TMP6]], float [[TMP7]], float [[TMP8]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP10]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP11]] +// +half4 test_amdgcn_image_sample_d_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_3d_v4f16_f32(100, f32, f32, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_cube_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] 
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32 +// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.cube.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float 
[[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP5]] +// +half4 test_amdgcn_image_sample_lz_cube_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_cube_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_cube_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr 
[[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32 +// CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.cube.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP6]] +// +half4 test_amdgcn_image_sample_l_cube_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_cube_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_1darray_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.1darray.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP4]] +// +half4 test_amdgcn_image_sample_lz_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_1darray_v4f16_f32(100, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_1darray_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32 +// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.1darray.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x 
i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP5]] +// +half4 test_amdgcn_image_sample_l_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_1darray_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_d_1darray_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr 
[[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32 +// CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.1darray.v4f16.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP6]] +// +half4 test_amdgcn_image_sample_d_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_1darray_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_2darray_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32 +// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.2darray.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP5]] +// +half4 test_amdgcn_image_sample_lz_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2darray_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_2darray_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr 
[[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32 +// CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr 
[[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.2darray.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP6]] +// +half4 test_amdgcn_image_sample_l_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2darray_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_d_2darray_v4f16_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr 
[[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP7]], align 32 +// CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.2darray.v4f16.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], float [[TMP6]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP8]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret <4 x half> [[TMP9]] +// +half4 test_amdgcn_image_sample_d_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2darray_v4f16_f32(100, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local float @test_amdgcn_image_sample_lz_2d_f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: 
[[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret float [[TMP4]] +// +float test_amdgcn_image_sample_lz_2d_f32_f32(float4 v4f32, float f32, 
int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2d_f32_f32(1, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local float @test_amdgcn_image_sample_l_2d_f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, 
ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32 +// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.image.sample.l.2d.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret float [[TMP5]] +// +float test_amdgcn_image_sample_l_2d_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2d_f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local float @test_amdgcn_image_sample_d_2d_f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast 
ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP6]], align 32 +// CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.amdgcn.image.sample.d.2d.f32.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP7]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret float [[TMP8]] +// +float test_amdgcn_image_sample_d_2d_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2d_f32_f32(1, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local float @test_amdgcn_image_sample_lz_2darray_f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca float, align 4, 
addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32 +// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.image.sample.lz.2darray.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, 
i32 120, i32 110) +// CHECK-NEXT: ret float [[TMP5]] +// +float test_amdgcn_image_sample_lz_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2darray_f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local float @test_amdgcn_image_sample_l_2darray_f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr 
[[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32 +// CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.image.sample.l.2darray.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret float [[TMP6]] +// +float test_amdgcn_image_sample_l_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2darray_f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110); +} + +// CHECK-LABEL: define dso_local float @test_amdgcn_image_sample_d_2darray_f32_f32( +// CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[V4F32_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[I32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[TEX_ADDR:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[VEC4I32_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[V4F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V4F32_ADDR]] to ptr +// CHECK-NEXT: [[F32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[F32_ADDR]] to ptr +// CHECK-NEXT: [[I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I32_ADDR]] to ptr +// CHECK-NEXT: [[TEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TEX_ADDR]] to ptr +// CHECK-NEXT: [[VEC4I32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VEC4I32_ADDR]] to ptr +// CHECK-NEXT: store <4 x float> [[V4F32]], ptr [[V4F32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store float [[F32]], ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[I32]], ptr [[I32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr [[TEX]], ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <4 x i32> [[VEC4I32]], ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[F32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP7]], align 32 +// CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.image.sample.d.2darray.f32.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], float [[TMP6]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP8]], i1 false, i32 120, i32 110) +// CHECK-NEXT: ret float [[TMP9]] +// +float test_amdgcn_image_sample_d_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2darray_f32_f32(1, f32, f32, f32, f32, f32, f32, 
f32, tex, vec4i32, 0, 120, 110); +} diff --git a/clang/test/SemaOpenCL/builtins-extended-image-param-gfx1100-err.cl b/clang/test/SemaOpenCL/builtins-extended-image-param-gfx1100-err.cl new file mode 100644 index 0000000000000..47dbdd4e51782 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-extended-image-param-gfx1100-err.cl @@ -0,0 +1,227 @@ +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1100 -target-feature +extended-image-insts -S -verify=expected -o - %s +// REQUIRES: amdgpu-registered-target + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +typedef int int4 __attribute__((ext_vector_type(4))); +typedef float float4 __attribute__((ext_vector_type(4))); +typedef half half4 __attribute__((ext_vector_type(4))); + +float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_r(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(1, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_g(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(2, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_b(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(4, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_a(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(8, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument 
to '__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_sample_lz_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_1d_v4f32_f32(i32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_1d_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_sample_l_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_1d_v4f32_f32(100, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_1d_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_sample_d_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_1d_v4f32_f32(100, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_1d_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_sample_lz_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2d_v4f32_f32(100, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_2d_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_sample_l_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2d_v4f32_f32(100, f32, f32, f32, tex, vec4i32, 0, f32, 103); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_2d_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_sample_d_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2d_v4f32_f32(i32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); 
//expected-error{{argument to '__builtin_amdgcn_image_sample_d_2d_v4f32_f32' must be a constant integer}} +} +float4 test_amdgcn_image_sample_lz_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_3d_v4f32_f32(i32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_3d_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_sample_l_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_3d_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_3d_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_sample_d_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_3d_v4f32_f32(1, f32, f32, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_3d_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_sample_lz_cube_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_cube_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_cube_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_sample_l_cube_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_cube_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_cube_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_sample_lz_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return 
__builtin_amdgcn_image_sample_lz_1darray_v4f32_f32(1, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_1darray_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_sample_l_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_1darray_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_1darray_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_sample_d_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_1darray_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_1darray_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_sample_lz_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2darray_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_2darray_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_sample_l_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2darray_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_2darray_v4f32_f32' must be a constant integer}} +} + +float4 test_amdgcn_image_sample_d_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2darray_v4f32_f32(1, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_2darray_v4f32_f32' must be a constant integer}} +} + +half4 
test_amdgcn_image_sample_lz_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_1d_v4f16_f32(23, f32, tex, vec4i32, 0, i32, 11); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_1d_v4f16_f32' must be a constant integer}} +} + +half4 test_amdgcn_image_sample_l_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_1d_v4f16_f32(i32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_1d_v4f16_f32' must be a constant integer}} +} + +half4 test_amdgcn_image_sample_d_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_1d_v4f16_f32(i32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_1d_v4f16_f32' must be a constant integer}} +} + +half4 test_amdgcn_image_sample_lz_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2d_v4f16_f32(100, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_2d_v4f16_f32' must be a constant integer}} +} + +half4 test_amdgcn_image_sample_l_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_2d_v4f16_f32' must be a constant integer}} +} + +half4 test_amdgcn_image_sample_d_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2d_v4f16_f32(100, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_2d_v4f16_f32' must be a constant 
integer}} +} + +half4 test_amdgcn_image_sample_lz_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_3d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_3d_v4f16_f32' must be a constant integer}} +} + +half4 test_amdgcn_image_sample_l_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_3d_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_3d_v4f16_f32' must be a constant integer}} +} + +half4 test_amdgcn_image_sample_d_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_3d_v4f16_f32(100, f32, f32, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_3d_v4f16_f32' must be a constant integer}} +} + +half4 test_amdgcn_image_sample_lz_cube_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_cube_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_cube_v4f16_f32' must be a constant integer}} +} + +half4 test_amdgcn_image_sample_l_cube_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_cube_v4f16_f32(i32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_cube_v4f16_f32' must be a constant integer}} +} + +half4 test_amdgcn_image_sample_lz_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_1darray_v4f16_f32(i32, f32, f32, tex, vec4i32, 0, f32, i32); 
//expected-error{{argument to '__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32' must be a constant integer}} +} + +half4 test_amdgcn_image_sample_l_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_1darray_v4f16_f32(i32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_1darray_v4f16_f32' must be a constant integer}} +} + +half4 test_amdgcn_image_sample_d_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_1darray_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_1darray_v4f16_f32' must be a constant integer}} +} + +half4 test_amdgcn_image_sample_lz_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2darray_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32' must be a constant integer}} +} + +half4 test_amdgcn_image_sample_l_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2darray_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_2darray_v4f16_f32' must be a constant integer}} +} + +half4 test_amdgcn_image_sample_d_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2darray_v4f16_f32(100, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_2darray_v4f16_f32' must be a constant integer}} +} + +float test_amdgcn_image_sample_lz_2d_f32_f32(float4 v4f32, float f32, int i32, 
__amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2d_f32_f32(1, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_2d_f32_f32' must be a constant integer}} +} + +float test_amdgcn_image_sample_l_2d_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2d_f32_f32(1, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_2d_f32_f32' must be a constant integer}} +} + +float test_amdgcn_image_sample_d_2d_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2d_f32_f32(1, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_2d_f32_f32' must be a constant integer}} +} + +float test_amdgcn_image_sample_lz_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2darray_f32_f32(1, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_lz_2darray_f32_f32' must be a constant integer}} +} + +float test_amdgcn_image_sample_l_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2darray_f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_l_2darray_f32_f32' must be a constant integer}} +} + +float test_amdgcn_image_sample_d_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2darray_f32_f32(1, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, f32, i32); //expected-error{{argument to '__builtin_amdgcn_image_sample_d_2darray_f32_f32' must be a constant integer}} +} diff --git 
a/clang/test/SemaOpenCL/builtins-extended-image-param-gfx942-err.cl b/clang/test/SemaOpenCL/builtins-extended-image-param-gfx942-err.cl new file mode 100644 index 0000000000000..e60f8c70dc7c4 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-extended-image-param-gfx942-err.cl @@ -0,0 +1,227 @@ +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx942 -verify=GFX94 -S -o - %s +// REQUIRES: amdgpu-registered-target + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +typedef int int4 __attribute__((ext_vector_type(4))); +typedef float float4 __attribute__((ext_vector_type(4))); +typedef half half4 __attribute__((ext_vector_type(4))); + +float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_r(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(1, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_gather4_lz_2d_v4f32_f32_r' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_g(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(2, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_gather4_lz_2d_v4f32_f32_g' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_b(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(4, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_gather4_lz_2d_v4f32_f32_b' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_a(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32(8, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_gather4_lz_2d_v4f32_f32_a' needs target feature extended-image-insts}} +} + +float4 
test_amdgcn_image_sample_lz_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_1d_v4f32_f32(105, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_1d_v4f32_f32' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_sample_l_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_1d_v4f32_f32(100, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_1d_v4f32_f32' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_sample_d_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_1d_v4f32_f32(100, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_1d_v4f32_f32' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_sample_lz_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2d_v4f32_f32(100, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_2d_v4f32_f32' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_sample_l_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2d_v4f32_f32(10, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_2d_v4f32_f32' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_sample_d_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2d_v4f32_f32(105, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_2d_v4f32_f32' needs target feature extended-image-insts}} +} +float4 
test_amdgcn_image_sample_lz_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_3d_v4f32_f32(105, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_3d_v4f32_f32' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_sample_l_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_3d_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_3d_v4f32_f32' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_sample_d_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_3d_v4f32_f32(1, f32, f32, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_3d_v4f32_f32' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_sample_lz_cube_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_cube_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_cube_v4f32_f32' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_sample_l_cube_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_cube_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_cube_v4f32_f32' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_sample_lz_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_1darray_v4f32_f32(1, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_1darray_v4f32_f32' 
needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_sample_l_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_1darray_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_1darray_v4f32_f32' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_sample_d_1darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_1darray_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_1darray_v4f32_f32' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_sample_lz_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2darray_v4f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_2darray_v4f32_f32' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_sample_l_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2darray_v4f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_2darray_v4f32_f32' needs target feature extended-image-insts}} +} + +float4 test_amdgcn_image_sample_d_2darray_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2darray_v4f32_f32(1, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_2darray_v4f32_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_lz_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_1d_v4f16_f32(105, f32, 
tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_1d_v4f16_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_l_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_1d_v4f16_f32(105, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_1d_v4f16_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_d_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_1d_v4f16_f32(105, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_1d_v4f16_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_lz_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2d_v4f16_f32(100, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_2d_v4f16_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_l_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_2d_v4f16_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_d_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2d_v4f16_f32(100, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_2d_v4f16_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_lz_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_3d_v4f16_f32(100, f32, f32, f32, 
tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_3d_v4f16_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_l_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_3d_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_3d_v4f16_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_d_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_3d_v4f16_f32(100, f32, f32, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_3d_v4f16_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_lz_cube_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_cube_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_cube_v4f16_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_l_cube_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_cube_v4f16_f32(105, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_cube_v4f16_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_lz_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_1darray_v4f16_f32(105, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_1darray_v4f16_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_l_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return 
__builtin_amdgcn_image_sample_l_1darray_v4f16_f32(105, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_1darray_v4f16_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_d_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_1darray_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_1darray_v4f16_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_lz_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2darray_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_2darray_v4f16_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_l_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2darray_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_2darray_v4f16_f32' needs target feature extended-image-insts}} +} + +half4 test_amdgcn_image_sample_d_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2darray_v4f16_f32(100, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_2darray_v4f16_f32' needs target feature extended-image-insts}} +} + +float test_amdgcn_image_sample_lz_2d_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2d_f32_f32(1, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_2d_f32_f32' needs target feature extended-image-insts}} +} + +float test_amdgcn_image_sample_l_2d_f32_f32(float4 
v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2d_f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_2d_f32_f32' needs target feature extended-image-insts}} +} + +float test_amdgcn_image_sample_d_2d_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2d_f32_f32(1, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_2d_f32_f32' needs target feature extended-image-insts}} +} + +float test_amdgcn_image_sample_lz_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_lz_2darray_f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_lz_2darray_f32_f32' needs target feature extended-image-insts}} +} + +float test_amdgcn_image_sample_l_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_l_2darray_f32_f32(1, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_l_2darray_f32_f32' needs target feature extended-image-insts}} +} + +float test_amdgcn_image_sample_d_2darray_f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) { + + return __builtin_amdgcn_image_sample_d_2darray_f32_f32(1, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 101, 121); //GFX94-error{{'test_amdgcn_image_sample_d_2darray_f32_f32' needs target feature extended-image-insts}} +} From 6e2a08b70cf0fa58dc4badf9c06aa6ca0caa45bd Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 30 Oct 2025 10:00:51 -0700 Subject: [PATCH 220/539] [libc][hdrgen] Add extra_standards and license_text (#165459) This adds a few new features to hdrgen, all meant to facilitate using it with inputs and outputs that are outside the llvm-libc 
source tree. The new `extra_standards` field is a dictionary to augment the set of names that can be used in `standards` lists. The keys are the identifiers used in YAML ("stdc") and the values are the pretty names generated in the header comments ("Standard C"). This lets a libc project that's leveraging the llvm-libc sources along with its own code define new APIs outside the formal and de facto standards that llvm-libc draws its supported APIs from. The new `license_text` field is a list of lines of license text that replaces the standard LLVM license text used at the top of each generated header. This lets other projects use hdrgen with their own inputs to produce generated headers that are not tied to the LLVM project. Finally, for any function attributes that are not in a canonical list known to be provided by __llvm-libc-common.h, an include will be generated for "llvm-libc-macros/{attribute name}.h", expecting that file to define the "attribute" name as a macro. All this can be used immediately by builds that drive hdrgen and build libc code outside the LLVM CMake build. Future changes could add CMake plumbing to facilitate augmenting the LLVM CMake build of libc with outside sources via overlays and cache files. 
--- libc/utils/hdrgen/hdrgen/header.py | 71 ++++++++++++++----- libc/utils/hdrgen/hdrgen/yaml_to_classes.py | 2 + .../hdrgen/tests/expected_output/custom.h | 21 ++++++ .../tests/expected_output/test_header.h | 1 + .../tests/expected_output/test_small.json | 1 + .../hdrgen/tests/input/custom-common.yaml | 6 ++ libc/utils/hdrgen/tests/input/custom.yaml | 13 ++++ libc/utils/hdrgen/tests/test_integration.py | 7 ++ 8 files changed, 106 insertions(+), 16 deletions(-) create mode 100644 libc/utils/hdrgen/tests/expected_output/custom.h create mode 100644 libc/utils/hdrgen/tests/input/custom-common.yaml create mode 100644 libc/utils/hdrgen/tests/input/custom.yaml diff --git a/libc/utils/hdrgen/hdrgen/header.py b/libc/utils/hdrgen/hdrgen/header.py index 715d4b7c9b7ed..558ee58469207 100644 --- a/libc/utils/hdrgen/hdrgen/header.py +++ b/libc/utils/hdrgen/hdrgen/header.py @@ -35,6 +35,13 @@ COMMON_HEADER = PurePosixPath("__llvm-libc-common.h") +# These "attributes" are known macros defined in COMMON_HEADER. +# Others are found in "llvm-libc-macros/{name}.h". +COMMON_ATTRIBUTES = { + "_Noreturn", + "_Returns_twice", +} + # All the canonical identifiers are in lowercase for easy maintenance. # This maps them to the pretty descriptions to generate in header comments. LIBRARY_DESCRIPTIONS = { @@ -50,9 +57,7 @@ HEADER_TEMPLATE = """\ //===-- {library} header <{header}> --===// // -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +{license_lines} // //===---------------------------------------------------------------------===// @@ -64,6 +69,12 @@ #endif // {guard} """ +LLVM_LICENSE_TEXT = [ + "Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.", + "See https://llvm.org/LICENSE.txt for license information.", + "SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception", +] + class HeaderFile: def __init__(self, name): @@ -74,8 +85,10 @@ def __init__(self, name): self.enumerations = [] self.objects = [] self.functions = [] + self.extra_standards = {} self.standards = [] self.merge_yaml_files = [] + self.license_text = [] def add_macro(self, macro): self.macros.append(macro) @@ -98,6 +111,11 @@ def merge(self, other): self.enumerations = sorted(set(self.enumerations) | set(other.enumerations)) self.objects = sorted(set(self.objects) | set(other.objects)) self.functions = sorted(set(self.functions) | set(other.functions)) + self.extra_standards |= other.extra_standards + if self.license_text: + assert not other.license_text, "only one `license_text` allowed" + else: + self.license_text = other.license_text def all_types(self): return reduce( @@ -106,6 +124,13 @@ def all_types(self): set(self.types), ) + def all_attributes(self): + return reduce( + lambda a, b: a | b, + [set(f.attributes) for f in self.functions], + set(), + ) + def all_standards(self): # FIXME: Only functions have the "standard" field, but all the entity # types should have one too. 
@@ -114,16 +139,24 @@ def all_standards(self): ) def includes(self): - return { - PurePosixPath("llvm-libc-macros") / macro.header - for macro in self.macros - if macro.header is not None - } | { - COMPILER_HEADER_TYPES.get( - typ.type_name, PurePosixPath("llvm-libc-types") / f"{typ.type_name}.h" - ) - for typ in self.all_types() - } + return ( + { + PurePosixPath("llvm-libc-macros") / macro.header + for macro in self.macros + if macro.header is not None + } + | { + COMPILER_HEADER_TYPES.get( + typ.type_name, + PurePosixPath("llvm-libc-types") / f"{typ.type_name}.h", + ) + for typ in self.all_types() + } + | { + PurePosixPath("llvm-libc-macros") / f"{attr}.h" + for attr in self.all_attributes() - COMMON_ATTRIBUTES + } + ) def header_guard(self): return "_LLVM_LIBC_" + "_".join( @@ -131,24 +164,29 @@ def header_guard(self): ) def library_description(self): + descriptions = LIBRARY_DESCRIPTIONS | self.extra_standards # If the header itself is in standard C, just call it that. if "stdc" in self.standards: - return LIBRARY_DESCRIPTIONS["stdc"] + return descriptions["stdc"] # If the header itself is in POSIX, just call it that. if "posix" in self.standards: - return LIBRARY_DESCRIPTIONS["posix"] + return descriptions["posix"] # Otherwise, consider the standards for each symbol as well. standards = self.all_standards() # Otherwise, it's described by all those that apply, but ignoring # "stdc" and "posix" since this is not a "stdc" or "posix" header. 
return " / ".join( sorted( - LIBRARY_DESCRIPTIONS[standard] + descriptions[standard] for standard in standards if standard not in {"stdc", "posix"} ) ) + def license_lines(self): + lines = self.license_text or LLVM_LICENSE_TEXT + return "\n".join([f"// {line}" for line in lines]) + def template(self, dir, files_read): if self.template_file is not None: # There's a custom template file, so just read it in and record @@ -162,6 +200,7 @@ def template(self, dir, files_read): library=self.library_description(), header=self.name, guard=self.header_guard(), + license_lines=self.license_lines(), ) def public_api(self): diff --git a/libc/utils/hdrgen/hdrgen/yaml_to_classes.py b/libc/utils/hdrgen/hdrgen/yaml_to_classes.py index ebe7781d449f7..9eddbe615cbba 100644 --- a/libc/utils/hdrgen/hdrgen/yaml_to_classes.py +++ b/libc/utils/hdrgen/hdrgen/yaml_to_classes.py @@ -37,6 +37,8 @@ def yaml_to_classes(yaml_data, header_class, entry_points=None): header = header_class(header_name) header.template_file = yaml_data.get("header_template") header.standards = yaml_data.get("standards", []) + header.extra_standards = yaml_data.get("extra_standards", {}) + header.license_text = yaml_data.get("license_text", []) header.merge_yaml_files = yaml_data.get("merge_yaml_files", []) for macro_data in yaml_data.get("macros", []): diff --git a/libc/utils/hdrgen/tests/expected_output/custom.h b/libc/utils/hdrgen/tests/expected_output/custom.h new file mode 100644 index 0000000000000..5f9ed231490fd --- /dev/null +++ b/libc/utils/hdrgen/tests/expected_output/custom.h @@ -0,0 +1,21 @@ +//===-- Wile E. Coyote header --===// +// +// Caveat emptor. +// I never studied law. 
+// +//===---------------------------------------------------------------------===// + +#ifndef _LLVM_LIBC_CUSTOM_H +#define _LLVM_LIBC_CUSTOM_H + +#include "__llvm-libc-common.h" +#include "llvm-libc-types/meep.h" +#include "llvm-libc-types/road.h" + +__BEGIN_C_DECLS + +road runner(meep, meep) __NOEXCEPT; + +__END_C_DECLS + +#endif // _LLVM_LIBC_CUSTOM_H diff --git a/libc/utils/hdrgen/tests/expected_output/test_header.h b/libc/utils/hdrgen/tests/expected_output/test_header.h index 748c09808c128..49112a353f7b6 100644 --- a/libc/utils/hdrgen/tests/expected_output/test_header.h +++ b/libc/utils/hdrgen/tests/expected_output/test_header.h @@ -12,6 +12,7 @@ #include "__llvm-libc-common.h" #include "llvm-libc-macros/float16-macros.h" +#include "llvm-libc-macros/CONST_FUNC_A.h" #include "llvm-libc-macros/test_more-macros.h" #include "llvm-libc-macros/test_small-macros.h" #include "llvm-libc-types/float128.h" diff --git a/libc/utils/hdrgen/tests/expected_output/test_small.json b/libc/utils/hdrgen/tests/expected_output/test_small.json index 9cc73d013a679..8502df23b9a41 100644 --- a/libc/utils/hdrgen/tests/expected_output/test_small.json +++ b/libc/utils/hdrgen/tests/expected_output/test_small.json @@ -4,6 +4,7 @@ "standards": [], "includes": [ "__llvm-libc-common.h", + "llvm-libc-macros/CONST_FUNC_A.h", "llvm-libc-macros/test_more-macros.h", "llvm-libc-macros/test_small-macros.h", "llvm-libc-types/float128.h", diff --git a/libc/utils/hdrgen/tests/input/custom-common.yaml b/libc/utils/hdrgen/tests/input/custom-common.yaml new file mode 100644 index 0000000000000..909a3ba5163a5 --- /dev/null +++ b/libc/utils/hdrgen/tests/input/custom-common.yaml @@ -0,0 +1,6 @@ +license_text: + - Caveat emptor. + - I never studied law. + +extra_standards: + acme: Wile E. 
Coyote diff --git a/libc/utils/hdrgen/tests/input/custom.yaml b/libc/utils/hdrgen/tests/input/custom.yaml new file mode 100644 index 0000000000000..7d3ff8ec421dd --- /dev/null +++ b/libc/utils/hdrgen/tests/input/custom.yaml @@ -0,0 +1,13 @@ +merge_yaml_files: + - custom-common.yaml + +header: custom.h +standards: + - acme + +functions: + - name: runner + return_type: road + arguments: + - type: meep + - type: meep diff --git a/libc/utils/hdrgen/tests/test_integration.py b/libc/utils/hdrgen/tests/test_integration.py index bf393d26a8101..c6e76d826a3a4 100644 --- a/libc/utils/hdrgen/tests/test_integration.py +++ b/libc/utils/hdrgen/tests/test_integration.py @@ -59,6 +59,13 @@ def test_generate_subdir_header(self): self.run_script(yaml_file, output_file) self.compare_files(output_file, expected_output_file) + def test_custom_license_and_standards(self): + yaml_file = self.source_dir / "input" / "custom.yaml" + expected_output_file = self.source_dir / "expected_output" / "custom.h" + output_file = self.output_dir / "custom.h" + self.run_script(yaml_file, output_file) + self.compare_files(output_file, expected_output_file) + def test_generate_json(self): yaml_file = self.source_dir / "input/test_small.yaml" expected_output_file = self.source_dir / "expected_output/test_small.json" From 4f52682ba232ae0e3c09bdcf089a9f1ea5655cca Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Thu, 30 Oct 2025 10:25:53 -0700 Subject: [PATCH 221/539] [clang][lex] Use `FileManager` to make prebuilt module paths absolute (#165347) This PR switches from using `llvm::sys::fs::make_absolute()` to `FileManager::makeAbsolutePath()` so that `FileSystemOptions` (i.e. the `-working-directory` option) and the `VFS`'s CWD have a say in how the prebuilt module paths are resolved. This matches how the rest of the compiler treats input files. 
--- clang/lib/Lex/HeaderSearch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp index 65c324c10ca5d..f05c28fd7a123 100644 --- a/clang/lib/Lex/HeaderSearch.cpp +++ b/clang/lib/Lex/HeaderSearch.cpp @@ -221,7 +221,7 @@ std::string HeaderSearch::getPrebuiltModuleFileName(StringRef ModuleName, // file. for (const std::string &Dir : HSOpts.PrebuiltModulePaths) { SmallString<256> Result(Dir); - llvm::sys::fs::make_absolute(Result); + FileMgr.makeAbsolutePath(Result); if (ModuleName.contains(':')) // The separator of C++20 modules partitions (':') is not good for file // systems, here clang and gcc choose '-' by default since it is not a @@ -246,7 +246,7 @@ std::string HeaderSearch::getPrebuiltImplicitModuleFileName(Module *Module) { StringRef ModuleCacheHash = HSOpts.DisableModuleHash ? "" : getModuleHash(); for (const std::string &Dir : HSOpts.PrebuiltModulePaths) { SmallString<256> CachePath(Dir); - llvm::sys::fs::make_absolute(CachePath); + FileMgr.makeAbsolutePath(CachePath); llvm::sys::path::append(CachePath, ModuleCacheHash); std::string FileName = getCachedModuleFileNameImpl(ModuleName, ModuleMapPath, CachePath); From 11086a452c9d65c97e37d14b9e8f743db63e0ca9 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Thu, 30 Oct 2025 13:27:41 -0400 Subject: [PATCH 222/539] [ADT] Support `.Default` with `nullptr` and `nullopt` values in TypeSwitch (#165724) In the previous implementation, this would fail for cases like `TypeSwitch>` because `std::nullopt` does not match `ResultT` exactly and the overload for callable types would be selected. Add new overloads that support `nullptr` and `std::nullopt`. These can be added alongside generic callables because we wouldn't want to call any 'null' function refs anyway. 
I selected the `nullptr` and `nullopt` specializations because of how often they appear in the codebase -- currently, you will see lots of code like `.Default(std::optional())` that can be simplified with this patch. --- llvm/include/llvm/ADT/TypeSwitch.h | 17 +++++++++++ llvm/unittests/ADT/TypeSwitchTest.cpp | 41 +++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/llvm/include/llvm/ADT/TypeSwitch.h b/llvm/include/llvm/ADT/TypeSwitch.h index 5657303b0a1f2..50ca1d5a6b5b6 100644 --- a/llvm/include/llvm/ADT/TypeSwitch.h +++ b/llvm/include/llvm/ADT/TypeSwitch.h @@ -111,6 +111,7 @@ class TypeSwitch : public detail::TypeSwitchBase, T> { return std::move(*result); return defaultFn(this->value); } + /// As a default, return the given value. [[nodiscard]] ResultT Default(ResultT defaultResult) { if (result) @@ -118,6 +119,22 @@ class TypeSwitch : public detail::TypeSwitchBase, T> { return defaultResult; } + + /// Default for pointer-like results types that accept `nullptr`. + template >> + [[nodiscard]] ResultT Default(std::nullptr_t) { + return Default(ResultT(nullptr)); + } + + /// Default for optional results types that accept `std::nullopt`. + template >> + [[nodiscard]] ResultT Default(std::nullopt_t) { + return Default(ResultT(std::nullopt)); + } + /// Declare default as unreachable, making sure that all cases were handled.
[[nodiscard]] ResultT DefaultUnreachable( const char *message = "Fell off the end of a type-switch") { diff --git a/llvm/unittests/ADT/TypeSwitchTest.cpp b/llvm/unittests/ADT/TypeSwitchTest.cpp index a7d934265c5f0..b80122837c1ad 100644 --- a/llvm/unittests/ADT/TypeSwitchTest.cpp +++ b/llvm/unittests/ADT/TypeSwitchTest.cpp @@ -142,3 +142,44 @@ TEST(TypeSwitchTest, DefaultUnreachableWithVoid) { EXPECT_DEATH((void)translate(DerivedD()), "Unhandled type"); #endif } + +TEST(TypeSwitchTest, DefaultNullopt) { + auto translate = [](auto value) { + return TypeSwitch>(&value) + .Case([](DerivedA *) { return 0; }) + .Default(std::nullopt); + }; + EXPECT_EQ(0, translate(DerivedA())); + EXPECT_EQ(std::nullopt, translate(DerivedD())); +} + +TEST(TypeSwitchTest, DefaultNullptr) { + float foo = 0.0f; + auto translate = [&](auto value) { + return TypeSwitch(&value) + .Case([&](DerivedA *) { return &foo; }) + .Default(nullptr); + }; + EXPECT_EQ(&foo, translate(DerivedA())); + EXPECT_EQ(nullptr, translate(DerivedD())); +} + +TEST(TypeSwitchTest, DefaultNullptrForPointerLike) { + struct Value { + void *ptr; + Value(const Value &other) : ptr(other.ptr) {} + Value(std::nullptr_t) : ptr(nullptr) {} + Value() : Value(nullptr) {} + }; + + float foo = 0.0f; + Value fooVal; + fooVal.ptr = &foo; + auto translate = [&](auto value) { + return TypeSwitch(&value) + .Case([&](DerivedA *) { return fooVal; }) + .Default(nullptr); + }; + EXPECT_EQ(&foo, translate(DerivedA()).ptr); + EXPECT_EQ(nullptr, translate(DerivedD()).ptr); +} From ee0006e9f7acae78933ead8b7f74d8e96fc39f70 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 30 Oct 2025 16:41:37 +0000 Subject: [PATCH 223/539] Reland "[lldb-dap] Improving consistency of tests by removing concurrency." (#165688)" This reverts commit f205be095609aa61dfac3ae729406e0af2dcd15f. This new select mechanism has exposed the fact that the resources the Arm Linux bot has can vary a lot. 
We do limit it to a low number of parallel tests but in this case, I think it's write performance somewhere. Reland the changes since they work elsewhere, and disable lldb-dap tests on Arm Linux while I fix our buildbot. --- .../test/tools/lldb-dap/dap_server.py | 206 +++++++----------- .../test/tools/lldb-dap/lldbdap_testcase.py | 4 +- .../TestDAP_breakpointEvents.py | 30 ++- .../tools/lldb-dap/launch/TestDAP_launch.py | 2 +- .../module-event/TestDAP_module_event.py | 88 ++++---- .../tools/lldb-dap/module/TestDAP_module.py | 8 +- .../restart/TestDAP_restart_console.py | 24 +- .../lldb-dap/send-event/TestDAP_sendEvent.py | 2 +- 8 files changed, 161 insertions(+), 203 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index d892c01f0bc71..8f3652172dfdf 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -10,8 +10,8 @@ import subprocess import signal import sys -import threading import warnings +import selectors import time from typing import ( Any, @@ -139,35 +139,6 @@ def dump_memory(base_addr, data, num_per_line, outfile): outfile.write("\n") -def read_packet( - f: IO[bytes], trace_file: Optional[IO[str]] = None -) -> Optional[ProtocolMessage]: - """Decode a JSON packet that starts with the content length and is - followed by the JSON bytes from a file 'f'. Returns None on EOF. - """ - line = f.readline().decode("utf-8") - if len(line) == 0: - return None # EOF. 
- - # Watch for line that starts with the prefix - prefix = "Content-Length: " - if line.startswith(prefix): - # Decode length of JSON bytes - length = int(line[len(prefix) :]) - # Skip empty line - separator = f.readline().decode() - if separator != "": - Exception("malformed DAP content header, unexpected line: " + separator) - # Read JSON bytes - json_str = f.read(length).decode() - if trace_file: - trace_file.write("from adapter:\n%s\n" % (json_str)) - # Decode the JSON bytes into a python dictionary - return json.loads(json_str) - - raise Exception("unexpected malformed message from lldb-dap: " + line) - - def packet_type_is(packet, packet_type): return "type" in packet and packet["type"] == packet_type @@ -199,16 +170,8 @@ def __init__( self.log_file = log_file self.send = send self.recv = recv - - # Packets that have been received and processed but have not yet been - # requested by a test case. - self._pending_packets: List[Optional[ProtocolMessage]] = [] - # Received packets that have not yet been processed. - self._recv_packets: List[Optional[ProtocolMessage]] = [] - # Used as a mutex for _recv_packets and for notify when _recv_packets - # changes. 
- self._recv_condition = threading.Condition() - self._recv_thread = threading.Thread(target=self._read_packet_thread) + self.selector = selectors.DefaultSelector() + self.selector.register(recv, selectors.EVENT_READ) # session state self.init_commands = init_commands @@ -234,9 +197,6 @@ def __init__( # keyed by breakpoint id self.resolved_breakpoints: dict[str, Breakpoint] = {} - # trigger enqueue thread - self._recv_thread.start() - @classmethod def encode_content(cls, s: str) -> bytes: return ("Content-Length: %u\r\n\r\n%s" % (len(s), s)).encode("utf-8") @@ -252,17 +212,46 @@ def validate_response(cls, command, response): f"seq mismatch in response {command['seq']} != {response['request_seq']}" ) - def _read_packet_thread(self): - try: - while True: - packet = read_packet(self.recv, trace_file=self.trace_file) - # `packet` will be `None` on EOF. We want to pass it down to - # handle_recv_packet anyway so the main thread can handle unexpected - # termination of lldb-dap and stop waiting for new packets. - if not self._handle_recv_packet(packet): - break - finally: - dump_dap_log(self.log_file) + def _read_packet( + self, + timeout: float = DEFAULT_TIMEOUT, + ) -> Optional[ProtocolMessage]: + """Decode a JSON packet that starts with the content length and is + followed by the JSON bytes from self.recv. Returns None on EOF. + """ + + ready = self.selector.select(timeout) + if not ready: + warnings.warn( + "timeout occurred waiting for a packet, check if the test has a" + " negative assertion and see if it can be inverted.", + stacklevel=4, + ) + return None # timeout + + line = self.recv.readline().decode("utf-8") + if len(line) == 0: + return None # EOF. 
+ + # Watch for line that starts with the prefix + prefix = "Content-Length: " + if line.startswith(prefix): + # Decode length of JSON bytes + length = int(line[len(prefix) :]) + # Skip empty line + separator = self.recv.readline().decode() + if separator != "": + Exception("malformed DAP content header, unexpected line: " + separator) + # Read JSON bytes + json_str = self.recv.read(length).decode() + if self.trace_file: + self.trace_file.write( + "%s from adapter:\n%s\n" % (time.time(), json_str) + ) + # Decode the JSON bytes into a python dictionary + return json.loads(json_str) + + raise Exception("unexpected malformed message from lldb-dap: " + line) def get_modules( self, start_module: Optional[int] = None, module_count: Optional[int] = None @@ -310,34 +299,6 @@ def collect_output( output += self.get_output(category, clear=clear) return output - def _enqueue_recv_packet(self, packet: Optional[ProtocolMessage]): - with self.recv_condition: - self.recv_packets.append(packet) - self.recv_condition.notify() - - def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool: - """Handles an incoming packet. - - Called by the read thread that is waiting for all incoming packets - to store the incoming packet in "self._recv_packets" in a thread safe - way. This function will then signal the "self._recv_condition" to - indicate a new packet is available. - - Args: - packet: A new packet to store. - - Returns: - True if the caller should keep calling this function for more - packets. - """ - with self._recv_condition: - self._recv_packets.append(packet) - self._recv_condition.notify() - # packet is None on EOF - return packet is not None and not ( - packet["type"] == "response" and packet["command"] == "disconnect" - ) - def _recv_packet( self, *, @@ -361,46 +322,34 @@ def _recv_packet( The first matching packet for the given predicate, if specified, otherwise None. 
""" - assert ( - threading.current_thread != self._recv_thread - ), "Must not be called from the _recv_thread" - - def process_until_match(): - self._process_recv_packets() - for i, packet in enumerate(self._pending_packets): - if packet is None: - # We need to return a truthy value to break out of the - # wait_for, use `EOFError` as an indicator of EOF. - return EOFError() - if predicate and predicate(packet): - self._pending_packets.pop(i) - return packet - - with self._recv_condition: - packet = self._recv_condition.wait_for(process_until_match, timeout) - return None if isinstance(packet, EOFError) else packet - - def _process_recv_packets(self) -> None: + deadline = time.time() + timeout + + while time.time() < deadline: + packet = self._read_packet(timeout=deadline - time.time()) + if packet is None: + return None + self._process_recv_packet(packet) + if not predicate or predicate(packet): + return packet + + def _process_recv_packet(self, packet) -> None: """Process received packets, updating the session state.""" - with self._recv_condition: - for packet in self._recv_packets: - if packet and ("seq" not in packet or packet["seq"] == 0): - warnings.warn( - f"received a malformed packet, expected 'seq != 0' for {packet!r}" - ) - # Handle events that may modify any stateful properties of - # the DAP session. - if packet and packet["type"] == "event": - self._handle_event(packet) - elif packet and packet["type"] == "request": - # Handle reverse requests and keep processing. - self._handle_reverse_request(packet) - # Move the packet to the pending queue. - self._pending_packets.append(packet) - self._recv_packets.clear() + if packet and ("seq" not in packet or packet["seq"] == 0): + warnings.warn( + f"received a malformed packet, expected 'seq != 0' for {packet!r}" + ) + # Handle events that may modify any stateful properties of + # the DAP session. 
+ if packet and packet["type"] == "event": + self._handle_event(packet) + elif packet and packet["type"] == "request": + # Handle reverse requests and keep processing. + self._handle_reverse_request(packet) def _handle_event(self, packet: Event) -> None: """Handle any events that modify debug session state we track.""" + self.events.append(packet) + event = packet["event"] body: Optional[Dict] = packet.get("body", None) @@ -453,6 +402,8 @@ def _handle_event(self, packet: Event) -> None: self.invalidated_event = packet elif event == "memory": self.memory_event = packet + elif event == "module": + self.module_events.append(packet) def _handle_reverse_request(self, request: Request) -> None: if request in self.reverse_requests: @@ -521,18 +472,14 @@ def send_packet(self, packet: ProtocolMessage) -> int: Returns the seq number of the request. """ - # Set the seq for requests. - if packet["type"] == "request": - packet["seq"] = self.sequence - self.sequence += 1 - else: - packet["seq"] = 0 + packet["seq"] = self.sequence + self.sequence += 1 # Encode our command dictionary as a JSON string json_str = json.dumps(packet, separators=(",", ":")) if self.trace_file: - self.trace_file.write("to adapter:\n%s\n" % (json_str)) + self.trace_file.write("%s to adapter:\n%s\n" % (time.time(), json_str)) length = len(json_str) if length > 0: @@ -913,6 +860,8 @@ def request_restart(self, restartArguments=None): if restartArguments: command_dict["arguments"] = restartArguments + # Clear state, the process is about to restart... + self._process_continued(True) response = self._send_recv(command_dict) # Caller must still call wait_for_stopped. 
return response @@ -1479,8 +1428,10 @@ def request_testGetTargetBreakpoints(self): def terminate(self): self.send.close() - if self._recv_thread.is_alive(): - self._recv_thread.join() + self.recv.close() + self.selector.close() + if self.log_file: + dump_dap_log(self.log_file) def request_setInstructionBreakpoints(self, memory_reference=[]): breakpoints = [] @@ -1577,6 +1528,7 @@ def launch( stdout=subprocess.PIPE, stderr=sys.stderr, env=adapter_env, + bufsize=0, ) if connection is None: diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index 29935bb8046ff..a897c1b014597 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -15,6 +15,8 @@ # DAP tests as a whole have been flakey on the Windows on Arm bot. See: # https://github.com/llvm/llvm-project/issues/137660 @skipIf(oslist=["windows"], archs=["aarch64"]) +# The Arm Linux bot needs stable resources before it can run these tests reliably. +@skipif(oslist=["linux]"], archs=["arm$"]) class DAPTestCaseBase(TestBase): # set timeout based on whether ASAN was enabled or not. Increase # timeout by a factor of 10 if ASAN is enabled. 
@@ -416,7 +418,7 @@ def continue_to_next_stop(self): return self.dap_server.wait_for_stopped() def continue_to_breakpoint(self, breakpoint_id: str): - self.continue_to_breakpoints((breakpoint_id)) + self.continue_to_breakpoints([breakpoint_id]) def continue_to_breakpoints(self, breakpoint_ids): self.do_continue() diff --git a/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py b/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py index beab4d6c1f5a6..7b78541fb4f8e 100644 --- a/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py +++ b/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py @@ -81,24 +81,20 @@ def test_breakpoint_events(self): breakpoint["verified"], "expect foo breakpoint to not be verified" ) - # Flush the breakpoint events. - self.dap_server.wait_for_breakpoint_events() - # Continue to the breakpoint - self.continue_to_breakpoints(dap_breakpoint_ids) + self.continue_to_breakpoint(foo_bp_id) + self.continue_to_next_stop() # foo_bp2 + self.continue_to_breakpoint(main_bp_id) + self.continue_to_exit() - verified_breakpoint_ids = [] - unverified_breakpoint_ids = [] - for breakpoint_event in self.dap_server.wait_for_breakpoint_events(): - breakpoint = breakpoint_event["body"]["breakpoint"] - id = breakpoint["id"] - if breakpoint["verified"]: - verified_breakpoint_ids.append(id) - else: - unverified_breakpoint_ids.append(id) + bp_events = [e for e in self.dap_server.events if e["event"] == "breakpoint"] - self.assertIn(main_bp_id, unverified_breakpoint_ids) - self.assertIn(foo_bp_id, unverified_breakpoint_ids) + main_bp_events = [ + e for e in bp_events if e["body"]["breakpoint"]["id"] == main_bp_id + ] + foo_bp_events = [ + e for e in bp_events if e["body"]["breakpoint"]["id"] == foo_bp_id + ] - self.assertIn(main_bp_id, verified_breakpoint_ids) - self.assertIn(foo_bp_id, verified_breakpoint_ids) + self.assertTrue(main_bp_events) + self.assertTrue(foo_bp_events) diff 
--git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py index ca881f1d817c5..09b13223e0a78 100644 --- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py +++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py @@ -156,6 +156,7 @@ def test_debuggerRoot(self): self.build_and_launch( program, debuggerRoot=program_parent_dir, initCommands=commands ) + self.continue_to_exit() output = self.get_console() self.assertTrue(output and len(output) > 0, "expect console output") lines = output.splitlines() @@ -171,7 +172,6 @@ def test_debuggerRoot(self): % (program_parent_dir, line[len(prefix) :]), ) self.assertTrue(found, "verified lldb-dap working directory") - self.continue_to_exit() def test_sourcePath(self): """ diff --git a/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py b/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py index 1f4afabbd161e..9d1d17b704f76 100644 --- a/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py +++ b/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py @@ -1,58 +1,58 @@ -import dap_server +""" +Test 'module' events for dynamically loaded libraries. 
+""" + from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil import lldbdap_testcase -import re class TestDAP_module_event(lldbdap_testcase.DAPTestCaseBase): + def lookup_module_id(self, name): + """Returns the identifier for the first module event starting with the given name.""" + for event in self.dap_server.module_events: + if self.get_dict_value(event, ["body", "module", "name"]).startswith(name): + return self.get_dict_value(event, ["body", "module", "id"]) + self.fail(f"No module events matching name={name}") + + def module_events(self, id): + """Finds all module events by identifier.""" + return [ + event + for event in self.dap_server.module_events + if self.get_dict_value(event, ["body", "module", "id"]) == id + ] + + def module_reasons(self, events): + """Returns the list of 'reason' values from the given events.""" + return [event["body"]["reason"] for event in events] + @skipIfWindows def test_module_event(self): + """ + Test that module events are fired on target load and when the list of + dynamic libraries updates while running. + """ program = self.getBuildArtifact("a.out") self.build_and_launch(program) + # We can analyze the order of events after the process exits. + self.continue_to_exit() - source = "main.cpp" - breakpoint1_line = line_number(source, "// breakpoint 1") - breakpoint2_line = line_number(source, "// breakpoint 2") - breakpoint3_line = line_number(source, "// breakpoint 3") + a_out_id = self.lookup_module_id("a.out") + a_out_events = self.module_events(id=a_out_id) - breakpoint_ids = self.set_source_breakpoints( - source, [breakpoint1_line, breakpoint2_line, breakpoint3_line] + self.assertIn( + "new", + self.module_reasons(a_out_events), + "Expected a.out to load during the debug session.", ) - self.continue_to_breakpoints(breakpoint_ids) - - # We're now stopped at breakpoint 1 before the dlopen. Flush all the module events. 
- event = self.dap_server.wait_for_event(["module"]) - while event is not None: - event = self.dap_server.wait_for_event(["module"]) - - # Continue to the second breakpoint, before the dlclose. - self.continue_to_breakpoints(breakpoint_ids) - - # Make sure we got a module event for libother. - event = self.dap_server.wait_for_event(["module"]) - self.assertIsNotNone(event, "didn't get a module event") - module_name = event["body"]["module"]["name"] - module_id = event["body"]["module"]["id"] - self.assertEqual(event["body"]["reason"], "new") - self.assertIn("libother", module_name) - - # Continue to the third breakpoint, after the dlclose. - self.continue_to_breakpoints(breakpoint_ids) - - # Make sure we got a module event for libother. - event = self.dap_server.wait_for_event(["module"]) - self.assertIsNotNone(event, "didn't get a module event") - reason = event["body"]["reason"] - self.assertEqual(reason, "removed") - self.assertEqual(event["body"]["module"]["id"], module_id) - - # The removed module event should omit everything but the module id and name - # as they are required fields. - module_data = event["body"]["module"] - required_keys = ["id", "name"] - self.assertListEqual(list(module_data.keys()), required_keys) - self.assertEqual(module_data["name"], "", "expects empty name.") - self.continue_to_exit() + libother_id = self.lookup_module_id( + "libother." # libother.so or libother.dylib based on OS. 
+ ) + libother_events = self.module_events(id=libother_id) + self.assertEqual( + self.module_reasons(libother_events), + ["new", "removed"], + "Expected libother to be loaded then unloaded during the debug session.", + ) diff --git a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py index 0ed53dac5d869..2d00c512721c6 100644 --- a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py +++ b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py @@ -64,19 +64,18 @@ def check_symbols_loaded_with_size(): self.assertEqual(program, program_module["path"]) self.assertIn("addressRange", program_module) + self.continue_to_exit() + # Collect all the module names we saw as events. module_new_names = [] module_changed_names = [] - module_event = self.dap_server.wait_for_event(["module"]) - while module_event is not None: + for module_event in self.dap_server.module_events: reason = module_event["body"]["reason"] if reason == "new": module_new_names.append(module_event["body"]["module"]["name"]) elif reason == "changed": module_changed_names.append(module_event["body"]["module"]["name"]) - module_event = self.dap_server.wait_for_event(["module"]) - # Make sure we got an event for every active module. self.assertNotEqual(len(module_new_names), 0) for module in active_modules: @@ -86,7 +85,6 @@ def check_symbols_loaded_with_size(): # symbols got added. 
self.assertNotEqual(len(module_changed_names), 0) self.assertIn(program_module["name"], module_changed_names) - self.continue_to_exit() @skipIfWindows def test_modules(self): diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py index e1ad1425a993d..fa62ec243f5c5 100644 --- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py +++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py @@ -30,7 +30,11 @@ def verify_stopped_on_entry(self, stopped_events: List[Dict[str, Any]]): if reason == "entry": seen_stopped_event += 1 - self.assertEqual(seen_stopped_event, 1, "expect only one stopped entry event.") + self.assertEqual( + seen_stopped_event, + 1, + f"expect only one stopped entry event in {stopped_events}", + ) @skipIfAsan @skipIfWindows @@ -92,11 +96,13 @@ def test_stopOnEntry(self): self.build_and_launch(program, console="integratedTerminal", stopOnEntry=True) [bp_main] = self.set_function_breakpoints(["main"]) - self.dap_server.request_continue() # sends configuration done - stopped_events = self.dap_server.wait_for_stopped() + self.dap_server.request_configurationDone() + stopped_threads = list(self.dap_server.thread_stop_reasons.values()) # We should be stopped at the entry point. - self.assertGreaterEqual(len(stopped_events), 0, "expect stopped events") - self.verify_stopped_on_entry(stopped_events) + self.assertEqual( + len(stopped_threads), 1, "Expected the main thread to be stopped on entry." + ) + self.assertEqual(stopped_threads[0]["reason"], "entry") # Then, if we continue, we should hit the breakpoint at main. self.dap_server.request_continue() @@ -105,8 +111,12 @@ def test_stopOnEntry(self): # Restart and check that we still get a stopped event before reaching # main. 
self.dap_server.request_restart() - stopped_events = self.dap_server.wait_for_stopped() - self.verify_stopped_on_entry(stopped_events) + stopped_threads = list(self.dap_server.thread_stop_reasons.values()) + # We should be stopped at the entry point. + self.assertEqual( + len(stopped_threads), 1, "Expected the main thread to be stopped on entry." + ) + self.assertEqual(stopped_threads[0]["reason"], "entry") # continue to main self.dap_server.request_continue() diff --git a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py index a01845669666f..0184020589176 100644 --- a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py +++ b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py @@ -32,7 +32,7 @@ def test_send_event(self): ], ) self.set_source_breakpoints(source, [breakpoint_line]) - self.continue_to_next_stop() + self.do_continue() custom_event = self.dap_server.wait_for_event( filter=["my-custom-event-no-body"] From 08a9de355f2b0d8dc0bcd5c3edb00fdb0035f1d2 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 30 Oct 2025 17:33:05 +0000 Subject: [PATCH 224/539] [lldb][test] Fix typo in Arm Linux lldb-dap skip Fixes 17dbd8690e36f8e514fb47f4418f78420d0fc019. --- .../Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index a897c1b014597..97c7f2d9e1b4a 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -16,7 +16,7 @@ # https://github.com/llvm/llvm-project/issues/137660 @skipIf(oslist=["windows"], archs=["aarch64"]) # The Arm Linux bot needs stable resources before it can run these tests reliably. 
-@skipif(oslist=["linux]"], archs=["arm$"]) +@skipif(oslist=["linux"], archs=["arm$"]) class DAPTestCaseBase(TestBase): # set timeout based on whether ASAN was enabled or not. Increase # timeout by a factor of 10 if ASAN is enabled. From ae7bd07a36de3010c44152ed1e3d146a0ce94cd8 Mon Sep 17 00:00:00 2001 From: Marcell Leleszi <59964679+mleleszi@users.noreply.github.com> Date: Thu, 30 Oct 2025 18:35:20 +0100 Subject: [PATCH 225/539] [libc] Remove faccessat entrypoint if faccessat2 syscall is not available (#164936) [#163091](https://github.com/llvm/llvm-project/issues/163091) Remove unistd.faccessat entrypoint for x86 linux if faccessat2 syscall is not available. Tested with non existent symbol and exclusion works. --- libc/CMakeLists.txt | 2 +- libc/config/linux/x86_64/exclude.txt | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index 14718e2090bde..ae555a256ba66 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -363,7 +363,7 @@ elseif(LLVM_LIBC_FULL_BUILD) message(FATAL_ERROR "${LIBC_CONFIG_PATH}/headers.txt file not found and fullbuild requested.") endif() -# Check exclude.txt that appends to LIBC_EXCLUDE_ENTRYPOINTS list +# Check exclude.txt that appends to TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS list if(EXISTS "${LIBC_CONFIG_PATH}/exclude.txt") include("${LIBC_CONFIG_PATH}/exclude.txt") endif() diff --git a/libc/config/linux/x86_64/exclude.txt b/libc/config/linux/x86_64/exclude.txt index 2c218b753b176..a0686310d21ac 100644 --- a/libc/config/linux/x86_64/exclude.txt +++ b/libc/config/linux/x86_64/exclude.txt @@ -19,3 +19,11 @@ if(NOT has_sys_random) ) endif() endif() + +include(CheckSymbolExists) +check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) +if(NOT HAVE_SYS_FACCESSAT2) + list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS + libc.src.unistd.faccessat + ) +endif() From d99ed292650026fbb7f79903ee1f7f447e60fbc9 Mon Sep 17 00:00:00 2001 From: Marcell Leleszi 
<59964679+mleleszi@users.noreply.github.com> Date: Thu, 30 Oct 2025 18:35:42 +0100 Subject: [PATCH 226/539] [libc] Fix off by one error in strftime (#165711) This patch fixes a bug in strftime's return value when the formatted output exactly fills the buffer, not including the null terminator. The previous check failed to account for the null terminator in this case, incorrectly returning the written count instead of 0. --- libc/src/time/strftime.cpp | 2 +- libc/src/time/strftime_l.cpp | 2 +- libc/test/src/time/strftime_test.cpp | 20 ++++++++++++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/libc/src/time/strftime.cpp b/libc/src/time/strftime.cpp index f36091bc9736e..89b7d9bb7c1b9 100644 --- a/libc/src/time/strftime.cpp +++ b/libc/src/time/strftime.cpp @@ -26,7 +26,7 @@ LLVM_LIBC_FUNCTION(size_t, strftime, int ret = strftime_core::strftime_main(&writer, format, timeptr); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - return (ret < 0 || static_cast(ret) > buffsz) ? 0 : ret; + return (ret < 0 || static_cast(ret) >= buffsz) ? 0 : ret; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/strftime_l.cpp b/libc/src/time/strftime_l.cpp index 201b85da39ee2..409f8683b7289 100644 --- a/libc/src/time/strftime_l.cpp +++ b/libc/src/time/strftime_l.cpp @@ -29,7 +29,7 @@ LLVM_LIBC_FUNCTION(size_t, strftime_l, int ret = strftime_core::strftime_main(&writer, format, timeptr); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - return (ret < 0 || static_cast(ret) > buffsz) ? 0 : ret; + return (ret < 0 || static_cast(ret) >= buffsz) ? 
0 : ret; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/time/strftime_test.cpp b/libc/test/src/time/strftime_test.cpp index cac7560b2b945..38176f77804d5 100644 --- a/libc/test/src/time/strftime_test.cpp +++ b/libc/test/src/time/strftime_test.cpp @@ -2326,3 +2326,23 @@ TEST(LlvmLibcStrftimeTest, TimeFormatFullDateTime) { // size_t written = 0; // SimplePaddedNum spn; // } + +TEST(LlvmLibcStrftimeTest, BufferTooSmall) { + struct tm time; + char buffer[1]; + + time.tm_year = get_adjusted_year(2025); + time.tm_mon = 10; + time.tm_mday = 24; + + size_t written = + LIBC_NAMESPACE::strftime(buffer, sizeof(buffer), "%F", &time); + EXPECT_EQ(written, size_t{0}); + + char buffer2[10]; + + // The string "2025-11-24" is 10 chars, + // so strftime needs 10 + 1 bytes to write the string and the null terminator. + written = LIBC_NAMESPACE::strftime(buffer, sizeof(buffer2), "%F", &time); + EXPECT_EQ(written, size_t{0}); +} From 0dac416508d50881973f36a89ca87dbc4c2bf7fe Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Thu, 30 Oct 2025 10:37:15 -0700 Subject: [PATCH 227/539] [SLU][profcheck] Estimate branch weights in partial unswitch cases (#164035) In the case of a partial unswitch, we take the invariant part of an expression consisting of either conjunctions or disjunctions, and hoist it out of the loop, conditioning a branch on it (==the invariant part). We can't correctly calculate the branch probability of this new branch, but can use the probability of the existing branch as a bound. That would preserve block frequencies better than allowing for the default, static (50-50) probability for that branch. 
Issue #147390 --- .../Transforms/Scalar/SimpleLoopUnswitch.cpp | 60 ++++++- .../nontrivial-unswitch-profile.ll | 89 ++++++++++ .../Transforms/SimpleLoopUnswitch/pr60736.ll | 11 +- .../simple-unswitch-profile.ll | 157 ++++++++++++++++++ 4 files changed, 305 insertions(+), 12 deletions(-) create mode 100644 llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-profile.ll create mode 100644 llvm/test/Transforms/SimpleLoopUnswitch/simple-unswitch-profile.ll diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 5af6c96c56a06..bb6c879f4d47e 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -81,6 +81,7 @@ STATISTIC( STATISTIC(NumInvariantConditionsInjected, "Number of invariant conditions injected and unswitched"); +namespace llvm { static cl::opt EnableNonTrivialUnswitch( "enable-nontrivial-unswitch", cl::init(false), cl::Hidden, cl::desc("Forcibly enables non-trivial loop unswitching rather than " @@ -131,11 +132,17 @@ static cl::opt InjectInvariantConditions( static cl::opt InjectInvariantConditionHotnesThreshold( "simple-loop-unswitch-inject-invariant-condition-hotness-threshold", - cl::Hidden, cl::desc("Only try to inject loop invariant conditions and " - "unswitch on them to eliminate branches that are " - "not-taken 1/ times or less."), + cl::Hidden, + cl::desc("Only try to inject loop invariant conditions and " + "unswitch on them to eliminate branches that are " + "not-taken 1/ times or less."), cl::init(16)); +static cl::opt EstimateProfile("simple-loop-unswitch-estimate-profile", + cl::Hidden, cl::init(true)); +extern cl::opt ProfcheckDisableMetadataFixes; +} // namespace llvm + AnalysisKey ShouldRunExtraSimpleLoopUnswitch::Key; namespace { struct CompareDesc { @@ -268,13 +275,42 @@ static bool areLoopExitPHIsLoopInvariant(const Loop &L, llvm_unreachable("Basic blocks should never be empty!"); } -/// Copy a set of loop 
invariant values \p ToDuplicate and insert them at the +/// Copy a set of loop invariant values \p Invariants and insert them at the /// end of \p BB and conditionally branch on the copied condition. We only /// branch on a single value. +/// We attempt to estimate the profile of the resulting conditional branch from +/// \p ComputeProfFrom, which is the original conditional branch we're +/// unswitching. +/// When \p Direction is true, the \p Invariants form a disjunction, and the +/// branch conditioned on it exits the loop on the "true" case. When \p +/// Direction is false, the \p Invariants form a conjunction and the branch +/// exits on the "false" case. static void buildPartialUnswitchConditionalBranch( BasicBlock &BB, ArrayRef Invariants, bool Direction, BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze, - const Instruction *I, AssumptionCache *AC, const DominatorTree &DT) { + const Instruction *I, AssumptionCache *AC, const DominatorTree &DT, + const BranchInst &ComputeProfFrom) { + + SmallVector BranchWeights; + bool HasBranchWeights = EstimateProfile && !ProfcheckDisableMetadataFixes && + extractBranchWeights(ComputeProfFrom, BranchWeights); + // If Direction is true, that means we had a disjunction and that the "true" + // case exits. The probability of the disjunction of the subset of terms is at + // most as high as the original one. So, if the probability is higher than the + // one we'd assign in absence of a profile (i.e. 0.5), we will use 0.5, + // but if it's lower, we will use the original probability. + // Conversely, if Direction is false, that means we had a conjunction, and the + // probability of exiting is captured in the second branch weight. That + // probability is a disjunction (of the negation of the original terms). The + // same reasoning applies as above. + // Issue #165649: should we expect BFI to conserve, and use that to calculate + // the branch weights? 
+ if (HasBranchWeights && + static_cast(BranchWeights[Direction ? 0 : 1]) / + static_cast(sum_of(BranchWeights)) > + 0.5) + HasBranchWeights = false; + IRBuilder<> IRB(&BB); IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); @@ -287,8 +323,14 @@ static void buildPartialUnswitchConditionalBranch( Value *Cond = Direction ? IRB.CreateOr(FrozenInvariants) : IRB.CreateAnd(FrozenInvariants); - IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, - Direction ? &NormalSucc : &UnswitchedSucc); + auto *BR = IRB.CreateCondBr( + Cond, Direction ? &UnswitchedSucc : &NormalSucc, + Direction ? &NormalSucc : &UnswitchedSucc, + HasBranchWeights ? ComputeProfFrom.getMetadata(LLVMContext::MD_prof) + : nullptr); + if (!HasBranchWeights) + setExplicitlyUnknownBranchWeightsIfProfiled( + *BR, *BR->getParent()->getParent(), DEBUG_TYPE); } /// Copy a set of loop invariant values, and conditionally branch on them. @@ -658,7 +700,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, " condition!"); buildPartialUnswitchConditionalBranch( *OldPH, Invariants, ExitDirection, *UnswitchedBB, *NewPH, - FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT); + FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT, BI); } // Update the dominator tree with the added edge. 
@@ -2477,7 +2519,7 @@ static void unswitchNontrivialInvariants( else { buildPartialUnswitchConditionalBranch( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, - FreezeLoopUnswitchCond, BI, &AC, DT); + FreezeLoopUnswitchCond, BI, &AC, DT, *BI); } DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-profile.ll b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-profile.ll new file mode 100644 index 0000000000000..9cc417f6b874e --- /dev/null +++ b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-profile.ll @@ -0,0 +1,89 @@ +; RUN: split-file %s %t +; RUN: cat %t/main.ll %t/probable-or.prof > %t/probable-or.ll +; RUN: cat %t/main.ll %t/probable-and.prof > %t/probable-and.ll +; RUN: opt -passes='loop(simple-loop-unswitch)' -S %t/probable-or.ll -o -| FileCheck %t/probable-or.prof +; RUN: opt -passes='loop(simple-loop-unswitch)' -S %t/probable-and.ll -o -| FileCheck %t/probable-and.prof + +;--- main.ll +declare i32 @a() +declare i32 @b() + +define i32 @or(ptr %ptr, i1 %cond) !prof !0 { +entry: + br label %loop_begin + +loop_begin: + %v1 = load i1, ptr %ptr + %cond_or = or i1 %v1, %cond + br i1 %cond_or, label %loop_a, label %loop_b, !prof !1 + +loop_a: + call i32 @a() + br label %latch + +loop_b: + call i32 @b() + br label %latch + +latch: + %v2 = load i1, ptr %ptr + br i1 %v2, label %loop_begin, label %loop_exit, !prof !2 + +loop_exit: + ret i32 0 +} + +define i32 @and(ptr %ptr, i1 %cond) !prof !0 { +entry: + br label %loop_begin + +loop_begin: + %v1 = load i1, ptr %ptr + %cond_and = and i1 %v1, %cond + br i1 %cond_and, label %loop_a, label %loop_b, !prof !1 + +loop_a: + call i32 @a() + br label %latch + +loop_b: + call i32 @b() + br label %latch + +latch: + %v2 = load i1, ptr %ptr + br i1 %v2, label %loop_begin, label %loop_exit, !prof !2 + +loop_exit: + ret i32 0 +} + +;--- probable-or.prof +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 
1, i32 1000} +!2 = !{!"branch_weights", i32 5, i32 7} +; CHECK-LABEL: @or +; CHECK-LABEL: entry: +; CHECK-NEXT: %cond.fr = freeze i1 %cond +; CHECK-NEXT: br i1 %cond.fr, label %entry.split.us, label %entry.split, !prof !1 +; CHECK-LABEL: @and +; CHECK-LABEL: entry: +; CHECK-NEXT: %cond.fr = freeze i1 %cond +; CHECK-NEXT: br i1 %cond.fr, label %entry.split, label %entry.split.us, !prof !3 +; CHECK: !1 = !{!"branch_weights", i32 1, i32 1000} +; CHECK: !3 = !{!"unknown", !"simple-loop-unswitch"} + +;--- probable-and.prof +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 1000, i32 1} +!2 = !{!"branch_weights", i32 5, i32 7} +; CHECK-LABEL: @or +; CHECK-LABEL: entry: +; CHECK-NEXT: %cond.fr = freeze i1 %cond +; CHECK-NEXT: br i1 %cond.fr, label %entry.split.us, label %entry.split, !prof !1 +; CHECK-LABEL: @and +; CHECK-LABEL: entry: +; CHECK-NEXT: %cond.fr = freeze i1 %cond +; CHECK-NEXT: br i1 %cond.fr, label %entry.split, label %entry.split.us, !prof !3 +; CHECK: !1 = !{!"unknown", !"simple-loop-unswitch"} +; CHECK: !3 = !{!"branch_weights", i32 1000, i32 1} diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/pr60736.ll b/llvm/test/Transforms/SimpleLoopUnswitch/pr60736.ll index 0964c55d1dec6..3760be4b26f23 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/pr60736.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/pr60736.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt < %s -simple-loop-unswitch-inject-invariant-conditions=true -passes='loop(simple-loop-unswitch,loop-instsimplify)' -S | FileCheck %s define void @test() { @@ -7,7 +7,7 @@ define void @test() { ; CHECK-NEXT: [[TMP:%.*]] = call i1 @llvm.experimental.widenable.condition() ; CHECK-NEXT: [[TMP1:%.*]] = load atomic i32, ptr addrspace(1) poison unordered, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load atomic i32, ptr addrspace(1) poison 
unordered, align 8 -; CHECK-NEXT: br i1 [[TMP]], label [[BB_SPLIT:%.*]], label [[BB3_SPLIT_US:%.*]] +; CHECK-NEXT: br i1 [[TMP]], label [[BB_SPLIT:%.*]], label [[BB3_SPLIT_US:%.*]], !prof [[PROF0:![0-9]+]] ; CHECK: bb.split: ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb3: @@ -19,7 +19,7 @@ define void @test() { ; CHECK-NEXT: [[TMP6_US:%.*]] = phi i32 [ poison, [[BB3_SPLIT_US]] ] ; CHECK-NEXT: [[TMP7_US:%.*]] = add nuw nsw i32 [[TMP6_US]], 2 ; CHECK-NEXT: [[TMP8_US:%.*]] = icmp ult i32 [[TMP7_US]], [[TMP2]] -; CHECK-NEXT: br i1 [[TMP8_US]], label [[BB9_US:%.*]], label [[BB16_SPLIT_US:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8_US]], label [[BB9_US:%.*]], label [[BB16_SPLIT_US:%.*]], !prof [[PROF0]] ; CHECK: bb9.us: ; CHECK-NEXT: br label [[BB17_SPLIT_US:%.*]] ; CHECK: bb16.split.us: @@ -96,3 +96,8 @@ declare i1 @llvm.experimental.widenable.condition() !0 = !{!"branch_weights", i32 1048576, i32 1} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(inaccessiblemem: readwrite) } +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 1048576, i32 1} +;. 
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/simple-unswitch-profile.ll b/llvm/test/Transforms/SimpleLoopUnswitch/simple-unswitch-profile.ll new file mode 100644 index 0000000000000..ec6baa5b3772f --- /dev/null +++ b/llvm/test/Transforms/SimpleLoopUnswitch/simple-unswitch-profile.ll @@ -0,0 +1,157 @@ +; RUN: split-file %s %t +; RUN: cat %t/main.ll %t/probable-or.prof > %t/probable-or.ll +; RUN: cat %t/main.ll %t/probable-and.prof > %t/probable-and.ll +; RUN: opt -passes='loop-mssa(simple-loop-unswitch)' -S %t/probable-or.ll -o - | FileCheck %t/probable-or.prof +; RUN: opt -passes='loop-mssa(simple-loop-unswitch)' -S %t/probable-and.ll -o - | FileCheck %t/probable-and.prof +; +; RUN: opt -passes='module(print),function(loop-mssa(simple-loop-unswitch)),module(print)' \ +; RUN: %t/probable-or.ll -disable-output -simple-loop-unswitch-estimate-profile=0 2>&1 | FileCheck %t/probable-or.prof --check-prefixes=PROFILE-COM,PROFILE-REF + +; RUN: opt -passes='module(print),function(loop-mssa(simple-loop-unswitch)),module(print)' \ +; RUN: %t/probable-or.ll -disable-output -simple-loop-unswitch-estimate-profile=1 2>&1 | FileCheck %t/probable-or.prof --check-prefixes=PROFILE-COM,PROFILE-CHK + +; RUN: opt -passes='module(print),function(loop-mssa(simple-loop-unswitch)),module(print)' \ +; RUN: %t/probable-and.ll -disable-output -simple-loop-unswitch-estimate-profile=0 2>&1 | FileCheck %t/probable-and.prof --check-prefixes=PROFILE-COM,PROFILE-REF + +; RUN: opt -passes='module(print),function(loop-mssa(simple-loop-unswitch)),module(print)' \ +; RUN: %t/probable-and.ll -disable-output -simple-loop-unswitch-estimate-profile=1 2>&1 | FileCheck %t/probable-and.prof --check-prefixes=PROFILE-COM,PROFILE-CHK + +;--- main.ll +declare void @some_func() noreturn + +define i32 @or(i1 %cond1, i32 %var1) !prof !0 { +entry: + br label %loop_begin + +loop_begin: + %var3 = phi i32 [%var1, %entry], [%var2, %do_something] + %cond2 = icmp eq i32 %var3, 10 + %cond.or = or i1 %cond1, %cond2 + 
br i1 %cond.or, label %loop_exit, label %do_something, !prof !1 + +do_something: + %var2 = add i32 %var3, 1 + call void @some_func() noreturn nounwind + br label %loop_begin + +loop_exit: + ret i32 0 +} + +define i32 @and(i1 %cond1, i32 %var1) !prof !0 { +entry: + br label %loop_begin + +loop_begin: + %var3 = phi i32 [%var1, %entry], [%var2, %do_something] + %cond2 = icmp eq i32 %var3, 10 + %cond.and = and i1 %cond1, %cond2 + br i1 %cond.and, label %do_something, label %loop_exit, !prof !1 + +do_something: + %var2 = add i32 %var3, 1 + call void @some_func() noreturn nounwind + br label %loop_begin + +loop_exit: + ret i32 0 +} + +;--- probable-or.prof +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 1, i32 1000} +; CHECK-LABEL: @or +; CHECK-LABEL: entry: +; CHECK-NEXT: %cond1.fr = freeze i1 %cond1 +; CHECK-NEXT: br i1 %cond1.fr, label %loop_exit.split, label %entry.split, !prof !1 +; CHECK-LABEL: @and +; CHECK-LABEL: entry: +; CHECK-NEXT: %cond1.fr = freeze i1 %cond1 +; CHECK-NEXT: br i1 %cond1.fr, label %entry.split, label %loop_exit.split, !prof !2 +; CHECK: !1 = !{!"branch_weights", i32 1, i32 1000} +; CHECK: !2 = !{!"unknown", !"simple-loop-unswitch"} + +; PROFILE-COM: Printing analysis results of BFI for function 'or': +; PROFILE-COM: block-frequency-info: or + ; PROFILE-COM: - entry: {{.*}} count = 10 + ; PROFILE-COM: - loop_begin: {{.*}} count = 10010 + ; PROFILE-COM: - do_something: {{.*}} count = 10000 + ; PROFILE-COM: - loop_exit: {{.*}} count = 10 + +; PROFILE-COM: Printing analysis results of BFI for function 'and': +; PROFILE-COM: block-frequency-info: and + ; PROFILE-COM: - entry: {{.*}} count = 10 + ; PROFILE-COM: - loop_begin: {{.*}} count = 10 + ; PROFILE-COM: - do_something: {{.*}} count = 0 + ; PROFILE-COM: - loop_exit: {{.*}} count = 10 + +; PROFILE-COM: Printing analysis results of BFI for function 'or': +; PROFILE-COM: block-frequency-info: or + ; PROFILE-COM: - entry: {{.*}} count = 10 + ; PROFILE-REF: - entry.split: 
{{.*}} count = 5 + ; PROFILE-CHK: - entry.split: {{.*}} count = 10 + ; PROFILE-REF: - loop_begin: {{.*}} count = 5005 + ; PROFILE-CHK: - loop_begin: {{.*}} count = 10000 + ; PROFILE-REF: - do_something: {{.*}} count = 5000 + ; PROFILE-CHK: - do_something: {{.*}} count = 9990 + ; PROFILE-REF: - loop_exit: {{.*}} count = 5 + ; PROFILE-CHK: - loop_exit: {{.*}} count = 10 + ; PROFILE-COM: - loop_exit.split: {{.*}} count = 10 + +; PROFILE-COM: Printing analysis results of BFI for function 'and': +; PROFILE-COM: block-frequency-info: and + ; PROFILE-COM: - entry: {{.*}} count = 10 + ; PROFILE-COM: - entry.split: {{.*}} count = 5 + ; PROFILE-COM: - loop_begin: {{.*}} count = 5 + ; PROFILE-COM: - do_something: {{.*}} count = 0 + ; PROFILE-COM: - loop_exit: {{.*}} count = 5 + ; PROFILE-COM: - loop_exit.split: {{.*}} count = 10 + +;--- probable-and.prof +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 1000, i32 1} +; CHECK-LABEL: @or +; CHECK-LABEL: entry: +; CHECK-NEXT: %cond1.fr = freeze i1 %cond1 +; CHECK-NEXT: br i1 %cond1.fr, label %loop_exit.split, label %entry.split, !prof !1 +; CHECK-LABEL: @and +; CHECK-LABEL: entry: +; CHECK-NEXT: %cond1.fr = freeze i1 %cond1 +; CHECK-NEXT: br i1 %cond1.fr, label %entry.split, label %loop_exit.split, !prof !2 +; CHECK: !1 = !{!"unknown", !"simple-loop-unswitch"} +; CHECK: !2 = !{!"branch_weights", i32 1000, i32 1} +; PROFILE-COM: Printing analysis results of BFI for function 'or': +; PROFILE-COM: block-frequency-info: or + ; PROFILE-COM: - entry: {{.*}}, count = 10 + ; PROFILE-COM: - loop_begin: {{.*}}, count = 10 + ; PROFILE-COM: - do_something: {{.*}}, count = 0 + ; PROFILE-COM: - loop_exit: {{.*}}, count = 10 + +; PROFILE-COM: Printing analysis results of BFI for function 'and': +; PROFILE-COM: block-frequency-info: and + ; PROFILE-COM: - entry: {{.*}} count = 10 + ; PROFILE-COM: - loop_begin: {{.*}} count = 10010 + ; PROFILE-COM: - do_something: {{.*}} count = 10000 + ; PROFILE-COM: - loop_exit: {{.*}} 
count = 10 + +; PROFILE-COM: Printing analysis results of BFI for function 'or': +; PROFILE-COM: block-frequency-info: or + ; PROFILE-COM: - entry: {{.*}} count = 10 + ; PROFILE-COM: - entry.split: {{.*}} count = 5 + ; PROFILE-COM: - loop_begin: {{.*}} count = 5 + ; PROFILE-COM: - do_something: {{.*}} count = 0 + ; PROFILE-COM: - loop_exit: {{.*}} count = 5 + ; PROFILE-COM: - loop_exit.split: {{.*}} count = 10 + +; PROFILE-COM: Printing analysis results of BFI for function 'and': +; PROFILE-COM: block-frequency-info: and + ; PROFILE-COM: - entry: {{.*}} count = 10 + ; PROFILE-REF: - entry.split: {{.*}} count = 5 + ; PROFILE-CHK: - entry.split: {{.*}} count = 10 + ; PROFILE-REF: - loop_begin: {{.*}} count = 5005 + ; PROFILE-CHK: - loop_begin: {{.*}} count = 10000 + ; PROFILE-REF: - do_something: {{.*}} count = 5000 + ; PROFILE-CHK: - do_something: {{.*}} count = 9990 + ; PROFILE-REF: - loop_exit: {{.*}} count = 5 + ; PROFILE-CHK: - loop_exit: {{.*}} count = 10 + ; PROFILE-COM: - loop_exit.split: {{.*}} count = 10 From 5a5abf05962a66d1ed682466d36b8cfb235346ff Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Thu, 30 Oct 2025 10:42:36 -0700 Subject: [PATCH 228/539] Move GlobalISel sync up meeting information from "past" to current sync ups. --- llvm/docs/GettingInvolved.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index 4b4b09ad87aba..039d61624093d 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -223,6 +223,10 @@ what to add to your calendar invite. - `ics `__ `gcal `__ - + * - GlobalISel + - Every 2nd Tuesday of the month + - `gcal `__ + - `Meeting details/agenda `__ For event owners, our Discord bot also supports sending automated announcements @@ -254,10 +258,6 @@ the future. 
- `ics `__ `gcal `__ - `Minutes/docs `__ - * - GlobalISel - - Every 2nd Tuesday of the month - - `gcal `__ - - `Meeting details/agenda `__ * - Vector Predication - Every 2 weeks on Tuesdays, 3pm UTC - From f05204f4b2fc5699f6de3fefb8f24818032c4280 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 30 Oct 2025 17:42:28 +0000 Subject: [PATCH 229/539] [lit] Move ulimit_unlimited.txt test to non Darwin tests This fails on MacOS because setting it to unlimited there just sets the limit to the max value which causes differences that show up in the check lines. --- .../ulimit_unlimited.txt | 4 ++-- llvm/utils/lit/tests/shtest-ulimit-nondarwin.py | 8 +++++++- llvm/utils/lit/tests/shtest-ulimit.py | 8 +------- 3 files changed, 10 insertions(+), 10 deletions(-) rename llvm/utils/lit/tests/Inputs/{shtest-ulimit => shtest-ulimit-nondarwin}/ulimit_unlimited.txt (53%) diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_unlimited.txt b/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_unlimited.txt similarity index 53% rename from llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_unlimited.txt rename to llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_unlimited.txt index b8aa3d5071712..4c687e3061869 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_unlimited.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit-nondarwin/ulimit_unlimited.txt @@ -1,6 +1,6 @@ # RUN: ulimit -f 5 -# RUN: %{python} %S/print_limits.py +# RUN: %{python} %S/../shtest-ulimit/print_limits.py # RUN: ulimit -f unlimited -# RUN: %{python} %S/print_limits.py +# RUN: %{python} %S/../shtest-ulimit/print_limits.py # Fail the test so that we can assert on the output. 
# RUN: not echo return diff --git a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py index 022e8b5f41892..893270ec68f68 100644 --- a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py +++ b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py @@ -6,10 +6,16 @@ # RUN: not %{lit} -a -v %{inputs}/shtest-ulimit-nondarwin | FileCheck %s -# CHECK: -- Testing: 1 tests{{.*}} +# CHECK: -- Testing: 2 tests{{.*}} # CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_okay.txt ({{[^)]*}}) # CHECK: ulimit -v 1048576 # CHECK: ulimit -s 256 # CHECK: RLIMIT_AS=1073741824 # CHECK: RLIMIT_STACK=262144 + +# CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_unlimited.txt ({{[^)]*}}) +# CHECK: ulimit -f 5 +# CHECK: RLIMIT_FSIZE=5 +# CHECK: ulimit -f unlimited +# CHECK: RLIMIT_FSIZE=-1 diff --git a/llvm/utils/lit/tests/shtest-ulimit.py b/llvm/utils/lit/tests/shtest-ulimit.py index e15e190920308..21e5a5e2491d1 100644 --- a/llvm/utils/lit/tests/shtest-ulimit.py +++ b/llvm/utils/lit/tests/shtest-ulimit.py @@ -11,7 +11,7 @@ # RUN: not %{lit} -a -v %{inputs}/shtest-ulimit --order=lexical \ # RUN: | FileCheck -DBASE_NOFILE_LIMIT=%{readfile:%t.nofile_limit} %s -# CHECK: -- Testing: 4 tests{{.*}} +# CHECK: -- Testing: 3 tests{{.*}} # CHECK-LABEL: FAIL: shtest-ulimit :: ulimit-bad-arg.txt ({{[^)]*}}) # CHECK: ulimit -n @@ -25,9 +25,3 @@ # CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_reset.txt ({{[^)]*}}) # CHECK: RLIMIT_NOFILE=[[BASE_NOFILE_LIMIT]] - -# CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_unlimited.txt ({{[^)]*}}) -# CHECK: ulimit -f 5 -# CHECK: RLIMIT_FSIZE=5 -# CHECK: ulimit -f unlimited -# CHECK: RLIMIT_FSIZE=-1 From 14b59ed73b9e7138e6258e5e5cbb3d1064d95727 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 30 Oct 2025 10:31:53 -0700 Subject: [PATCH 230/539] [RISCV] Adjust stackmaps test to provide coverage for non-64 bit values --- llvm/test/CodeGen/RISCV/rv64-stackmap.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll index d07f608bf7893..c3183a1a3e036 100644 --- a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll +++ b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll @@ -290,9 +290,9 @@ define void @liveConstant() { ; CHECK-NEXT: .half 2 ; CHECK-NEXT: .half 0 ; CHECK-NEXT: .word -define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27) { +define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 zeroext %l26, i32 signext %l27) { entry: - call void (i64, i32, ptr, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27) + call void (i64, i32, ptr, i32, ...) 
@llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 %l26, i32 %l27) ret void } From 6830fb5eaafa517658278833e260dff6e33228aa Mon Sep 17 00:00:00 2001 From: Ian Anderson Date: Thu, 30 Oct 2025 10:54:14 -0700 Subject: [PATCH 231/539] [clang][docs] assert.h is not a good candidate for a textual header (#165057) The C standard behavior of `assert` cannot be accomplished with clang modules, either as a normal modular header, or a textual header. As a normal modular header: #define NDEBUG #include This pattern doesn't work, NDEBUG has to be passed on the command line to take effect, and then will effect all `assert`s in the includer. As a textual header: #define NDEBUG #include This pattern doesn't work for similar reasons, modular_header_that_has_an_assert.h captured the value of NDEBUG when its module built and won't pick it up from the includer. -DNDEBUG can be passed when building the module, but will similarly effect the entire module. This has the additional problem that every module will contain a declaration for `assert`, which can possibly conflict with each other if they use different values of NDEBUG. So really just doesn't work properly with clang modules. Avoid the issue by not mentioning it in the Modules documentation, and use "X macros" as the example for textual headers. Don't use [extern_c] in the example modules, that should very rarely be used. Don't put multiple `header` declarations in a submodule, that has the confusing effect of "fusing" the headers. e.g. does not include , but if it's in the same submodule, then an `#include ` will mysteriously also include . 
--- clang/docs/Modules.rst | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/clang/docs/Modules.rst b/clang/docs/Modules.rst index acbe45e0be970..e45ee9ff9eac2 100644 --- a/clang/docs/Modules.rst +++ b/clang/docs/Modules.rst @@ -421,13 +421,7 @@ As an example, the module map file for the C standard library might look a bit l .. parsed-literal:: - module std [system] [extern_c] { - module assert { - textual header "assert.h" - header "bits/assert-decls.h" - export * - } - + module std [system] { module complex { header "complex.h" export * @@ -440,7 +434,6 @@ As an example, the module map file for the C standard library might look a bit l module errno { header "errno.h" - header "sys/errno.h" export * } @@ -673,14 +666,14 @@ of checking *use-declaration*\s, and must still be a lexically-valid header file. In the future, we intend to pre-tokenize such headers and include the token sequence within the prebuilt module representation. -A header with the ``exclude`` specifier is excluded from the module. It will not be included when the module is built, nor will it be considered to be part of the module, even if an ``umbrella`` header or directory would otherwise make it part of the module. +A header with the ``exclude`` specifier is excluded from the module. It will not be included when the module is built, nor will it be considered to be part of the module, even if an ``umbrella`` directory would otherwise make it part of the module. -**Example:** The C header ``assert.h`` is an excellent candidate for a textual header, because it is meant to be included multiple times (possibly with different ``NDEBUG`` settings). However, declarations within it should typically be split into a separate modular header. +**Example:** A "X macro" header is an excellent candidate for a textual header, because it is can't be compiled standalone, and by itself does not contain any declarations. .. 
parsed-literal:: - module std [system] { - textual header "assert.h" + module MyLib [system] { + textual header "xmacros.h" } A given header shall not be referenced by more than one *header-declaration*. From 6ad988b04757806cb7dc8867b194c86f25318fa4 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 30 Oct 2025 11:28:11 -0700 Subject: [PATCH 232/539] [lit] Expand late substitutions before running builtins This enables the use of readfile substitutions for populating environment variables. This is necessary in some compiler-rt tests. Reviewers: pawosm-arm Reviewed By: pawosm-arm Pull Request: https://github.com/llvm/llvm-project/pull/165140 --- llvm/utils/lit/lit/TestRunner.py | 7 ++++--- llvm/utils/lit/tests/Inputs/shtest-readfile/env.txt | 6 ++++++ llvm/utils/lit/tests/Inputs/shtest-readfile/lit.cfg | 1 + llvm/utils/lit/tests/shtest-readfile-external.py | 2 +- llvm/utils/lit/tests/shtest-readfile.py | 6 +++++- 5 files changed, 17 insertions(+), 5 deletions(-) create mode 100644 llvm/utils/lit/tests/Inputs/shtest-readfile/env.txt diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 76beebd757a75..3176b1a257434 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -826,6 +826,10 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): not_args = [] not_count = 0 not_crash = False + + # Expand all late substitutions. + args = _expandLateSubstitutions(j, args, cmd_shenv.cwd) + while True: if args[0] == "env": # Create a copy of the global environment and modify it for @@ -875,9 +879,6 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): # Ensure args[0] is hashable. args[0] = expand_glob(args[0], cmd_shenv.cwd)[0] - # Expand all late substitutions. 
- args = _expandLateSubstitutions(j, args, cmd_shenv.cwd) - inproc_builtin = inproc_builtins.get(args[0], None) if inproc_builtin and (args[0] != "echo" or len(cmd.commands) == 1): # env calling an in-process builtin is useless, so we take the safe diff --git a/llvm/utils/lit/tests/Inputs/shtest-readfile/env.txt b/llvm/utils/lit/tests/Inputs/shtest-readfile/env.txt new file mode 100644 index 0000000000000..3e19373754976 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-readfile/env.txt @@ -0,0 +1,6 @@ +## Tests that readfile works with the env builtin. +# RUN: echo -n "hello" > %t.1 +# RUN: env TEST=%{readfile:%t.1} %{python} -c "import os; print(os.environ['TEST'])" + +## Fail the test so we can assert on the output. +# RUN: not echo return \ No newline at end of file diff --git a/llvm/utils/lit/tests/Inputs/shtest-readfile/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-readfile/lit.cfg index ee496674fdb62..80af27f57d35c 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-readfile/lit.cfg +++ b/llvm/utils/lit/tests/Inputs/shtest-readfile/lit.cfg @@ -10,6 +10,7 @@ use_lit_shell = lit.util.pythonize_bool(lit_shell_env) config.test_format = lit.formats.ShTest(execute_external=not use_lit_shell) config.test_source_root = None config.test_exec_root = None +config.substitutions.append(("%{python}", '"%s"' % (sys.executable))) # If we are testing with the external shell, remove the fake-externals from # PATH so that we use mkdir in the tests. 
diff --git a/llvm/utils/lit/tests/shtest-readfile-external.py b/llvm/utils/lit/tests/shtest-readfile-external.py index c00bff45c8703..6fe1088efd674 100644 --- a/llvm/utils/lit/tests/shtest-readfile-external.py +++ b/llvm/utils/lit/tests/shtest-readfile-external.py @@ -6,7 +6,7 @@ # UNSUPPORTED: system-windows # RUN: env LIT_USE_INTERNAL_SHELL=0 not %{lit} -a -v %{inputs}/shtest-readfile | FileCheck -match-full-lines -DTEMP_PATH=%S/Inputs/shtest-readfile/Output %s -# CHECK: -- Testing: 4 tests{{.*}} +# CHECK: -- Testing: 5 tests{{.*}} # CHECK-LABEL: FAIL: shtest-readfile :: absolute-paths.txt ({{[^)]*}}) # CHECK: echo $(cat [[TEMP_PATH]]/absolute-paths.txt.tmp) && test -e [[TEMP_PATH]]/absolute-paths.txt.tmp {{.*}} diff --git a/llvm/utils/lit/tests/shtest-readfile.py b/llvm/utils/lit/tests/shtest-readfile.py index 66e3a042bf787..218da2257bcff 100644 --- a/llvm/utils/lit/tests/shtest-readfile.py +++ b/llvm/utils/lit/tests/shtest-readfile.py @@ -5,12 +5,16 @@ # RUN: env LIT_USE_INTERNAL_SHELL=1 not %{lit} -a -v %{inputs}/shtest-readfile | FileCheck -match-full-lines -DTEMP_PATH=%S%{fs-sep}Inputs%{fs-sep}shtest-readfile%{fs-sep}Output %s -# CHECK: -- Testing: 4 tests{{.*}} +# CHECK: -- Testing: 5 tests{{.*}} # CHECK-LABEL: FAIL: shtest-readfile :: absolute-paths.txt ({{[^)]*}}) # CHECK: echo hello # CHECK: # executed command: echo '%{readfile:[[TEMP_PATH]]{{[\\\/]}}absolute-paths.txt.tmp}' +# CHECK-LABEL: FAIL: shtest-readfile :: env.txt ({{[^)]*}}) +# CHECK: env TEST=hello {{.*}} -c "import os; print(os.environ['TEST'])" +# CHECK: # | hello + # CHECK-LABEL: FAIL: shtest-readfile :: file-does-not-exist.txt ({{[^)]*}}) # CHECK: # executed command: @echo 'echo %{readfile:/file/does/not/exist}' # CHECK: # | File specified in readfile substitution does not exist: {{.*}}/file/does/not/exist From af7fce120f3cd863397cba1ffc4ec3c9ab8e13ff Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 30 Oct 2025 11:28:37 -0700 Subject: [PATCH 233/539] ELF: Rename 
RandomizePaddingSection to PaddingSection. This section type is about to be used by #147424 so let's give it a more generic name. Reviewers: smithp35, MaskRay Reviewed By: MaskRay Pull Request: https://github.com/llvm/llvm-project/pull/155540 --- lld/ELF/SyntheticSections.cpp | 7 +++---- lld/ELF/SyntheticSections.h | 4 ++-- lld/ELF/Writer.cpp | 7 +++---- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index bbf4b29a9fda5..a4150ebfa1653 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -2749,14 +2749,13 @@ RelroPaddingSection::RelroPaddingSection(Ctx &ctx) : SyntheticSection(ctx, ".relro_padding", SHT_NOBITS, SHF_ALLOC | SHF_WRITE, 1) {} -RandomizePaddingSection::RandomizePaddingSection(Ctx &ctx, uint64_t size, - OutputSection *parent) - : SyntheticSection(ctx, ".randomize_padding", SHT_PROGBITS, SHF_ALLOC, 1), +PaddingSection::PaddingSection(Ctx &ctx, uint64_t size, OutputSection *parent) + : SyntheticSection(ctx, ".padding", SHT_PROGBITS, SHF_ALLOC, 1), size(size) { this->parent = parent; } -void RandomizePaddingSection::writeTo(uint8_t *buf) { +void PaddingSection::writeTo(uint8_t *buf) { std::array filler = getParent()->getFiller(ctx); uint8_t *end = buf + size; for (; buf + 4 <= end; buf += 4) diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index ac3ec63f0a7a5..38e68110e4bc0 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -779,11 +779,11 @@ class RelroPaddingSection final : public SyntheticSection { void writeTo(uint8_t *buf) override {} }; -class RandomizePaddingSection final : public SyntheticSection { +class PaddingSection final : public SyntheticSection { uint64_t size; public: - RandomizePaddingSection(Ctx &ctx, uint64_t size, OutputSection *parent); + PaddingSection(Ctx &ctx, uint64_t size, OutputSection *parent); size_t getSize() const override { return size; } void writeTo(uint8_t *buf) 
override; }; diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 4fa80397cbfa7..083b4fb1dbd22 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -1495,15 +1495,14 @@ static void randomizeSectionPadding(Ctx &ctx) { if (auto *isd = dyn_cast(bc)) { SmallVector tmp; if (os->ptLoad != curPtLoad) { - tmp.push_back(make( - ctx, g() % ctx.arg.maxPageSize, os)); + tmp.push_back( + make(ctx, g() % ctx.arg.maxPageSize, os)); curPtLoad = os->ptLoad; } for (InputSection *isec : isd->sections) { // Probability of inserting padding is 1 in 16. if (g() % 16 == 0) - tmp.push_back( - make(ctx, isec->addralign, os)); + tmp.push_back(make(ctx, isec->addralign, os)); tmp.push_back(isec); } isd->sections = std::move(tmp); From ff0a87ac2d8c25d133e1f3d2a5fd109ddd821711 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 30 Oct 2025 18:32:33 +0000 Subject: [PATCH 234/539] [LV] Strengthen assert: VPlan0 doesn't have WidenPHIs (NFC) (#165715) VPWidenCanonicalIV and VPBlend recipes are created by VPPredicator, and VPCanonicalIVPHI and VPInstruction recipes are created by VPlanConstruction. WidenPHIs are never created. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8ebc108080271..505fb435e91e6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8340,11 +8340,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( &R) || (isa(&R) && !UnderlyingValue)) continue; - - // FIXME: VPlan0, which models a copy of the original scalar loop, should - // not use VPWidenPHIRecipe to model the phis. - assert((isa(&R) || isa(&R)) && - UnderlyingValue && "unsupported recipe"); + assert(isa(&R) && UnderlyingValue && "unsupported recipe"); // TODO: Gradually replace uses of underlying instruction by analyses on // VPlan. 
From f658f383e9a114bab0cca275fd8ac5d0efc88cae Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 30 Oct 2025 18:34:05 +0000 Subject: [PATCH 235/539] [lldb][test] Fix typo in lldb-dap skip for Arm 32-bit Fixes 17dbd8690e36f8e514fb47f4418f78420d0fc019 (again) --- .../Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index 97c7f2d9e1b4a..405e91fc2dc36 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -16,7 +16,7 @@ # https://github.com/llvm/llvm-project/issues/137660 @skipIf(oslist=["windows"], archs=["aarch64"]) # The Arm Linux bot needs stable resources before it can run these tests reliably. -@skipif(oslist=["linux"], archs=["arm$"]) +@skipIf(oslist=["linux"], archs=["arm$"]) class DAPTestCaseBase(TestBase): # set timeout based on whether ASAN was enabled or not. Increase # timeout by a factor of 10 if ASAN is enabled. From a812b38c7a0a3c9aadadad72842496edc976f258 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 30 Oct 2025 11:49:22 -0700 Subject: [PATCH 236/539] [libc][hdrgen] Sort identifiers with leading underscores specially (#165745) This makes the sorting behavior more uniform: functions and macros are always sorted (separately), not only when merging. This changes the sort order used for functions and other things sorted by their symbol names. Symbols are sorted alphabetically without regard to leading underscores, and then for identifiers that differ only in the number of leading underscores, the fewer underscores the earlier in the sort order. For the functions declared in a generated header, adjacent names with and without underscores will be grouped together without blank lines. 
This is implemented by factoring the name field, equality, and sorting support out of the various entity classes into a new common superclass (hdrgen.Symbol). This uncovered YAML's requirement to quote the string "NULL" to avoid pyyaml parsing it as None (equivalent to Javascript null) rather than a string. --- libc/include/locale.yaml | 2 +- libc/include/stdio.yaml | 2 +- libc/include/stdlib.yaml | 2 +- libc/include/string.yaml | 2 +- libc/include/time.yaml | 2 +- libc/include/wchar.yaml | 8 ++-- libc/utils/hdrgen/hdrgen/enumeration.py | 16 ++------ libc/utils/hdrgen/hdrgen/function.py | 16 ++------ libc/utils/hdrgen/hdrgen/header.py | 14 +++++-- libc/utils/hdrgen/hdrgen/macro.py | 16 ++------ libc/utils/hdrgen/hdrgen/main.py | 1 + libc/utils/hdrgen/hdrgen/object.py | 16 ++------ libc/utils/hdrgen/hdrgen/symbol.py | 41 +++++++++++++++++++ libc/utils/hdrgen/hdrgen/type.py | 20 +++------ .../hdrgen/tests/expected_output/sorting.h | 24 +++++++++++ libc/utils/hdrgen/tests/input/sorting.yaml | 20 +++++++++ libc/utils/hdrgen/tests/test_integration.py | 7 ++++ 17 files changed, 129 insertions(+), 80 deletions(-) create mode 100644 libc/utils/hdrgen/hdrgen/symbol.py create mode 100644 libc/utils/hdrgen/tests/expected_output/sorting.h create mode 100644 libc/utils/hdrgen/tests/input/sorting.yaml diff --git a/libc/include/locale.yaml b/libc/include/locale.yaml index 4566984ad83af..3c3998eb07aa4 100644 --- a/libc/include/locale.yaml +++ b/libc/include/locale.yaml @@ -1,7 +1,7 @@ header: locale.h header_template: locale.h.def macros: - - macro_name: NULL + - macro_name: "NULL" macro_header: null-macro.h types: - type_name: locale_t diff --git a/libc/include/stdio.yaml b/libc/include/stdio.yaml index 394437ba3bbcd..c50b4ecb0bf08 100644 --- a/libc/include/stdio.yaml +++ b/libc/include/stdio.yaml @@ -1,7 +1,7 @@ header: stdio.h header_template: stdio.h.def macros: - - macro_name: NULL + - macro_name: "NULL" macro_header: null-macro.h - macro_name: stdout macro_value: stdout diff 
--git a/libc/include/stdlib.yaml b/libc/include/stdlib.yaml index 3b2ff13c684b1..495eb7e1317b6 100644 --- a/libc/include/stdlib.yaml +++ b/libc/include/stdlib.yaml @@ -5,7 +5,7 @@ standards: merge_yaml_files: - stdlib-malloc.yaml macros: - - macro_name: NULL + - macro_name: "NULL" macro_header: null-macro.h types: - type_name: __atexithandler_t diff --git a/libc/include/string.yaml b/libc/include/string.yaml index 0bf297ee747a4..22010f4afa812 100644 --- a/libc/include/string.yaml +++ b/libc/include/string.yaml @@ -2,7 +2,7 @@ header: string.h standards: - stdc macros: - - macro_name: NULL + - macro_name: "NULL" macro_header: null-macro.h types: - type_name: locale_t diff --git a/libc/include/time.yaml b/libc/include/time.yaml index 2f8024298fad1..88e50d1288238 100644 --- a/libc/include/time.yaml +++ b/libc/include/time.yaml @@ -1,7 +1,7 @@ header: time.h header_template: time.h.def macros: - - macro_name: NULL + - macro_name: "NULL" macro_header: null-macro.h types: - type_name: struct_timeval diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index b8a0a748cd3ad..c8b9e21b56b28 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -1,7 +1,7 @@ header: wchar.h header_template: wchar.h.def macros: - - macro_name: NULL + - macro_name: "NULL" macro_header: null-macro.h types: - type_name: FILE @@ -188,8 +188,8 @@ functions: standards: - stdc return_type: wchar_t * - arguments: - - type: wchar_t *__restrict + arguments: + - type: wchar_t *__restrict - type: const wchar_t *__restrict - type: size_t - name: wmemmove @@ -212,7 +212,7 @@ functions: standards: - stdc return_type: wchar_t * - arguments: + arguments: - type: wchar_t *__restrict - type: const wchar_t *__restrict - name: wcslcat diff --git a/libc/utils/hdrgen/hdrgen/enumeration.py b/libc/utils/hdrgen/hdrgen/enumeration.py index 198720826720c..1e0f64aec1eda 100644 --- a/libc/utils/hdrgen/hdrgen/enumeration.py +++ b/libc/utils/hdrgen/hdrgen/enumeration.py @@ -6,24 +6,14 @@ # # 
==-------------------------------------------------------------------------==# -from functools import total_ordering +from hdrgen.symbol import Symbol -@total_ordering -class Enumeration: +class Enumeration(Symbol): def __init__(self, name, value): - self.name = name + super().__init__(name) self.value = value - def __eq__(self, other): - return self.name == other.name - - def __lt__(self, other): - return self.name < other.name - - def __hash__(self): - return self.name.__hash__() - def __str__(self): if self.value != None: return f"{self.name} = {self.value}" diff --git a/libc/utils/hdrgen/hdrgen/function.py b/libc/utils/hdrgen/hdrgen/function.py index f039996584e31..4de3406cc408e 100644 --- a/libc/utils/hdrgen/hdrgen/function.py +++ b/libc/utils/hdrgen/hdrgen/function.py @@ -7,7 +7,7 @@ # ==-------------------------------------------------------------------------==# import re -from functools import total_ordering +from hdrgen.symbol import Symbol from hdrgen.type import Type @@ -37,14 +37,13 @@ NONIDENTIFIER = re.compile("[^a-zA-Z0-9_]+") -@total_ordering -class Function: +class Function(Symbol): def __init__( self, return_type, name, arguments, standards, guard=None, attributes=[] ): + super().__init__(name) assert return_type self.return_type = return_type - self.name = name self.arguments = [ arg if isinstance(arg, str) else arg["type"] for arg in arguments ] @@ -53,15 +52,6 @@ def __init__( self.guard = guard self.attributes = attributes or [] - def __eq__(self, other): - return self.name == other.name - - def __lt__(self, other): - return self.name < other.name - - def __hash__(self): - return self.name.__hash__() - def signature_types(self): def collapse(type_string): assert type_string diff --git a/libc/utils/hdrgen/hdrgen/header.py b/libc/utils/hdrgen/hdrgen/header.py index 558ee58469207..f592327f06ad6 100644 --- a/libc/utils/hdrgen/hdrgen/header.py +++ b/libc/utils/hdrgen/hdrgen/header.py @@ -147,8 +147,8 @@ def includes(self): } | { 
COMPILER_HEADER_TYPES.get( - typ.type_name, - PurePosixPath("llvm-libc-types") / f"{typ.type_name}.h", + typ.name, + PurePosixPath("llvm-libc-types") / f"{typ.name}.h", ) for typ in self.all_types() } @@ -227,7 +227,7 @@ def relpath(file): ) ] - for macro in self.macros: + for macro in sorted(self.macros): # When there is nothing to define, the Macro object converts to str # as an empty string. Don't emit a blank line for those cases. if str(macro): @@ -242,7 +242,12 @@ def relpath(file): content.append("\n__BEGIN_C_DECLS\n") current_guard = None - for function in self.functions: + last_name = None + for function in sorted(self.functions): + # If the last function's name was the same after underscores, + # elide the blank line between the declarations. + if last_name == function.name_without_underscores(): + content.pop() if function.guard == None and current_guard == None: content.append(str(function) + " __NOEXCEPT;") content.append("") @@ -264,6 +269,7 @@ def relpath(file): content.append(f"#ifdef {current_guard}") content.append(str(function) + " __NOEXCEPT;") content.append("") + last_name = function.name_without_underscores() if current_guard != None: content.pop() content.append(f"#endif // {current_guard}") diff --git a/libc/utils/hdrgen/hdrgen/macro.py b/libc/utils/hdrgen/hdrgen/macro.py index e42e82845694d..4664d9fb00494 100644 --- a/libc/utils/hdrgen/hdrgen/macro.py +++ b/libc/utils/hdrgen/hdrgen/macro.py @@ -6,25 +6,15 @@ # # ==-------------------------------------------------------------------------==# -from functools import total_ordering +from hdrgen.symbol import Symbol -@total_ordering -class Macro: +class Macro(Symbol): def __init__(self, name, value=None, header=None): - self.name = name + super().__init__(name) self.value = value self.header = header - def __eq__(self, other): - return self.name == other.name - - def __lt__(self, other): - return self.name < other.name - - def __hash__(self): - return self.name.__hash__() - def __str__(self): if 
self.header != None: return "" diff --git a/libc/utils/hdrgen/hdrgen/main.py b/libc/utils/hdrgen/hdrgen/main.py index 25df41e506a1f..c12e89ef771d1 100755 --- a/libc/utils/hdrgen/hdrgen/main.py +++ b/libc/utils/hdrgen/hdrgen/main.py @@ -105,6 +105,7 @@ def merge_from(paths): return 2 header.merge(merge_from_header) + assert header.name, f"`header: name.h` line is required in {yaml_file}" return header if args.json: diff --git a/libc/utils/hdrgen/hdrgen/object.py b/libc/utils/hdrgen/hdrgen/object.py index a311c37168d60..a2ab496bed013 100644 --- a/libc/utils/hdrgen/hdrgen/object.py +++ b/libc/utils/hdrgen/hdrgen/object.py @@ -6,23 +6,13 @@ # # ==-------------------------------------------------------------------------==# -from functools import total_ordering +from hdrgen.symbol import Symbol -@total_ordering -class Object: +class Object(Symbol): def __init__(self, name, type): - self.name = name + super().__init__(name) self.type = type - def __eq__(self, other): - return self.name == other.name - - def __lt__(self, other): - return self.name < other.name - - def __hash__(self): - return self.name.__hash__() - def __str__(self): return f"extern {self.type} {self.name};" diff --git a/libc/utils/hdrgen/hdrgen/symbol.py b/libc/utils/hdrgen/hdrgen/symbol.py new file mode 100644 index 0000000000000..28e9def128e47 --- /dev/null +++ b/libc/utils/hdrgen/hdrgen/symbol.py @@ -0,0 +1,41 @@ +# ====-- Symbol class for libc function headers----------------*- python -*--==# +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ==-------------------------------------------------------------------------==# + +from functools import total_ordering + + +@total_ordering +class Symbol: + """ + Symbol is the common superclass for each kind of entity named by an + identifier. 
It provides the name field, and defines sort ordering, + hashing, and equality based only on the name. The sorting is pretty + presentation order for identifiers, which is to say it first sorts + lexically but ignores leading underscores and secondarily sorts with the + fewest underscores first. + """ + + def __init__(self, name): + assert name + self.name = name + + def __eq__(self, other): + return self.name == other.name + + def __hash__(self): + return self.name.__hash__() + + def name_without_underscores(self): + return self.name.lstrip("_") + + def name_sort_key(self): + ident = self.name_without_underscores() + return ident, len(self.name) - len(ident) + + def __lt__(self, other): + return self.name_sort_key() < other.name_sort_key() diff --git a/libc/utils/hdrgen/hdrgen/type.py b/libc/utils/hdrgen/hdrgen/type.py index 0c0af8569c61e..20c1881a9379a 100644 --- a/libc/utils/hdrgen/hdrgen/type.py +++ b/libc/utils/hdrgen/hdrgen/type.py @@ -6,20 +6,10 @@ # # ==-------------------------------------------------------------------------==# -from functools import total_ordering +from hdrgen.symbol import Symbol -@total_ordering -class Type: - def __init__(self, type_name): - assert type_name - self.type_name = type_name - - def __eq__(self, other): - return self.type_name == other.type_name - - def __lt__(self, other): - return self.type_name < other.type_name - - def __hash__(self): - return self.type_name.__hash__() +class Type(Symbol): + # A type so far carries no specific information beyond its name. + def __init__(self, name): + super().__init__(name) diff --git a/libc/utils/hdrgen/tests/expected_output/sorting.h b/libc/utils/hdrgen/tests/expected_output/sorting.h new file mode 100644 index 0000000000000..a091a421b2c3f --- /dev/null +++ b/libc/utils/hdrgen/tests/expected_output/sorting.h @@ -0,0 +1,24 @@ +//===-- Standard C header --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef _LLVM_LIBC_SORTING_H +#define _LLVM_LIBC_SORTING_H + +#include "__llvm-libc-common.h" + +__BEGIN_C_DECLS + +void func_with_aliases(int) __NOEXCEPT; +void _func_with_aliases(int) __NOEXCEPT; +void __func_with_aliases(int) __NOEXCEPT; + +void gunk(const char *) __NOEXCEPT; + +__END_C_DECLS + +#endif // _LLVM_LIBC_SORTING_H diff --git a/libc/utils/hdrgen/tests/input/sorting.yaml b/libc/utils/hdrgen/tests/input/sorting.yaml new file mode 100644 index 0000000000000..3c26cde9e6c41 --- /dev/null +++ b/libc/utils/hdrgen/tests/input/sorting.yaml @@ -0,0 +1,20 @@ +header: sorting.h +standards: + - stdc +functions: + - name: gunk + return_type: void + arguments: + - type: const char * + - name: _func_with_aliases + return_type: void + arguments: + - type: int + - name: func_with_aliases + return_type: void + arguments: + - type: int + - name: __func_with_aliases + return_type: void + arguments: + - type: int diff --git a/libc/utils/hdrgen/tests/test_integration.py b/libc/utils/hdrgen/tests/test_integration.py index c6e76d826a3a4..b975d8ff007b1 100644 --- a/libc/utils/hdrgen/tests/test_integration.py +++ b/libc/utils/hdrgen/tests/test_integration.py @@ -75,6 +75,13 @@ def test_generate_json(self): self.compare_files(output_file, expected_output_file) + def test_sorting(self): + yaml_file = self.source_dir / "input" / "sorting.yaml" + expected_output_file = self.source_dir / "expected_output" / "sorting.h" + output_file = self.output_dir / "sorting.h" + self.run_script(yaml_file, output_file) + self.compare_files(output_file, expected_output_file) + def main(): parser = argparse.ArgumentParser(description="TestHeaderGenIntegration arguments") From d698df9bbc9f69df0bf4b9256fc897e80ece82ae Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 30 Oct 2025 11:58:41 
-0700 Subject: [PATCH 237/539] [ASan] Make tests work with internal shell Some minor adjustments around environment variables to make a handful of tests work with the internal shell that did not before. Reviewers: fmayer, alexander-shaposhnikov Reviewed By: fmayer, alexander-shaposhnikov Pull Request: https://github.com/llvm/llvm-project/pull/165141 --- compiler-rt/test/asan/TestCases/log-path_test.cpp | 3 ++- compiler-rt/test/asan/TestCases/scariness_score_test.cpp | 4 ++-- compiler-rt/test/asan/lit.cfg.py | 3 +++ compiler-rt/test/lit.common.cfg.py | 2 ++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/compiler-rt/test/asan/TestCases/log-path_test.cpp b/compiler-rt/test/asan/TestCases/log-path_test.cpp index 3c5ca114cfd71..6875d57c43cc0 100644 --- a/compiler-rt/test/asan/TestCases/log-path_test.cpp +++ b/compiler-rt/test/asan/TestCases/log-path_test.cpp @@ -25,7 +25,8 @@ // RUN: FileCheck %s --check-prefix=CHECK-BAD-DIR < %t.out // Too long log_path. -// RUN: %env_asan_opts=log_path=`for((i=0;i<10000;i++)); do echo -n $i; done` \ +// RUN: %python -c "for i in range(0, 10000): print(i, end='')" > %t.long_log_path +// RUN: %env_asan_opts=log_path=%{readfile:%t.long_log_path} \ // RUN: not %run %t 2> %t.out // RUN: FileCheck %s --check-prefix=CHECK-LONG < %t.out diff --git a/compiler-rt/test/asan/TestCases/scariness_score_test.cpp b/compiler-rt/test/asan/TestCases/scariness_score_test.cpp index 9e55e33675fde..5d229cf383648 100644 --- a/compiler-rt/test/asan/TestCases/scariness_score_test.cpp +++ b/compiler-rt/test/asan/TestCases/scariness_score_test.cpp @@ -6,7 +6,7 @@ // RUN: %clangxx_asan -O0 -mllvm -asan-use-stack-safety=0 %s -o %t // On OSX and Windows, alloc_dealloc_mismatch=1 isn't 100% reliable, so it's // off by default. It's safe for these tests, though, so we turn it on. 
-// RUN: export %env_asan_opts=symbolize=0:detect_stack_use_after_return=1:handle_abort=1:print_scariness=1:alloc_dealloc_mismatch=1 +// RUN: %export_asan_opts=symbolize=0:detect_stack_use_after_return=1:handle_abort=1:print_scariness=1:alloc_dealloc_mismatch=1 // Make sure the stack is limited (may not be the default under GNU make) // RUN: ulimit -s 4096 // RUN: not %run %t 1 2>&1 | FileCheck %s --check-prefix=CHECK1 @@ -41,7 +41,7 @@ // RUN: %clangxx_asan -O0 %s -o %t -fsanitize-address-use-after-return=always -mllvm -asan-use-stack-safety=0 // On OSX and Windows, alloc_dealloc_mismatch=1 isn't 100% reliable, so it's // off by default. It's safe for these tests, though, so we turn it on. -// RUN: export %env_asan_opts=symbolize=0:handle_abort=1:print_scariness=1:alloc_dealloc_mismatch=1 +// RUN: %export_asan_opts=symbolize=0:handle_abort=1:print_scariness=1:alloc_dealloc_mismatch=1 // Make sure the stack is limited (may not be the default under GNU make) // RUN: ulimit -s 4096 // RUN: not %run %t 1 2>&1 | FileCheck %s --check-prefix=CHECK1 diff --git a/compiler-rt/test/asan/lit.cfg.py b/compiler-rt/test/asan/lit.cfg.py index 96201e679b0a3..0194c720d003b 100644 --- a/compiler-rt/test/asan/lit.cfg.py +++ b/compiler-rt/test/asan/lit.cfg.py @@ -41,6 +41,9 @@ def get_required_attr(config, attr_name): config.substitutions.append( ("%env_asan_opts=", "env ASAN_OPTIONS=" + default_asan_opts_str) ) +config.substitutions.append( + ("%export_asan_opts=", "export ASAN_OPTIONS=" + default_asan_opts_str) +) # Setup source root. config.test_source_root = os.path.dirname(__file__) diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index 8d147055293ed..9d2f02189b8bd 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -1066,3 +1066,5 @@ def target_page_size(): # llvm. 
config.substitutions.append(("%crt_src", config.compiler_rt_src_root)) config.substitutions.append(("%llvm_src", config.llvm_src_root)) + +config.substitutions.append(("%python", '"%s"' % (sys.executable))) From f9619826f097c37ff9d7e0ddf03ec5b9dbe63be5 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 30 Oct 2025 12:05:01 -0700 Subject: [PATCH 238/539] [Support] Simplify the continuation condition in encodeSLEB128 (NFC) (#165651) The boolean expression to determine if more bytes are needed for a signed LEB128 value is quite complex: !((((Value == 0 ) && ((Byte & 0x40) == 0)) || ((Value == -1) && ((Byte & 0x40) != 0)))) This patch simplifies it to an equivalent expression using a ternary operator, which is much easier to understand. --- llvm/include/llvm/Support/LEB128.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/Support/LEB128.h b/llvm/include/llvm/Support/LEB128.h index 898b4ea1f19ab..4e2262fb15c56 100644 --- a/llvm/include/llvm/Support/LEB128.h +++ b/llvm/include/llvm/Support/LEB128.h @@ -29,8 +29,7 @@ inline unsigned encodeSLEB128(int64_t Value, raw_ostream &OS, uint8_t Byte = Value & 0x7f; // NOTE: this assumes that this signed shift is an arithmetic right shift. Value >>= 7; - More = !((((Value == 0 ) && ((Byte & 0x40) == 0)) || - ((Value == -1) && ((Byte & 0x40) != 0)))); + More = Value != ((Byte & 0x40) ? -1 : 0); Count++; if (More || Count < PadTo) Byte |= 0x80; // Mark this byte to show that more bytes will follow. @@ -58,8 +57,7 @@ inline unsigned encodeSLEB128(int64_t Value, uint8_t *p, unsigned PadTo = 0) { uint8_t Byte = Value & 0x7f; // NOTE: this assumes that this signed shift is an arithmetic right shift. Value >>= 7; - More = !((((Value == 0 ) && ((Byte & 0x40) == 0)) || - ((Value == -1) && ((Byte & 0x40) != 0)))); + More = Value != ((Byte & 0x40) ? -1 : 0); Count++; if (More || Count < PadTo) Byte |= 0x80; // Mark this byte to show that more bytes will follow. 
From 3ec8b71e0a51ac05977bdf080ee3b8cb2b22542a Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 30 Oct 2025 12:05:10 -0700 Subject: [PATCH 239/539] [Hexagon] Use default member initializations (NFC) (#165653) Identified with modernize-use-default-member-init. --- llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp | 6 +++--- llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp | 8 ++++---- llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp index 3b810d0b65fab..79863e1c3cb74 100644 --- a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp @@ -34,7 +34,7 @@ class HexagonCopyHoisting : public MachineFunctionPass { public: static char ID; - HexagonCopyHoisting() : MachineFunctionPass(ID), MFN(nullptr), MRI(nullptr) {} + HexagonCopyHoisting() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "Hexagon Copy Hoisting"; } @@ -56,8 +56,8 @@ class HexagonCopyHoisting : public MachineFunctionPass { void moveCopyInstr(MachineBasicBlock *DestBB, std::pair Key, MachineInstr *MI); - MachineFunction *MFN; - MachineRegisterInfo *MRI; + MachineFunction *MFN = nullptr; + MachineRegisterInfo *MRI = nullptr; std::vector, MachineInstr *>> CopyMIList; }; diff --git a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp index 93418f7e15e8d..a10c93704a85b 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp @@ -34,13 +34,13 @@ STATISTIC(HexagonNumStoreAbsConversions, namespace { class HexagonGenMemAbsolute : public MachineFunctionPass { - const HexagonInstrInfo *TII; - MachineRegisterInfo *MRI; - const TargetRegisterInfo *TRI; + const HexagonInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; + const TargetRegisterInfo *TRI = 
nullptr; public: static char ID; - HexagonGenMemAbsolute() : MachineFunctionPass(ID), TII(0), MRI(0), TRI(0) {} + HexagonGenMemAbsolute() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "Hexagon Generate Load/Store Set Absolute Address Instruction"; diff --git a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp index 71bdfc6657c57..5a85f348fdaf7 100644 --- a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp @@ -43,7 +43,7 @@ namespace { class HexagonTfrCleanup : public MachineFunctionPass { public: static char ID; - HexagonTfrCleanup() : MachineFunctionPass(ID), HII(0), TRI(0) {} + HexagonTfrCleanup() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "Hexagon TFR Cleanup"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); @@ -52,8 +52,8 @@ class HexagonTfrCleanup : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) override; private: - const HexagonInstrInfo *HII; - const TargetRegisterInfo *TRI; + const HexagonInstrInfo *HII = nullptr; + const TargetRegisterInfo *TRI = nullptr; typedef DenseMap ImmediateMap; From 3e576806c4a343647351f65be571e17e84df8648 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 30 Oct 2025 12:05:19 -0700 Subject: [PATCH 240/539] [llvm] Proofread HowToCrossCompileBuiltinsOnArm.rst (#165655) --- llvm/docs/HowToCrossCompileBuiltinsOnArm.rst | 30 ++++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/llvm/docs/HowToCrossCompileBuiltinsOnArm.rst b/llvm/docs/HowToCrossCompileBuiltinsOnArm.rst index d7759ad8edd06..58599404d5cd4 100644 --- a/llvm/docs/HowToCrossCompileBuiltinsOnArm.rst +++ b/llvm/docs/HowToCrossCompileBuiltinsOnArm.rst @@ -8,18 +8,18 @@ Introduction This document contains information about building and testing the builtins part of compiler-rt for an Arm target, from an x86_64 Linux machine. 
-While this document concentrates on Arm and Linux the general principles should +While this document concentrates on Arm and Linux, the general principles should apply to other targets supported by compiler-rt. Further contributions for other targets are welcome. The instructions in this document depend on libraries and programs external to -LLVM, there are many ways to install and configure these dependencies so you +LLVM. There are many ways to install and configure these dependencies, so you may need to adapt the instructions here to fit your own situation. Prerequisites ============= -In this use case we will be using cmake on a Debian-based Linux system, +In this use case, we will be using cmake on a Debian-based Linux system, cross-compiling from an x86_64 host to a hard-float Armv7-A target. We will be using as many of the LLVM tools as we can, but it is possible to use GNU equivalents. @@ -35,7 +35,7 @@ You will need: An existing sysroot is required because some of the builtins include C library headers and a sysroot is the easiest way to get those. -In this example we will be using ``ninja`` as the build tool. +In this example, we will be using ``ninja`` as the build tool. See https://compiler-rt.llvm.org/ for information about the dependencies on clang and LLVM. @@ -46,7 +46,7 @@ the source for LLVM and compiler-rt. ``qemu-arm`` should be available as a package for your Linux distribution. The most complicated of the prerequisites to satisfy is the ``arm-linux-gnueabihf`` -sysroot. In theory it is possible to use the Linux distributions multiarch +sysroot. In theory, it is possible to use the Linux distributions multiarch support to fulfill the dependencies for building but unfortunately due to ``/usr/local/include`` being added some host includes are selected. @@ -153,7 +153,7 @@ The cmake try compile stage fails At an early stage cmake will attempt to compile and link a simple C program to test if the toolchain is working. 
-This stage can often fail at link time if the ``--sysroot=``, ``--target`` or +This stage can often fail at link time if the ``--sysroot=``, ``--target``, or ``--gcc-toolchain=`` options are not passed to the compiler. Check the ``CMAKE__FLAGS`` and ``CMAKE__COMPILER_TARGET`` flags along with any of the specific CMake sysroot and toolchain options. @@ -165,7 +165,7 @@ to make sure it is working. For example:: Clang uses the host header files -------------------------------- -On debian based systems it is possible to install multiarch support for +On Debian-based systems, it is possible to install multiarch support for ``arm-linux-gnueabi`` and ``arm-linux-gnueabihf``. In many cases clang can successfully use this multiarch support when ``--gcc-toolchain=`` and ``--sysroot=`` are not supplied. Unfortunately clang adds ``/usr/local/include`` before @@ -177,8 +177,8 @@ use a separate ``arm-linux-gnueabihf`` toolchain. No target passed to clang ------------------------- -If clang is not given a target it will typically use the host target, this will -not understand the Arm assembly language files resulting in error messages such +If clang is not given a target, it will typically use the host target. This will +not understand the Arm assembly language files, resulting in error messages such as ``error: unknown directive .syntax unified``. You can check the clang invocation in the error message to see if there is no @@ -217,7 +217,7 @@ target to use is: * ``-DCMAKE_C_COMPILER_TARGET=arm-linux-gnueabi`` -Depending on whether you want to use floating point instructions or not you +Depending on whether you want to use floating point instructions or not, you may need extra c-flags such as ``-mfloat-abi=softfp`` for use of floating-point instructions, and ``-mfloat-abi=soft -mfpu=none`` for software floating-point emulation. @@ -241,7 +241,7 @@ To build and test the libraries using a similar method to Armv7-A is possible but more difficult. 
The main problems are: * There is not a ``qemu-arm`` user-mode emulator for bare-metal systems. - ``qemu-system-arm`` can be used but this is significantly more difficult + ``qemu-system-arm`` can be used, but this is significantly more difficult to setup. This document does not explain how to do this. * The targets to compile compiler-rt have the suffix ``-none-eabi``. This uses the BareMetal driver in clang and by default will not find the libraries @@ -252,8 +252,8 @@ that are supported on Armv7-A we can still get most of the value of running the tests using the same ``qemu-arm`` that we used for Armv7-A by building and running the test cases for Armv7-A but using the builtins compiled for Armv6-M, Armv7-M or Armv7E-M. This will test that the builtins can be linked -into a binary and execute the tests correctly but it will not catch if the -builtins use instructions that are supported on Armv7-A but not Armv6-M, +into a binary and execute the tests correctly, but it will not catch if the +builtins use instructions that are supported on Armv7-A but not on Armv6-M, Armv7-M and Armv7E-M. This requires a second ``arm-none-eabi`` toolchain for building the builtins. @@ -321,9 +321,9 @@ command for Armv7-A build and test:: The Armv6-M builtins will use the soft-float ABI. When compiling the tests for Armv7-A we must include ``"-mthumb -mfloat-abi=soft -mfpu=none"`` in the -test-c-flags. We must use an Armv7-A soft-float abi sysroot for ``qemu-arm``. +test-c-flags. We must use an Armv7-A soft-float ABI sysroot for ``qemu-arm``. -Depending on the linker used for the test cases you may encounter BuildAttribute +Depending on the linker used for the test cases, you may encounter BuildAttribute mismatches between the M-profile objects from compiler-rt and the A-profile objects from the test. 
The lld linker does not check the profile BuildAttribute so it can be used to link the tests by adding ``-fuse-ld=lld`` to the From 72b12188f3c17219ce3bf17a628999497f7d975a Mon Sep 17 00:00:00 2001 From: Ahmed Nour Date: Thu, 30 Oct 2025 22:07:00 +0300 Subject: [PATCH 241/539] [Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - allow insertps intrinsic to be used in constexp (#165513) Resolves #165161 --- clang/include/clang/Basic/BuiltinsX86.td | 3 +- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 42 +++++++++++++++++++--- clang/lib/AST/ExprConstant.cpp | 46 ++++++++++++++++++++---- clang/test/CodeGen/X86/sse41-builtins.c | 10 ++++++ 4 files changed, 88 insertions(+), 13 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 500aa85fe5356..9e877b92eac68 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -328,7 +328,6 @@ let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorW } let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">; def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">; def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">; def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">; @@ -342,6 +341,8 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, " + "_Vector<4, float>, _Constant char)">; def ptestz128 : X86Builtin<"int(_Vector<2, long long int>, _Vector<2, long long int>)">; def ptestc128 diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp 
b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index b3ab82da5e01a..8b57b963c538f 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -3411,7 +3411,7 @@ static bool interp__builtin_x86_byteshift( static bool interp__builtin_ia32_shuffle_generic( InterpState &S, CodePtr OpPC, const CallExpr *Call, - llvm::function_ref(unsigned, unsigned)> + llvm::function_ref(unsigned, unsigned)> GetSourceIndex) { assert(Call->getNumArgs() == 3); @@ -3428,8 +3428,19 @@ static bool interp__builtin_ia32_shuffle_generic( for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) { auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); - const Pointer &Src = (SrcVecIdx == 0) ? A : B; - TYPE_SWITCH(ElemT, { Dst.elem(DstIdx) = Src.elem(SrcIdx); }); + + if (SrcIdx < 0) { + // Zero out this element + if (ElemT == PT_Float) { + Dst.elem(DstIdx) = Floating( + S.getASTContext().getFloatTypeSemantics(VecT->getElementType())); + } else { + INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem(DstIdx) = T::from(0); }); + } + } else { + const Pointer &Src = (SrcVecIdx == 0) ? A : B; + TYPE_SWITCH(ElemT, { Dst.elem(DstIdx) = Src.elem(SrcIdx); }); + } } Dst.initializeAllElements(); @@ -4382,7 +4393,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned SrcIdx = ElemInLane >= NumSelectableElems ? 1 : 0; unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits; unsigned Index = (ShuffleMask >> BitIndex) & IndexMask; - return std::pair{SrcIdx, LaneOffset + Index}; + return std::pair{SrcIdx, + static_cast(LaneOffset + Index)}; }); case X86::BI__builtin_ia32_shufpd: case X86::BI__builtin_ia32_shufpd256: @@ -4400,7 +4412,27 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned SrcIdx = ElemInLane >= NumSelectableElems ? 
1 : 0; unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits; unsigned Index = (ShuffleMask >> BitIndex) & IndexMask; - return std::pair{SrcIdx, LaneOffset + Index}; + return std::pair{SrcIdx, + static_cast(LaneOffset + Index)}; + }); + case X86::BI__builtin_ia32_insertps128: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned Mask) { + // Bits [3:0]: zero mask - if bit is set, zero this element + if ((Mask & (1 << DstIdx)) != 0) { + return std::pair{0, -1}; + } + // Bits [7:6]: select element from source vector Y (0-3) + // Bits [5:4]: select destination position (0-3) + unsigned SrcElem = (Mask >> 6) & 0x3; + unsigned DstElem = (Mask >> 4) & 0x3; + if (DstIdx == DstElem) { + // Insert element from source vector (B) at this position + return std::pair{1, static_cast(SrcElem)}; + } else { + // Copy from destination vector (A) + return std::pair{0, static_cast(DstIdx)}; + } }); case X86::BI__builtin_ia32_pshufb128: case X86::BI__builtin_ia32_pshufb256: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index d0404b957ab03..97eeba8b9d6cc 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11621,7 +11621,7 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, static bool evalShuffleGeneric( EvalInfo &Info, const CallExpr *Call, APValue &Out, - llvm::function_ref(unsigned, unsigned)> + llvm::function_ref(unsigned, unsigned)> GetSourceIndex) { const auto *VT = Call->getType()->getAs(); @@ -11644,8 +11644,16 @@ static bool evalShuffleGeneric( for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) { auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); - const APValue &Src = (SrcVecIdx == 0) ? 
A : B; - ResultElements.push_back(Src.getVectorElt(SrcIdx)); + + if (SrcIdx < 0) { + // Zero out this element + QualType ElemTy = VT->getElementType(); + ResultElements.push_back( + APValue(APFloat::getZero(Info.Ctx.getFloatTypeSemantics(ElemTy)))); + } else { + const APValue &Src = (SrcVecIdx == 0) ? A : B; + ResultElements.push_back(Src.getVectorElt(SrcIdx)); + } } Out = APValue(ResultElements.data(), ResultElements.size()); @@ -12438,7 +12446,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (!evalShuffleGeneric( Info, E, R, [](unsigned DstIdx, - unsigned ShuffleMask) -> std::pair { + unsigned ShuffleMask) -> std::pair { constexpr unsigned LaneBits = 128u; unsigned NumElemPerLane = LaneBits / 32; unsigned NumSelectableElems = NumElemPerLane / 2; @@ -12451,7 +12459,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits; unsigned SrcIdx = (ElemInLane < NumSelectableElems) ? 0 : 1; unsigned Index = (ShuffleMask >> BitIndex) & IndexMask; - return {SrcIdx, LaneOffset + Index}; + return {SrcIdx, static_cast(LaneOffset + Index)}; })) return false; return Success(R, E); @@ -12463,7 +12471,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (!evalShuffleGeneric( Info, E, R, [](unsigned DstIdx, - unsigned ShuffleMask) -> std::pair { + unsigned ShuffleMask) -> std::pair { constexpr unsigned LaneBits = 128u; unsigned NumElemPerLane = LaneBits / 64; unsigned NumSelectableElems = NumElemPerLane / 2; @@ -12476,7 +12484,31 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits; unsigned SrcIdx = (ElemInLane < NumSelectableElems) ? 
0 : 1; unsigned Index = (ShuffleMask >> BitIndex) & IndexMask; - return {SrcIdx, LaneOffset + Index}; + return {SrcIdx, static_cast(LaneOffset + Index)}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_insertps128: { + APValue R; + if (!evalShuffleGeneric( + Info, E, R, + [](unsigned DstIdx, unsigned Mask) -> std::pair { + // Bits [3:0]: zero mask - if bit is set, zero this element + if ((Mask & (1 << DstIdx)) != 0) { + return {0, -1}; + } + // Bits [7:6]: select element from source vector Y (0-3) + // Bits [5:4]: select destination position (0-3) + unsigned SrcElem = (Mask >> 6) & 0x3; + unsigned DstElem = (Mask >> 4) & 0x3; + if (DstIdx == DstElem) { + // Insert element from source vector (B) at this position + return {1, static_cast(SrcElem)}; + } else { + // Copy from destination vector (A) + return {0, static_cast(DstIdx)}; + } })) return false; return Success(R, E); diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c index 62cd392824bb2..35fa65a99836b 100644 --- a/clang/test/CodeGen/X86/sse41-builtins.c +++ b/clang/test/CodeGen/X86/sse41-builtins.c @@ -307,6 +307,16 @@ __m128 test_mm_insert_ps(__m128 x, __m128 y) { return _mm_insert_ps(x, y, 4); } +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x10), 1.0f, 10.0f, 3.0f, 4.0f))); // Insert Y[0] into X[1] +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x00), 10.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[0] into X[0] +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x20), 1.0f, 2.0f, 10.0f, 4.0f))); // Insert Y[0] into X[2] +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x30), 1.0f, 2.0f, 3.0f, 10.0f))); // 
Insert Y[0] into X[3] +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x80), 30.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[2] into X[0] +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x01), 0.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[0] into X[0], zero X[0] +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x0A), 10.0f, 0.0f, 3.0f, 0.0f))); // Insert Y[0] into X[0], zero X[1] and X[3] +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x0F), 0.0f, 0.0f, 0.0f, 0.0f))); // Insert Y[0] into X[0], zero all +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0xCF), 0.0f, 0.0f, 0.0f, 0.0f))); // Insert Y[3] into X[0], zero all + __m128i test_mm_max_epi8(__m128i x, __m128i y) { // CHECK-LABEL: test_mm_max_epi8 // CHECK: call <16 x i8> @llvm.smax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) From fc28715b4beb9a750c47edd61fc469d51a333967 Mon Sep 17 00:00:00 2001 From: lonely eagle <2020382038@qq.com> Date: Fri, 31 Oct 2025 03:07:21 +0800 Subject: [PATCH 242/539] [mlir][bufferize] Use resolveCallableInTable to cleanup getCalledFunction (NFC) (#165658) Simplify the implementation of `getCalledFunction` using `resolveCallableInTable`. 
--- .../Transforms/FuncBufferizableOpInterfaceImpl.cpp | 7 +------ .../Bufferization/Transforms/OneShotModuleBufferize.cpp | 6 +----- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp index d9d69342e42a8..8655ed3005a93 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp @@ -95,12 +95,7 @@ getBufferizedFunctionArgType(FuncOp funcOp, int64_t index, /// Return the FuncOp called by `callOp`. static FuncOp getCalledFunction(CallOpInterface callOp, SymbolTableCollection &symbolTables) { - SymbolRefAttr sym = - llvm::dyn_cast_if_present(callOp.getCallableForCallee()); - if (!sym) - return nullptr; - return dyn_cast_or_null( - symbolTables.lookupNearestSymbolFrom(callOp, sym)); + return dyn_cast_or_null(callOp.resolveCallableInTable(&symbolTables)); } /// Return the FuncOp called by `callOp`. diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp index aa53f94fe839d..c233e24c2a151 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp @@ -285,12 +285,8 @@ static void removeBufferizationAttributes(BlockArgument bbArg) { static func::FuncOp getCalledFunction(func::CallOp callOp, mlir::SymbolTableCollection &symbolTable) { - SymbolRefAttr sym = - llvm::dyn_cast_if_present(callOp.getCallableForCallee()); - if (!sym) - return nullptr; return dyn_cast_or_null( - symbolTable.lookupNearestSymbolFrom(callOp, sym)); + callOp.resolveCallableInTable(&symbolTable)); } /// Return "true" if the given function signature has tensor semantics. 
From 5b6903070dbf027051363021b81e4fd113d4bc8c Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Thu, 30 Oct 2025 15:10:59 -0400 Subject: [PATCH 243/539] [mlir] Simplify Default cases in type switches. NFC. (#165767) Use default values instead of lambdas when possible. `std::nullopt` and `nullptr` can be used now because of https://github.com/llvm/llvm-project/pull/165724. --- mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 2 +- mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp | 2 +- mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp | 2 +- mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp | 2 +- mlir/lib/Dialect/Arith/IR/ArithOps.cpp | 2 +- mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp | 6 +++--- mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 2 +- mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp | 2 +- mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp | 2 +- mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp | 2 +- mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp | 2 +- .../lib/Dialect/Linalg/Transforms/SimplifyDepthwiseConv.cpp | 2 +- mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp | 2 +- mlir/lib/Dialect/MemRef/Transforms/FlattenMemRefs.cpp | 2 +- mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp | 2 +- mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp | 4 ++-- mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp | 2 +- mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp | 2 +- mlir/lib/Dialect/Tensor/Transforms/RewriteAsConstant.cpp | 2 +- mlir/lib/TableGen/Type.cpp | 2 +- mlir/lib/Target/LLVMIR/DebugTranslation.cpp | 6 +++--- 21 files changed, 26 insertions(+), 26 deletions(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 41e333c621eda..3a307a0756d93 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -935,7 +935,7 @@ static std::optional mfmaTypeSelectCode(Type mlirElemType) { .Case([](Float6E2M3FNType) 
{ return 2u; }) .Case([](Float6E3M2FNType) { return 3u; }) .Case([](Float4E2M1FNType) { return 4u; }) - .Default([](Type) { return std::nullopt; }); + .Default(std::nullopt); } /// If there is a scaled MFMA instruction for the input element types `aType` diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp index 247dba101cfc1..cfdcd9cc2d86d 100644 --- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp +++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp @@ -432,7 +432,7 @@ static Value getOriginalVectorValue(Value value) { current = op.getSource(); return false; }) - .Default([](Operation *) { return false; }); + .Default(false); if (!skipOp) { break; diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp index 25f1e1b184d61..425594b3382f0 100644 --- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp +++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp @@ -259,7 +259,7 @@ struct GPUShuffleConversion final : ConvertOpToLLVMPattern { } return std::nullopt; }) - .Default([](auto) { return std::nullopt; }); + .Default(std::nullopt); } static std::optional getFuncName(gpu::ShuffleMode mode, diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index e2c7d803e5a5e..91c1aa55fdb4e 100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -46,7 +46,7 @@ static bool isZeroConstant(Value val) { [](auto floatAttr) { return floatAttr.getValue().isZero(); }) .Case( [](auto intAttr) { return intAttr.getValue().isZero(); }) - .Default([](auto) { return false; }); + .Default(false); } static LogicalResult storeLoadPreconditions(PatternRewriter &rewriter, diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp index 898d76ce8d9b5..980442efdf708 100644 --- 
a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp +++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp @@ -2751,7 +2751,7 @@ std::optional mlir::arith::getNeutralElement(Operation *op) { .Case([](arith::MaxSIOp op) { return AtomicRMWKind::maxs; }) .Case([](arith::MinSIOp op) { return AtomicRMWKind::mins; }) .Case([](arith::MulIOp op) { return AtomicRMWKind::muli; }) - .Default([](Operation *op) { return std::nullopt; }); + .Default(std::nullopt); if (!maybeKind) { return std::nullopt; } diff --git a/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp b/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp index d2c2138d61638..025d1acf8d6ba 100644 --- a/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp @@ -330,7 +330,7 @@ static Value getBase(Value v) { v = op.getSrc(); return true; }) - .Default([](Operation *) { return false; }); + .Default(false); if (!shouldContinue) break; } @@ -354,7 +354,7 @@ static Value propagatesCapture(Operation *op) { .Case([](memref::TransposeOp transpose) { return transpose.getIn(); }) .Case( [](auto op) { return op.getSrc(); }) - .Default([](Operation *) { return Value(); }); + .Default(nullptr); } /// Returns `true` if the given operation is known to capture the given value, @@ -371,7 +371,7 @@ static std::optional getKnownCapturingStatus(Operation *op, Value v) { // These operations are known not to capture. .Case([](memref::DeallocOp) { return false; }) // By default, we don't know anything. 
- .Default([](Operation *) { return std::nullopt; }); + .Default(std::nullopt); } /// Returns `true` if the value may be captured by any of its users, i.e., if diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 3eae67f4c1f98..2731069d6ef54 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -698,7 +698,7 @@ static void destructureIndices(Type currType, ArrayRef indices, return structType.getBody()[memberIndex]; return nullptr; }) - .Default(Type(nullptr)); + .Default(nullptr); } } diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp index cee943d2d86c6..7d9058c262562 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp @@ -1111,7 +1111,7 @@ memsetCanUsesBeRemoved(MemsetIntr op, const MemorySlot &slot, .Case([](auto type) { return type.getWidth() % 8 == 0 && type.getWidth() > 0; }) - .Default([](Type) { return false; }); + .Default(false); if (!canConvertType) return false; diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp index ac35eea66e9d6..ce93d18f56d39 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp @@ -798,7 +798,7 @@ static bool isCompatibleImpl(Type type, DenseSet &compatibleTypes) { // clang-format on .Case( [](Type type) { return isCompatiblePtrType(type); }) - .Default([](Type) { return false; }); + .Default(false); if (!result) compatibleTypes.erase(type); diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 8b89244486339..b09112bcf0bb7 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -4499,7 +4499,7 @@ DiagnosedSilenceableFailure 
transform::DecomposeWinogradOp::applyToOne( maybeTransformed = decomposeWinogradOutputTransformOp(rewriter, op); return true; }) - .Default([&](Operation *op) { return false; }); + .Default(false); if (!supported) { DiagnosedSilenceableFailure diag = diff --git a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp index f05ffa8334d9c..6519c4f64dd05 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp @@ -322,7 +322,7 @@ promoteSubViews(ImplicitLocOpBuilder &b, tmp = arith::ConstantOp::create(b, IntegerAttr::get(et, 0)); return complex::CreateOp::create(b, t, tmp, tmp); }) - .Default([](auto) { return Value(); }); + .Default(nullptr); if (!fillVal) return failure(); linalg::FillOp::create(b, fillVal, promotionInfo->fullLocalView); diff --git a/mlir/lib/Dialect/Linalg/Transforms/SimplifyDepthwiseConv.cpp b/mlir/lib/Dialect/Linalg/Transforms/SimplifyDepthwiseConv.cpp index 27ccf3c2ba148..6becc1f29afbd 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/SimplifyDepthwiseConv.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/SimplifyDepthwiseConv.cpp @@ -89,7 +89,7 @@ matchAndReplaceDepthwiseConv(Operation *operation, Value input, Value kernel, ValueRange{input, collapsedKernel, iZp, kZp}, ValueRange{collapsedInit}, stride, dilation); }) - .Default([](Operation *op) { return nullptr; }); + .Default(nullptr); if (!newConv) return failure(); for (auto attr : preservedAttrs) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 0f317eac8fa41..cb6199f026e03 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -656,7 +656,7 @@ mlir::linalg::getCombinerOpKind(Operation *combinerOp) { [&](auto op) { return CombiningKind::MUL; }) .Case([&](auto op) { return CombiningKind::OR; }) .Case([&](auto op) { return CombiningKind::XOR; }) - 
.Default([&](auto op) { return std::nullopt; }); + .Default(std::nullopt); } /// Check whether `outputOperand` is a reduction with a single combiner diff --git a/mlir/lib/Dialect/MemRef/Transforms/FlattenMemRefs.cpp b/mlir/lib/Dialect/MemRef/Transforms/FlattenMemRefs.cpp index 1208fddf37e0b..e6850890bf8fe 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/FlattenMemRefs.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/FlattenMemRefs.cpp @@ -104,7 +104,7 @@ static Value getTargetMemref(Operation *op) { vector::MaskedStoreOp, vector::TransferReadOp, vector::TransferWriteOp>( [](auto op) { return op.getBase(); }) - .Default([](auto) { return Value{}; }); + .Default(nullptr); } template diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp index 4ebd90dbcc1d5..d380c46f7fbee 100644 --- a/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp @@ -55,7 +55,7 @@ static bool isShapePreserving(ForOp forOp, int64_t arg) { ? forOp.getInitArgs()[opResult.getResultNumber()] : Value(); }) - .Default([&](auto op) { return Value(); }); + .Default(nullptr); } return false; } diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp index 0c8114d5e957e..938952ed273cd 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp @@ -346,7 +346,7 @@ LogicalResult spirv::CompositeConstructOp::verify() { llvm::TypeSwitch(getType()) .Case( [](auto coopType) { return coopType.getElementType(); }) - .Default([](Type) { return nullptr; }); + .Default(nullptr); // Case 1. -- matrices. 
if (coopElementType) { @@ -1708,7 +1708,7 @@ LogicalResult spirv::MatrixTimesScalarOp::verify() { llvm::TypeSwitch(getMatrix().getType()) .Case( [](auto matrixType) { return matrixType.getElementType(); }) - .Default([](Type) { return nullptr; }); + .Default(nullptr); assert(elementType && "Unhandled type"); diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp index f895807ea1d18..d1e275d590f78 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp @@ -731,7 +731,7 @@ std::optional SPIRVType::getSizeInBytes() { return *elementSize * type.getNumElements(); return std::nullopt; }) - .Default(std::optional()); + .Default(std::nullopt); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp index 88e1ab6ab1e4d..cb9b7f6ec2fd2 100644 --- a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp +++ b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp @@ -1467,7 +1467,7 @@ mlir::spirv::getNativeVectorShape(Operation *op) { return TypeSwitch>>(op) .Case( [](auto typedOp) { return getNativeVectorShapeImpl(typedOp); }) - .Default([](Operation *) { return std::nullopt; }); + .Default(std::nullopt); } LogicalResult mlir::spirv::unrollVectorsInSignatures(Operation *op) { diff --git a/mlir/lib/Dialect/Tensor/Transforms/RewriteAsConstant.cpp b/mlir/lib/Dialect/Tensor/Transforms/RewriteAsConstant.cpp index 69e649d2eebe8..bc4f5a5ac7f23 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/RewriteAsConstant.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/RewriteAsConstant.cpp @@ -189,7 +189,7 @@ struct PadOpToConstant final : public OpRewritePattern { return constantFoldPadOp( rewriter, loc, inputAttr, integerAttr, *lowPad, *highPad); }) - .Default(Value()); + .Default(nullptr); if (!newOp) return rewriter.notifyMatchFailure(padTensorOp, diff --git 
a/mlir/lib/TableGen/Type.cpp b/mlir/lib/TableGen/Type.cpp index b31377e0de3e9..0f1bf83d1987b 100644 --- a/mlir/lib/TableGen/Type.cpp +++ b/mlir/lib/TableGen/Type.cpp @@ -56,7 +56,7 @@ std::optional TypeConstraint::getBuilderCall() const { StringRef value = init->getValue(); return value.empty() ? std::optional() : value; }) - .Default([](auto *) { return std::nullopt; }); + .Default(std::nullopt); } // Return the C++ type for this type (which may just be ::mlir::Type). diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp index eeb87253e5eb8..e3bcf2749be13 100644 --- a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp @@ -390,7 +390,7 @@ llvm::DISubrange *DebugTranslation::translateImpl(DISubrangeAttr attr) { .Case<>([&](LLVM::DIGlobalVariableAttr global) { return translate(global); }) - .Default([&](Attribute attr) { return nullptr; }); + .Default(nullptr); return metadata; }; return llvm::DISubrange::get(llvmCtx, getMetadataOrNull(attr.getCount()), @@ -420,10 +420,10 @@ DebugTranslation::translateImpl(DIGenericSubrangeAttr attr) { .Case([&](LLVM::DILocalVariableAttr local) { return translate(local); }) - .Case<>([&](LLVM::DIGlobalVariableAttr global) { + .Case([&](LLVM::DIGlobalVariableAttr global) { return translate(global); }) - .Default([&](Attribute attr) { return nullptr; }); + .Default(nullptr); return metadata; }; return llvm::DIGenericSubrange::get(llvmCtx, From 7aae8313ab6db251f579265d39e8b9b0d520f4e3 Mon Sep 17 00:00:00 2001 From: Gedare Bloom Date: Thu, 30 Oct 2025 13:24:44 -0600 Subject: [PATCH 244/539] [clang-format] Add BreakAfterOpenBracket* and BreakBeforeCloseBracket* (#108332) Replace the `AlwaysBreak` and `BlockIndent` suboptions of `AlignAfterOpenBracket` with new style options `BreakAfterOpenBracket*` and `BreakBeforeCloseBracket*` for `*` in `BracedList` for braced list initializers, `if` for if conditional statements, `Loop` for loop control 
statements (for/while), `Switch` for switch statements, and `Function` for function calls/declarations/definitions. Deprecates `AlwaysBreak` and `BlockIndent`. Fixes #67738 Fixes #79176 Fixes #80123 Fixes #151844 --- clang/docs/ClangFormatStyleOptions.rst | 192 ++++++++++++++----- clang/docs/ReleaseNotes.rst | 8 + clang/include/clang/Format/Format.h | 182 ++++++++++++++---- clang/lib/Format/ContinuationIndenter.cpp | 97 ++++++---- clang/lib/Format/Format.cpp | 111 +++++++++-- clang/lib/Format/FormatToken.cpp | 4 +- clang/lib/Format/FormatToken.h | 6 + clang/lib/Format/TokenAnnotator.cpp | 25 ++- clang/unittests/Format/AlignBracketsTest.cpp | 58 +++--- clang/unittests/Format/ConfigParseTest.cpp | 37 ++-- clang/unittests/Format/FormatTest.cpp | 36 ++-- clang/unittests/Format/FormatTestJS.cpp | 2 +- 12 files changed, 566 insertions(+), 192 deletions(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 570cab262c115..0b4a4849f6ccc 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -197,57 +197,29 @@ the configuration (without a prefix: ``Auto``). .. _AlignAfterOpenBracket: -**AlignAfterOpenBracket** (``BracketAlignmentStyle``) :versionbadge:`clang-format 3.8` :ref:`¶ ` +**AlignAfterOpenBracket** (``Boolean``) :versionbadge:`clang-format 3.8` :ref:`¶ ` If ``true``, horizontally aligns arguments after an open bracket. - This applies to round brackets (parentheses), angle brackets and square - brackets. - - Possible values: - - * ``BAS_Align`` (in configuration: ``Align``) - Align parameters on the open bracket, e.g.: - - .. code-block:: c++ - - someLongFunction(argument1, - argument2); - - * ``BAS_DontAlign`` (in configuration: ``DontAlign``) - Don't align, instead use ``ContinuationIndentWidth``, e.g.: - - .. 
code-block:: c++ - - someLongFunction(argument1, - argument2); - - * ``BAS_AlwaysBreak`` (in configuration: ``AlwaysBreak``) - Always break after an open bracket, if the parameters don't fit - on a single line, e.g.: - - .. code-block:: c++ - someLongFunction( - argument1, argument2); - - * ``BAS_BlockIndent`` (in configuration: ``BlockIndent``) - Always break after an open bracket, if the parameters don't fit - on a single line. Closing brackets will be placed on a new line. - E.g.: - - .. code-block:: c++ + .. code-block:: c++ - someLongFunction( - argument1, argument2 - ) + true: vs. false + someLongFunction(argument1, someLongFunction(argument1, + argument2); argument2); - .. note:: - - This currently only applies to braced initializer lists (when - ``Cpp11BracedListStyle`` is not ``Block``) and parentheses. + .. note:: + As of clang-format 22 this option is a bool with the previous + option of ``Align`` replaced with ``true``, ``DontAlign`` replaced + with ``false``, and the options of ``AlwaysBreak`` and ``BlockIndent`` + replaced with ``true`` and with setting of new style options using + ``BreakAfterOpenBracketBracedList``, ``BreakAfterOpenBracketFunction``, + ``BreakAfterOpenBracketIf``, ``BreakBeforeCloseBracketBracedList``, + ``BreakBeforeCloseBracketFunction``, and ``BreakBeforeCloseBracketIf``. + This applies to round brackets (parentheses), angle brackets and square + brackets. .. _AlignArrayOfStructures: @@ -2746,6 +2718,67 @@ the configuration (without a prefix: ``Auto``). @Mock DataLoad loader; +.. _BreakAfterOpenBracketBracedList: + +**BreakAfterOpenBracketBracedList** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break after the left bracket of a braced initializer list (when + ``Cpp11BracedListStyle`` is ``true``) when the list exceeds the column + limit. + + .. code-block:: c++ + + true: false: + vector x { vs. vector x {1, + 1, 2, 3} 2, 3} + +.. 
_BreakAfterOpenBracketFunction: + +**BreakAfterOpenBracketFunction** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break after the left parenthesis of a function (declaration, + definition, call) when the parameters exceed the column limit. + + .. code-block:: c++ + + true: false: + foo ( vs. foo (a, + a , b) b) + +.. _BreakAfterOpenBracketIf: + +**BreakAfterOpenBracketIf** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break after the left parenthesis of an if control statement + when the expression exceeds the column limit. + + .. code-block:: c++ + + true: false: + if constexpr ( vs. if constexpr (a || + a || b) b) + +.. _BreakAfterOpenBracketLoop: + +**BreakAfterOpenBracketLoop** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break after the left parenthesis of a loop control statement + when the expression exceeds the column limit. + + .. code-block:: c++ + + true: false: + while ( vs. while (a && + a && b) { b) { + +.. _BreakAfterOpenBracketSwitch: + +**BreakAfterOpenBracketSwitch** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break after the left parenthesis of a switch control statement + when the expression exceeds the column limit. + + .. code-block:: c++ + + true: false: + switch ( vs. switch (a + + a + b) { b) { + .. _BreakAfterReturnType: **BreakAfterReturnType** (``ReturnTypeBreakingStyle``) :versionbadge:`clang-format 19` :ref:`¶ ` @@ -3383,6 +3416,79 @@ the configuration (without a prefix: ``Auto``). +.. _BreakBeforeCloseBracketBracedList: + +**BreakBeforeCloseBracketBracedList** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break before the right bracket of a braced initializer list (when + ``Cpp11BracedListStyle`` is ``true``) when the list exceeds the column + limit. The break before the right bracket is only made if there is a + break after the opening bracket. + + .. code-block:: c++ + + true: false: + vector x { vs. vector x { + 1, 2, 3 1, 2, 3} + } + +.. 
_BreakBeforeCloseBracketFunction: + +**BreakBeforeCloseBracketFunction** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break before the right parenthesis of a function (declaration, + definition, call) when the parameters exceed the column limit. + + .. code-block:: c++ + + true: false: + foo ( vs. foo ( + a , b a , b) + ) + +.. _BreakBeforeCloseBracketIf: + +**BreakBeforeCloseBracketIf** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break before the right parenthesis of an if control statement + when the expression exceeds the column limit. The break before the + closing parenthesis is only made if there is a break after the opening + parenthesis. + + .. code-block:: c++ + + true: false: + if constexpr ( vs. if constexpr ( + a || b a || b ) + ) + +.. _BreakBeforeCloseBracketLoop: + +**BreakBeforeCloseBracketLoop** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break before the right parenthesis of a loop control statement + when the expression exceeds the column limit. The break before the + closing parenthesis is only made if there is a break after the opening + parenthesis. + + .. code-block:: c++ + + true: false: + while ( vs. while ( + a && b a && b) { + ) { + +.. _BreakBeforeCloseBracketSwitch: + +**BreakBeforeCloseBracketSwitch** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break before the right parenthesis of a switch control statement + when the expression exceeds the column limit. The break before the + closing parenthesis is only made if there is a break after the opening + parenthesis. + + .. code-block:: c++ + + true: false: + switch ( vs. switch ( + a + b a + b) { + ) { + .. 
_BreakBeforeConceptDeclarations: **BreakBeforeConceptDeclarations** (``BreakBeforeConceptDeclarationsStyle``) :versionbadge:`clang-format 12` :ref:`¶ ` diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 8435f367029a5..ba737b9efb003 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -613,6 +613,14 @@ clang-format literals. - Add ``Leave`` suboption to ``IndentPPDirectives``. - Add ``AllowBreakBeforeQtProperty`` option. +- Add ``BreakAfterOpenBracketBracedList``, ``BreakAfterOpenBracketFunction``, + ``BreakAfterOpenBracketIf``, ``BreakAfterOpenBracketLoop``, + ``BreakAfterOpenBracketSwitch``, ``BreakBeforeCloseBracketBracedList``, + ``BreakBeforeCloseBracketFunction``, ``BreakBeforeCloseBracketIf``, + ``BreakBeforeCloseBracketLoop``, ``BreakBeforeCloseBracketSwitch`` options. +- Deprecate ``AlwaysBreak`` and ``BlockIndent`` suboptions from the + ``AlignAfterOpenBracket`` option, and make ``AlignAfterOpenBracket`` a + ``bool`` type. libclang -------- diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index 2852c4a2916a4..f246defc1fe81 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -62,49 +62,28 @@ struct FormatStyle { /// \version 3.3 int AccessModifierOffset; - /// Different styles for aligning after open brackets.
- enum BracketAlignmentStyle : int8_t { - /// Align parameters on the open bracket, e.g.: - /// \code - /// someLongFunction(argument1, - /// argument2); - /// \endcode - BAS_Align, - /// Don't align, instead use ``ContinuationIndentWidth``, e.g.: - /// \code - /// someLongFunction(argument1, - /// argument2); - /// \endcode - BAS_DontAlign, - /// Always break after an open bracket, if the parameters don't fit - /// on a single line, e.g.: - /// \code - /// someLongFunction( - /// argument1, argument2); - /// \endcode - BAS_AlwaysBreak, - /// Always break after an open bracket, if the parameters don't fit - /// on a single line. Closing brackets will be placed on a new line. - /// E.g.: - /// \code - /// someLongFunction( - /// argument1, argument2 - /// ) - /// \endcode - /// - /// \note - /// This currently only applies to braced initializer lists (when - /// ``Cpp11BracedListStyle`` is not ``Block``) and parentheses. - /// \endnote - BAS_BlockIndent, - }; - /// If ``true``, horizontally aligns arguments after an open bracket. /// + /// \code + /// true: vs. false + /// someLongFunction(argument1, someLongFunction(argument1, + /// argument2); argument2); + /// \endcode + /// + /// \note + /// As of clang-format 22 this option is a bool with the previous + /// option of ``Align`` replaced with ``true``, ``DontAlign`` replaced + /// with ``false``, and the options of ``AlwaysBreak`` and ``BlockIndent`` + /// replaced with ``true`` and with setting of new style options using + /// ``BreakAfterOpenBracketBracedList``, ``BreakAfterOpenBracketFunction``, + /// ``BreakAfterOpenBracketIf``, ``BreakBeforeCloseBracketBracedList``, + /// ``BreakBeforeCloseBracketFunction``, and ``BreakBeforeCloseBracketIf``. + /// \endnote + /// /// This applies to round brackets (parentheses), angle brackets and square /// brackets. /// \version 3.8 - BracketAlignmentStyle AlignAfterOpenBracket; + bool AlignAfterOpenBracket; /// Different style for aligning array initializers. 
enum ArrayInitializerAlignmentStyle : int8_t { @@ -1708,6 +1687,57 @@ struct FormatStyle { /// \version 16 AttributeBreakingStyle BreakAfterAttributes; + /// Force break after the left bracket of a braced initializer list (when + /// ``Cpp11BracedListStyle`` is ``true``) when the list exceeds the column + /// limit. + /// \code + /// true: false: + /// vector x { vs. vector x {1, + /// 1, 2, 3} 2, 3} + /// \endcode + /// \version 22 + bool BreakAfterOpenBracketBracedList; + + /// Force break after the left parenthesis of a function (declaration, + /// definition, call) when the parameters exceed the column limit. + /// \code + /// true: false: + /// foo ( vs. foo (a, + /// a , b) b) + /// \endcode + /// \version 22 + bool BreakAfterOpenBracketFunction; + + /// Force break after the left parenthesis of an if control statement + /// when the expression exceeds the column limit. + /// \code + /// true: false: + /// if constexpr ( vs. if constexpr (a || + /// a || b) b) + /// \endcode + /// \version 22 + bool BreakAfterOpenBracketIf; + + /// Force break after the left parenthesis of a loop control statement + /// when the expression exceeds the column limit. + /// \code + /// true: false: + /// while ( vs. while (a && + /// a && b) { b) { + /// \endcode + /// \version 22 + bool BreakAfterOpenBracketLoop; + + /// Force break after the left parenthesis of a switch control statement + /// when the expression exceeds the column limit. + /// \code + /// true: false: + /// switch ( vs. switch (a + + /// a + b) { b) { + /// \endcode + /// \version 22 + bool BreakAfterOpenBracketSwitch; + /// The function declaration return type breaking style to use. /// \version 19 ReturnTypeBreakingStyle BreakAfterReturnType; @@ -2221,6 +2251,69 @@ struct FormatStyle { /// \version 3.7 BraceBreakingStyle BreakBeforeBraces; + /// Force break before the right bracket of a braced initializer list (when + /// ``Cpp11BracedListStyle`` is ``true``) when the list exceeds the column + /// limit. 
The break before the right bracket is only made if there is a + /// break after the opening bracket. + /// \code + /// true: false: + /// vector x { vs. vector x { + /// 1, 2, 3 1, 2, 3} + /// } + /// \endcode + /// \version 22 + bool BreakBeforeCloseBracketBracedList; + + /// Force break before the right parenthesis of a function (declaration, + /// definition, call) when the parameters exceed the column limit. + /// \code + /// true: false: + /// foo ( vs. foo ( + /// a , b a , b) + /// ) + /// \endcode + /// \version 22 + bool BreakBeforeCloseBracketFunction; + + /// Force break before the right parenthesis of an if control statement + /// when the expression exceeds the column limit. The break before the + /// closing parenthesis is only made if there is a break after the opening + /// parenthesis. + /// \code + /// true: false: + /// if constexpr ( vs. if constexpr ( + /// a || b a || b ) + /// ) + /// \endcode + /// \version 22 + bool BreakBeforeCloseBracketIf; + + /// Force break before the right parenthesis of a loop control statement + /// when the expression exceeds the column limit. The break before the + /// closing parenthesis is only made if there is a break after the opening + /// parenthesis. + /// \code + /// true: false: + /// while ( vs. while ( + /// a && b a && b) { + /// ) { + /// \endcode + /// \version 22 + bool BreakBeforeCloseBracketLoop; + + /// Force break before the right parenthesis of a switch control statement + /// when the expression exceeds the column limit. The break before the + /// closing parenthesis is only made if there is a break after the opening + /// parenthesis. + /// \code + /// true: false: + /// switch ( vs. switch ( + /// a + b a + b) { + /// ) { + /// \endcode + /// \version 22 + bool BreakBeforeCloseBracketSwitch; + /// Different ways to break before concept declarations. enum BreakBeforeConceptDeclarationsStyle : int8_t { /// Keep the template declaration line together with ``concept``. 
@@ -5530,10 +5623,23 @@ struct FormatStyle { BreakAdjacentStringLiterals == R.BreakAdjacentStringLiterals && BreakAfterAttributes == R.BreakAfterAttributes && BreakAfterJavaFieldAnnotations == R.BreakAfterJavaFieldAnnotations && + BreakAfterOpenBracketBracedList == + R.BreakAfterOpenBracketBracedList && + BreakAfterOpenBracketFunction == R.BreakAfterOpenBracketFunction && + BreakAfterOpenBracketIf == R.BreakAfterOpenBracketIf && + BreakAfterOpenBracketLoop == R.BreakAfterOpenBracketLoop && + BreakAfterOpenBracketSwitch == R.BreakAfterOpenBracketSwitch && BreakAfterReturnType == R.BreakAfterReturnType && BreakArrays == R.BreakArrays && BreakBeforeBinaryOperators == R.BreakBeforeBinaryOperators && BreakBeforeBraces == R.BreakBeforeBraces && + BreakBeforeCloseBracketBracedList == + R.BreakBeforeCloseBracketBracedList && + BreakBeforeCloseBracketFunction == + R.BreakBeforeCloseBracketFunction && + BreakBeforeCloseBracketIf == R.BreakBeforeCloseBracketIf && + BreakBeforeCloseBracketLoop == R.BreakBeforeCloseBracketLoop && + BreakBeforeCloseBracketSwitch == R.BreakBeforeCloseBracketSwitch && BreakBeforeConceptDeclarations == R.BreakBeforeConceptDeclarations && BreakBeforeInlineASMColon == R.BreakBeforeInlineASMColon && BreakBeforeTemplateCloser == R.BreakBeforeTemplateCloser && diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index e5abf833194d4..9ab024a03fbd7 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -356,9 +356,11 @@ bool ContinuationIndenter::canBreak(const LineState &State) { return CurrentState.BreakBeforeClosingBrace; } - // Allow breaking before the right parens with block indentation if there was - // a break after the left parens, which is tracked by BreakBeforeClosingParen. 
- if (Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent && + // Check need to break before the right parens if there was a break after + // the left parens, which is tracked by BreakBeforeClosingParen. + if ((Style.BreakBeforeCloseBracketFunction || + Style.BreakBeforeCloseBracketIf || Style.BreakBeforeCloseBracketLoop || + Style.BreakBeforeCloseBracketSwitch) && Current.is(tok::r_paren)) { return CurrentState.BreakBeforeClosingParen; } @@ -837,32 +839,38 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, return Tok.is(tok::l_brace) && Tok.isNot(BK_Block) && Style.Cpp11BracedListStyle != FormatStyle::BLS_Block; }; - if (Tok.isNoneOf(tok::l_paren, TT_TemplateOpener, tok::l_square) && - !IsStartOfBracedList()) { + if (IsStartOfBracedList()) + return Style.BreakAfterOpenBracketBracedList; + if (Tok.isNoneOf(tok::l_paren, TT_TemplateOpener, tok::l_square)) return false; - } if (!Tok.Previous) return true; if (Tok.Previous->isIf()) - return Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak; - return Tok.Previous->isNoneOf(TT_CastRParen, tok::kw_for, tok::kw_while, - tok::kw_switch) && - !(Style.isJavaScript() && Tok.Previous->is(Keywords.kw_await)); + return Style.BreakAfterOpenBracketIf; + if (Tok.Previous->isLoop(Style)) + return Style.BreakAfterOpenBracketLoop; + if (Tok.Previous->is(tok::kw_switch)) + return Style.BreakAfterOpenBracketSwitch; + if (Style.BreakAfterOpenBracketFunction) { + return !Tok.Previous->is(TT_CastRParen) && + !(Style.isJavaScript() && Tok.is(Keywords.kw_await)); + } + return false; }; auto IsFunctionCallParen = [](const FormatToken &Tok) { return Tok.is(tok::l_paren) && Tok.ParameterCount > 0 && Tok.Previous && Tok.Previous->is(tok::identifier); }; - auto IsInTemplateString = [this](const FormatToken &Tok) { + auto IsInTemplateString = [this](const FormatToken &Tok, bool NestBlocks) { if (!Style.isJavaScript()) return false; for (const auto *Prev = &Tok; Prev; Prev = Prev->Previous) { if 
(Prev->is(TT_TemplateString) && Prev->opensScope()) return true; - if (Prev->opensScope() || - (Prev->is(TT_TemplateString) && Prev->closesScope())) { - break; - } + if (Prev->opensScope() && !NestBlocks) + return false; + if (Prev->is(TT_TemplateString) && Prev->closesScope()) + return false; } return false; }; @@ -884,21 +892,25 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, Tok.isOneOf(tok::ellipsis, Keywords.kw_await))) { return true; } - if (const auto *Previous = Tok.Previous; - !Previous || (Previous->isNoneOf(TT_FunctionDeclarationLParen, - TT_LambdaDefinitionLParen) && - !IsFunctionCallParen(*Previous))) { + const auto *Previous = TokAfterLParen.Previous; + assert(Previous); // IsOpeningBracket(Previous) + if (Previous->Previous && + (Previous->Previous->isIf() || Previous->Previous->isLoop(Style) || + Previous->Previous->is(tok::kw_switch))) { + return false; + } + if (Previous->isNoneOf(TT_FunctionDeclarationLParen, + TT_LambdaDefinitionLParen) && + !IsFunctionCallParen(*Previous)) { return true; } - if (IsOpeningBracket(Tok) || IsInTemplateString(Tok)) + if (IsOpeningBracket(Tok) || IsInTemplateString(Tok, true)) return true; const auto *Next = Tok.Next; return !Next || Next->isMemberAccess() || Next->is(TT_FunctionDeclarationLParen) || IsFunctionCallParen(*Next); }; - if ((Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak || - Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent) && - IsOpeningBracket(Previous) && State.Column > getNewLineColumn(State) && + if (IsOpeningBracket(Previous) && State.Column > getNewLineColumn(State) && // Don't do this for simple (no expressions) one-argument function calls // as that feels like needlessly wasting whitespace, e.g.: // @@ -920,7 +932,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, // Note: This doesn't apply to macro expansion lines, which are MACRO( , , ) // with args as children of the '(' and ',' tokens. 
It does not make sense to // align the commas with the opening paren. - if (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign && + if (Style.AlignAfterOpenBracket && !CurrentState.IsCSharpGenericTypeConstraint && Previous.opensScope() && Previous.isNoneOf(TT_ObjCMethodExpr, TT_RequiresClause, TT_TableGenDAGArgOpener, @@ -933,7 +945,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, Previous.Previous->isNoneOf(tok::identifier, tok::l_paren, BK_BracedInit))) || Previous.is(TT_VerilogMultiLineListLParen)) && - !IsInTemplateString(Current)) { + !IsInTemplateString(Current, false)) { CurrentState.Indent = State.Column + Spaces; CurrentState.IsAligned = true; } @@ -1271,8 +1283,20 @@ unsigned ContinuationIndenter::addTokenOnNewLine(LineState &State, } if (PreviousNonComment && PreviousNonComment->is(tok::l_paren)) { - CurrentState.BreakBeforeClosingParen = - Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent; + if (auto Previous = PreviousNonComment->Previous) { + if (Previous->isIf()) { + CurrentState.BreakBeforeClosingParen = Style.BreakBeforeCloseBracketIf; + } else if (Previous->isLoop(Style)) { + CurrentState.BreakBeforeClosingParen = + Style.BreakBeforeCloseBracketLoop; + } else if (Previous->is(tok::kw_switch)) { + CurrentState.BreakBeforeClosingParen = + Style.BreakBeforeCloseBracketSwitch; + } else { + CurrentState.BreakBeforeClosingParen = + Style.BreakBeforeCloseBracketFunction; + } + } } if (PreviousNonComment && PreviousNonComment->is(TT_TemplateOpener)) @@ -1416,13 +1440,17 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) { State.Stack.size() > 1) { return State.Stack[State.Stack.size() - 2].LastSpace; } - if (Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent && - (Current.is(tok::r_paren) || - (Current.is(tok::r_brace) && Current.MatchingParen && - Current.MatchingParen->is(BK_BracedInit))) && + if (Style.BreakBeforeCloseBracketBracedList && Current.is(tok::r_brace) && + 
Current.MatchingParen && Current.MatchingParen->is(BK_BracedInit) && State.Stack.size() > 1) { return State.Stack[State.Stack.size() - 2].LastSpace; } + if ((Style.BreakBeforeCloseBracketFunction || + Style.BreakBeforeCloseBracketIf || Style.BreakBeforeCloseBracketLoop || + Style.BreakBeforeCloseBracketSwitch) && + Current.is(tok::r_paren) && State.Stack.size() > 1) { + return State.Stack[State.Stack.size() - 2].LastSpace; + } if (Style.BreakBeforeTemplateCloser && Current.is(TT_TemplateCloser) && State.Stack.size() > 1) { return State.Stack[State.Stack.size() - 2].LastSpace; @@ -1844,8 +1872,8 @@ void ContinuationIndenter::moveStatePastFakeLParens(LineState &State, PrecedenceLevel < prec::Assignment) && (!Previous || Previous->isNot(tok::kw_return) || (!Style.isJava() && PrecedenceLevel > 0)) && - (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign || - PrecedenceLevel > prec::Comma || Current.NestingLevel == 0) && + (Style.AlignAfterOpenBracket || PrecedenceLevel > prec::Comma || + Current.NestingLevel == 0) && (!Style.isTableGen() || (Previous && Previous->isOneOf(TT_TableGenDAGArgListComma, TT_TableGenDAGArgListCommaToBreak)))) { @@ -1885,8 +1913,7 @@ void ContinuationIndenter::moveStatePastFakeLParens(LineState &State, if (PrecedenceLevel > prec::Unknown) NewParenState.LastSpace = std::max(NewParenState.LastSpace, State.Column); if (PrecedenceLevel != prec::Conditional && - Current.isNot(TT_UnaryOperator) && - Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign) { + Current.isNot(TT_UnaryOperator) && Style.AlignAfterOpenBracket) { NewParenState.StartOfFunctionCall = State.Column; } diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index edd126c7724b8..dd14fcd72922f 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -32,6 +32,13 @@ using clang::format::FormatStyle; LLVM_YAML_IS_SEQUENCE_VECTOR(FormatStyle::RawStringFormat) +enum BracketAlignmentStyle : int8_t { + BAS_Align, + BAS_DontAlign, + 
BAS_AlwaysBreak, + BAS_BlockIndent +}; + namespace llvm { namespace yaml { template <> @@ -204,16 +211,16 @@ template <> struct MappingTraits { } }; -template <> struct ScalarEnumerationTraits { - static void enumeration(IO &IO, FormatStyle::BracketAlignmentStyle &Value) { - IO.enumCase(Value, "Align", FormatStyle::BAS_Align); - IO.enumCase(Value, "DontAlign", FormatStyle::BAS_DontAlign); - IO.enumCase(Value, "AlwaysBreak", FormatStyle::BAS_AlwaysBreak); - IO.enumCase(Value, "BlockIndent", FormatStyle::BAS_BlockIndent); +template <> struct ScalarEnumerationTraits { + static void enumeration(IO &IO, BracketAlignmentStyle &Value) { + IO.enumCase(Value, "Align", BAS_Align); + IO.enumCase(Value, "DontAlign", BAS_DontAlign); // For backward compatibility. - IO.enumCase(Value, "true", FormatStyle::BAS_Align); - IO.enumCase(Value, "false", FormatStyle::BAS_DontAlign); + IO.enumCase(Value, "true", BAS_Align); + IO.enumCase(Value, "false", BAS_DontAlign); + IO.enumCase(Value, "AlwaysBreak", BAS_AlwaysBreak); + IO.enumCase(Value, "BlockIndent", BAS_BlockIndent); } }; @@ -979,6 +986,54 @@ template <> struct MappingTraits { bool SpacesInCStyleCastParentheses = false; bool SpacesInParentheses = false; + if (IO.outputting()) { + IO.mapOptional("AlignAfterOpenBracket", Style.AlignAfterOpenBracket); + } else { + // For backward compatibility. 
+ BracketAlignmentStyle LocalBAS = BAS_Align; + if (IsGoogleOrChromium) { + FormatStyle::LanguageKind Language = Style.Language; + if (Language == FormatStyle::LK_None) + Language = ((FormatStyle *)IO.getContext())->Language; + if (Language == FormatStyle::LK_JavaScript) + LocalBAS = BAS_AlwaysBreak; + else if (Language == FormatStyle::LK_Java) + LocalBAS = BAS_DontAlign; + } else if (BasedOnStyle.equals_insensitive("webkit")) { + LocalBAS = BAS_DontAlign; + } + IO.mapOptional("AlignAfterOpenBracket", LocalBAS); + Style.BreakAfterOpenBracketBracedList = false; + Style.BreakAfterOpenBracketFunction = false; + Style.BreakAfterOpenBracketIf = false; + Style.BreakAfterOpenBracketLoop = false; + Style.BreakAfterOpenBracketSwitch = false; + Style.BreakBeforeCloseBracketBracedList = false; + Style.BreakBeforeCloseBracketFunction = false; + Style.BreakBeforeCloseBracketIf = false; + Style.BreakBeforeCloseBracketLoop = false; + Style.BreakBeforeCloseBracketSwitch = false; + + switch (LocalBAS) { + case BAS_DontAlign: + Style.AlignAfterOpenBracket = false; + break; + case BAS_BlockIndent: + Style.BreakBeforeCloseBracketBracedList = true; + Style.BreakBeforeCloseBracketFunction = true; + Style.BreakBeforeCloseBracketIf = true; + [[fallthrough]]; + case BAS_AlwaysBreak: + Style.BreakAfterOpenBracketBracedList = true; + Style.BreakAfterOpenBracketFunction = true; + Style.BreakAfterOpenBracketIf = true; + [[fallthrough]]; + case BAS_Align: + Style.AlignAfterOpenBracket = true; + break; + } + } + // For backward compatibility. 
if (!IO.outputting()) { IO.mapOptional("AlignEscapedNewlinesLeft", Style.AlignEscapedNewlines); @@ -1014,7 +1069,6 @@ template <> struct MappingTraits { } IO.mapOptional("AccessModifierOffset", Style.AccessModifierOffset); - IO.mapOptional("AlignAfterOpenBracket", Style.AlignAfterOpenBracket); IO.mapOptional("AlignArrayOfStructures", Style.AlignArrayOfStructures); IO.mapOptional("AlignConsecutiveAssignments", Style.AlignConsecutiveAssignments); @@ -1079,10 +1133,29 @@ template <> struct MappingTraits { IO.mapOptional("BreakAfterAttributes", Style.BreakAfterAttributes); IO.mapOptional("BreakAfterJavaFieldAnnotations", Style.BreakAfterJavaFieldAnnotations); + IO.mapOptional("BreakAfterOpenBracketBracedList", + Style.BreakAfterOpenBracketBracedList); + IO.mapOptional("BreakAfterOpenBracketFunction", + Style.BreakAfterOpenBracketFunction); + IO.mapOptional("BreakAfterOpenBracketIf", Style.BreakAfterOpenBracketIf); + IO.mapOptional("BreakAfterOpenBracketLoop", + Style.BreakAfterOpenBracketLoop); + IO.mapOptional("BreakAfterOpenBracketSwitch", + Style.BreakAfterOpenBracketSwitch); IO.mapOptional("BreakAfterReturnType", Style.BreakAfterReturnType); IO.mapOptional("BreakArrays", Style.BreakArrays); IO.mapOptional("BreakBeforeBinaryOperators", Style.BreakBeforeBinaryOperators); + IO.mapOptional("BreakBeforeCloseBracketBracedList", + Style.BreakBeforeCloseBracketBracedList); + IO.mapOptional("BreakBeforeCloseBracketFunction", + Style.BreakBeforeCloseBracketFunction); + IO.mapOptional("BreakBeforeCloseBracketIf", + Style.BreakBeforeCloseBracketIf); + IO.mapOptional("BreakBeforeCloseBracketLoop", + Style.BreakBeforeCloseBracketLoop); + IO.mapOptional("BreakBeforeCloseBracketSwitch", + Style.BreakBeforeCloseBracketSwitch); IO.mapOptional("BreakBeforeConceptDeclarations", Style.BreakBeforeConceptDeclarations); IO.mapOptional("BreakBeforeBraces", Style.BreakBeforeBraces); @@ -1561,7 +1634,7 @@ static void expandPresetsSpacesInParens(FormatStyle &Expanded) { FormatStyle 
getLLVMStyle(FormatStyle::LanguageKind Language) { FormatStyle LLVMStyle; LLVMStyle.AccessModifierOffset = -2; - LLVMStyle.AlignAfterOpenBracket = FormatStyle::BAS_Align; + LLVMStyle.AlignAfterOpenBracket = true; LLVMStyle.AlignArrayOfStructures = FormatStyle::AIAS_None; LLVMStyle.AlignConsecutiveAssignments = {}; LLVMStyle.AlignConsecutiveAssignments.PadOperators = true; @@ -1621,10 +1694,20 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.BreakAdjacentStringLiterals = true; LLVMStyle.BreakAfterAttributes = FormatStyle::ABS_Leave; LLVMStyle.BreakAfterJavaFieldAnnotations = false; + LLVMStyle.BreakAfterOpenBracketBracedList = false; + LLVMStyle.BreakAfterOpenBracketFunction = false; + LLVMStyle.BreakAfterOpenBracketIf = false; + LLVMStyle.BreakAfterOpenBracketLoop = false; + LLVMStyle.BreakAfterOpenBracketSwitch = false; LLVMStyle.BreakAfterReturnType = FormatStyle::RTBS_None; LLVMStyle.BreakArrays = true; LLVMStyle.BreakBeforeBinaryOperators = FormatStyle::BOS_None; LLVMStyle.BreakBeforeBraces = FormatStyle::BS_Attach; + LLVMStyle.BreakBeforeCloseBracketBracedList = false; + LLVMStyle.BreakBeforeCloseBracketFunction = false; + LLVMStyle.BreakBeforeCloseBracketIf = false; + LLVMStyle.BreakBeforeCloseBracketLoop = false; + LLVMStyle.BreakBeforeCloseBracketSwitch = false; LLVMStyle.BreakBeforeConceptDeclarations = FormatStyle::BBCDS_Always; LLVMStyle.BreakBeforeInlineASMColon = FormatStyle::BBIAS_OnlyMultiline; LLVMStyle.BreakBeforeTemplateCloser = false; @@ -1877,7 +1960,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) { GoogleStyle.PenaltyReturnTypeOnItsOwnLine = 200; if (Language == FormatStyle::LK_Java) { - GoogleStyle.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign; + GoogleStyle.AlignAfterOpenBracket = false; GoogleStyle.AlignOperands = FormatStyle::OAS_DontAlign; GoogleStyle.AlignTrailingComments = {}; GoogleStyle.AlignTrailingComments.Kind = FormatStyle::TCAS_Never; @@ -1889,7 +1972,9 @@ FormatStyle 
getGoogleStyle(FormatStyle::LanguageKind Language) { GoogleStyle.SpaceAfterCStyleCast = true; GoogleStyle.SpacesBeforeTrailingComments = 1; } else if (Language == FormatStyle::LK_JavaScript) { - GoogleStyle.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + GoogleStyle.BreakAfterOpenBracketBracedList = true; + GoogleStyle.BreakAfterOpenBracketFunction = true; + GoogleStyle.BreakAfterOpenBracketIf = true; GoogleStyle.AlignOperands = FormatStyle::OAS_DontAlign; GoogleStyle.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Empty; // TODO: still under discussion whether to switch to SLS_All. @@ -2026,7 +2111,7 @@ FormatStyle getMozillaStyle() { FormatStyle getWebKitStyle() { FormatStyle Style = getLLVMStyle(); Style.AccessModifierOffset = -4; - Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign; + Style.AlignAfterOpenBracket = false; Style.AlignOperands = FormatStyle::OAS_DontAlign; Style.AlignTrailingComments = {}; Style.AlignTrailingComments.Kind = FormatStyle::TCAS_Never; diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp index d1c62642efd43..28fdbcbf0e47f 100644 --- a/clang/lib/Format/FormatToken.cpp +++ b/clang/lib/Format/FormatToken.cpp @@ -68,7 +68,7 @@ bool FormatToken::isBlockIndentedInitRBrace(const FormatStyle &Style) const { assert(MatchingParen); assert(MatchingParen->is(tok::l_brace)); if (Style.Cpp11BracedListStyle == FormatStyle::BLS_Block || - Style.AlignAfterOpenBracket != FormatStyle::BAS_BlockIndent) { + !Style.BreakBeforeCloseBracketBracedList) { return false; } const auto *LBrace = MatchingParen; @@ -198,7 +198,7 @@ void CommaSeparatedList::precomputeFormattingInfos(const FormatToken *Token) { return; // Column format doesn't really make sense if we don't align after brackets. 
- if (Style.AlignAfterOpenBracket == FormatStyle::BAS_DontAlign) + if (!Style.AlignAfterOpenBracket) return; FormatToken *ItemBegin = Token->Next; diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index 6f3d24aefc1ca..d833130a538f1 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -666,6 +666,12 @@ struct FormatToken { (endsSequence(tok::identifier, tok::kw_if) && AllowConstexprMacro); } + bool isLoop(const FormatStyle &Style) const { + return isOneOf(tok::kw_for, tok::kw_while) || + (Style.isJavaScript() && isNot(tok::l_paren) && Previous && + Previous->is(tok::kw_for)); + } + bool closesScopeAfterBlock() const { if (getBlockKind() == BK_Block) return true; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 021d8c658eb11..8e227da2a79ab 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -4427,10 +4427,8 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line, if (Left.is(tok::l_paren) && Style.PenaltyBreakOpenParenthesis != 0) return Style.PenaltyBreakOpenParenthesis; - if (Left.is(tok::l_paren) && InFunctionDecl && - Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign) { + if (Left.is(tok::l_paren) && InFunctionDecl && Style.AlignAfterOpenBracket) return 100; - } if (Left.is(tok::l_paren) && Left.Previous && (Left.Previous->isOneOf(tok::kw_for, tok::kw__Generic) || Left.Previous->isIf())) { @@ -4446,7 +4444,7 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line, // If we aren't aligning after opening parens/braces we can always break // here unless the style does not want us to place all arguments on the // next line. 
- if (Style.AlignAfterOpenBracket == FormatStyle::BAS_DontAlign && + if (!Style.AlignAfterOpenBracket && (Left.ParameterCount <= 1 || Style.AllowAllArgumentsOnNextLine)) { return 0; } @@ -6226,24 +6224,31 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, (Right.isBlockIndentedInitRBrace(Style))); } - // We only break before r_paren if we're in a block indented context. + // We can break before r_paren if we're in a block indented context or + // a control statement with an explicit style option. if (Right.is(tok::r_paren)) { - if (Style.AlignAfterOpenBracket != FormatStyle::BAS_BlockIndent || - !Right.MatchingParen) { + if (!Right.MatchingParen) return false; - } auto Next = Right.Next; if (Next && Next->is(tok::r_paren)) Next = Next->Next; if (Next && Next->is(tok::l_paren)) return false; const FormatToken *Previous = Right.MatchingParen->Previous; - return !(Previous && (Previous->is(tok::kw_for) || Previous->isIf())); + if (!Previous) + return false; + if (Previous->isIf()) + return Style.BreakBeforeCloseBracketIf; + if (Previous->isLoop(Style)) + return Style.BreakBeforeCloseBracketLoop; + if (Previous->is(tok::kw_switch)) + return Style.BreakBeforeCloseBracketSwitch; + return Style.BreakBeforeCloseBracketFunction; } if (Left.isOneOf(tok::r_paren, TT_TrailingAnnotation) && Right.is(TT_TrailingAnnotation) && - Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent) { + Style.BreakBeforeCloseBracketFunction) { return false; } diff --git a/clang/unittests/Format/AlignBracketsTest.cpp b/clang/unittests/Format/AlignBracketsTest.cpp index ea8db51a4d18e..10ca5fb7da1ce 100644 --- a/clang/unittests/Format/AlignBracketsTest.cpp +++ b/clang/unittests/Format/AlignBracketsTest.cpp @@ -28,7 +28,7 @@ TEST_F(AlignBracketsTest, AlignsAfterOpenBracket) { "SomeLongVariableName->someFunction(foooooooo(aaaaaaaaaaaaaaa,\n" " aaaaaaaaaaaaaaaaaaaaa));"); FormatStyle Style = getLLVMStyle(); - Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign; + 
Style.AlignAfterOpenBracket = false; verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n" " aaaaaaaaaaa aaaaaaaa, aaaaaaaaa aaaaaaa) {}", Style); @@ -64,7 +64,7 @@ TEST_F(AlignBracketsTest, AlignsAfterOpenBracket) { Style); Style.ColumnLimit = 80; - Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + Style.BreakAfterOpenBracketFunction = true; Style.BinPackArguments = false; Style.BinPackParameters = FormatStyle::BPPS_OnePerLine; verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n" @@ -115,7 +115,9 @@ TEST_F(AlignBracketsTest, AlignsAfterOpenBracket) { " XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXZZZZZZZZZZZZZZZZZZZZZZZZZ()));", Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent; + Style.BreakAfterOpenBracketFunction = true; + Style.BreakBeforeCloseBracketFunction = true; + Style.BreakBeforeCloseBracketBracedList = true; Style.BinPackArguments = false; Style.BinPackParameters = FormatStyle::BPPS_OnePerLine; verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n" @@ -254,7 +256,8 @@ TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndent) { "argument5));", Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent; + Style.BreakAfterOpenBracketFunction = true; + Style.BreakBeforeCloseBracketFunction = true; verifyFormat(Short, Style); verifyFormat( @@ -378,7 +381,8 @@ TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndentIfStatement) { "}", Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent; + Style.BreakAfterOpenBracketFunction = true; + Style.BreakBeforeCloseBracketFunction = true; verifyFormat("if (foo()) {\n" " return;\n" @@ -440,7 +444,8 @@ TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndentForStatement) { "}", Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent; + Style.BreakAfterOpenBracketFunction = true; + Style.BreakBeforeCloseBracketFunction = true; verifyFormat("for (int i = 0; i < 5; ++i) {\n" " doSomething();\n" @@ -457,7 +462,8 @@ 
TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndentForStatement) { TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndentInitializers) { auto Style = getLLVMStyleWithColumns(60); - Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent; + Style.BreakAfterOpenBracketBracedList = true; + Style.BreakBeforeCloseBracketBracedList = true; // Aggregate initialization. verifyFormat("int LooooooooooooooooooooooooongVariable[2] = {\n" " 10000000, 20000000\n" @@ -611,13 +617,13 @@ TEST_F(AlignBracketsTest, AllowAllArgumentsOnNextLineDontAlign) { StringRef Input = "functionCall(paramA, paramB, paramC);\n" "void functionDecl(int A, int B, int C);"; Style.AllowAllArgumentsOnNextLine = false; - Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign; + Style.AlignAfterOpenBracket = false; verifyFormat(StringRef("functionCall(paramA, paramB,\n" " paramC);\n" "void functionDecl(int A, int B,\n" " int C);"), Input, Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_Align; + Style.AlignAfterOpenBracket = true; verifyFormat(StringRef("functionCall(paramA, paramB,\n" " paramC);\n" "void functionDecl(int A, int B,\n" @@ -625,13 +631,14 @@ TEST_F(AlignBracketsTest, AllowAllArgumentsOnNextLineDontAlign) { Input, Style); // However, BAS_AlwaysBreak and BAS_BlockIndent should take precedence over // AllowAllArgumentsOnNextLine. 
- Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + Style.BreakAfterOpenBracketFunction = true; verifyFormat(StringRef("functionCall(\n" " paramA, paramB, paramC);\n" "void functionDecl(\n" " int A, int B, int C);"), Input, Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent; + Style.BreakAfterOpenBracketFunction = true; + Style.BreakBeforeCloseBracketFunction = true; verifyFormat("functionCall(\n" " paramA, paramB, paramC\n" ");\n" @@ -639,11 +646,12 @@ TEST_F(AlignBracketsTest, AllowAllArgumentsOnNextLineDontAlign) { " int A, int B, int C\n" ");", Input, Style); + Style.BreakBeforeCloseBracketFunction = false; // When AllowAllArgumentsOnNextLine is set, we prefer breaking before the // first argument. Style.AllowAllArgumentsOnNextLine = true; - Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + Style.BreakAfterOpenBracketFunction = true; verifyFormat(StringRef("functionCall(\n" " paramA, paramB, paramC);\n" "void functionDecl(\n" @@ -651,13 +659,14 @@ TEST_F(AlignBracketsTest, AllowAllArgumentsOnNextLineDontAlign) { Input, Style); // It wouldn't fit on one line with aligned parameters so this setting // doesn't change anything for BAS_Align. - Style.AlignAfterOpenBracket = FormatStyle::BAS_Align; + Style.AlignAfterOpenBracket = true; + Style.BreakAfterOpenBracketFunction = false; verifyFormat(StringRef("functionCall(paramA, paramB,\n" " paramC);\n" "void functionDecl(int A, int B,\n" " int C);"), Input, Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign; + Style.BreakAfterOpenBracketFunction = true; verifyFormat(StringRef("functionCall(\n" " paramA, paramB, paramC);\n" "void functionDecl(\n" @@ -678,13 +687,14 @@ TEST_F(AlignBracketsTest, FormatsDeclarationBreakAlways) { // Ensure AlignAfterOpenBracket interacts correctly with BinPackParameters set // to BPPS_AlwaysOnePerLine. 
- BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + BreakAlways.BreakAfterOpenBracketFunction = true; verifyFormat( "void someLongFunctionName(\n" " int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n" " int b);", BreakAlways); - BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent; + BreakAlways.BreakAfterOpenBracketFunction = true; + BreakAlways.BreakBeforeCloseBracketFunction = true; verifyFormat( "void someLongFunctionName(\n" " int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n" @@ -734,7 +744,7 @@ TEST_F(AlignBracketsTest, FormatsDefinitionBreakAlways) { // Ensure AlignAfterOpenBracket interacts correctly with BinPackParameters set // to BPPS_AlwaysOnePerLine. - BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + BreakAlways.BreakAfterOpenBracketFunction = true; verifyFormat( "void someLongFunctionName(\n" " int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n" @@ -743,7 +753,8 @@ TEST_F(AlignBracketsTest, FormatsDefinitionBreakAlways) { " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa, b);\n" "}", BreakAlways); - BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent; + BreakAlways.BreakAfterOpenBracketFunction = true; + BreakAlways.BreakBeforeCloseBracketFunction = true; verifyFormat( "void someLongFunctionName(\n" " int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n" @@ -761,17 +772,17 @@ TEST_F(AlignBracketsTest, ParenthesesAndOperandAlignment) { verifyFormat("int a = f(aaaaaaaaaaaaaaaaaaaaaa &&\n" " bbbbbbbbbbbbbbbbbbbbbb);", Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_Align; + Style.AlignAfterOpenBracket = true; Style.AlignOperands = FormatStyle::OAS_DontAlign; verifyFormat("int a = f(aaaaaaaaaaaaaaaaaaaaaa &&\n" " bbbbbbbbbbbbbbbbbbbbbb);", Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign; + Style.AlignAfterOpenBracket = false; Style.AlignOperands = FormatStyle::OAS_Align; verifyFormat("int a = f(aaaaaaaaaaaaaaaaaaaaaa 
&&\n" " bbbbbbbbbbbbbbbbbbbbbb);", Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign; + Style.AlignAfterOpenBracket = false; Style.AlignOperands = FormatStyle::OAS_DontAlign; verifyFormat("int a = f(aaaaaaaaaaaaaaaaaaaaaa &&\n" " bbbbbbbbbbbbbbbbbbbbbb);", @@ -781,7 +792,10 @@ TEST_F(AlignBracketsTest, ParenthesesAndOperandAlignment) { TEST_F(AlignBracketsTest, BlockIndentAndNamespace) { auto Style = getLLVMStyleWithColumns(120); Style.AllowShortNamespacesOnASingleLine = true; - Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent; + Style.BreakAfterOpenBracketFunction = true; + Style.BreakAfterOpenBracketBracedList = true; + Style.BreakBeforeCloseBracketFunction = true; + Style.BreakBeforeCloseBracketBracedList = true; verifyNoCrash( "namespace {\n" diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 6488e38badee7..43b21176962ea 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -172,6 +172,16 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { CHECK_PARSE_BOOL(BinPackLongBracedList); CHECK_PARSE_BOOL(BreakAdjacentStringLiterals); CHECK_PARSE_BOOL(BreakAfterJavaFieldAnnotations); + CHECK_PARSE_BOOL(BreakAfterOpenBracketBracedList); + CHECK_PARSE_BOOL(BreakAfterOpenBracketFunction); + CHECK_PARSE_BOOL(BreakAfterOpenBracketIf); + CHECK_PARSE_BOOL(BreakAfterOpenBracketLoop); + CHECK_PARSE_BOOL(BreakAfterOpenBracketSwitch); + CHECK_PARSE_BOOL(BreakBeforeCloseBracketBracedList); + CHECK_PARSE_BOOL(BreakBeforeCloseBracketFunction); + CHECK_PARSE_BOOL(BreakBeforeCloseBracketIf); + CHECK_PARSE_BOOL(BreakBeforeCloseBracketLoop); + CHECK_PARSE_BOOL(BreakBeforeCloseBracketSwitch); CHECK_PARSE_BOOL(BreakBeforeTemplateCloser); CHECK_PARSE_BOOL(BreakBeforeTernaryOperators); CHECK_PARSE_BOOL(BreakStringLiterals); @@ -533,20 +543,23 @@ TEST(ConfigParseTest, ParsesConfiguration) { CHECK_PARSE("EnumTrailingComma: Remove", EnumTrailingComma, 
FormatStyle::ETC_Remove); - Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; - CHECK_PARSE("AlignAfterOpenBracket: Align", AlignAfterOpenBracket, - FormatStyle::BAS_Align); - CHECK_PARSE("AlignAfterOpenBracket: DontAlign", AlignAfterOpenBracket, - FormatStyle::BAS_DontAlign); + Style.AlignAfterOpenBracket = false; + CHECK_PARSE("AlignAfterOpenBracket: Align", AlignAfterOpenBracket, true); + CHECK_PARSE("AlignAfterOpenBracket: DontAlign", AlignAfterOpenBracket, false); + // For backward compatibility: CHECK_PARSE("AlignAfterOpenBracket: AlwaysBreak", AlignAfterOpenBracket, - FormatStyle::BAS_AlwaysBreak); + true); + CHECK_PARSE("AlignAfterOpenBracket: AlwaysBreak\n" + "BreakAfterOpenBracketIf: false", + BreakAfterOpenBracketIf, false); + CHECK_PARSE("BreakAfterOpenBracketLoop: true\n" + "AlignAfterOpenBracket: AlwaysBreak", + BreakAfterOpenBracketLoop, true); + CHECK_PARSE("AlignAfterOpenBracket: false", AlignAfterOpenBracket, false); CHECK_PARSE("AlignAfterOpenBracket: BlockIndent", AlignAfterOpenBracket, - FormatStyle::BAS_BlockIndent); - // For backward compatibility: - CHECK_PARSE("AlignAfterOpenBracket: false", AlignAfterOpenBracket, - FormatStyle::BAS_DontAlign); - CHECK_PARSE("AlignAfterOpenBracket: true", AlignAfterOpenBracket, - FormatStyle::BAS_Align); + true); + Style.AlignAfterOpenBracket = false; + CHECK_PARSE("AlignAfterOpenBracket: true", AlignAfterOpenBracket, true); Style.AlignEscapedNewlines = FormatStyle::ENAS_Left; CHECK_PARSE("AlignEscapedNewlines: DontAlign", AlignEscapedNewlines, diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index d45babe1b82ad..ca9e7925e5e95 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -5126,7 +5126,8 @@ TEST_F(FormatTest, DesignatedInitializers) { TEST_F(FormatTest, BracedInitializerIndentWidth) { auto Style = getLLVMStyleWithColumns(60); Style.BinPackArguments = true; - Style.AlignAfterOpenBracket = 
FormatStyle::BAS_AlwaysBreak; + Style.BreakAfterOpenBracketFunction = true; + Style.BreakAfterOpenBracketBracedList = true; Style.BracedInitializerIndentWidth = 6; // Non-initializing braces are unaffected by BracedInitializerIndentWidth. @@ -5302,7 +5303,8 @@ TEST_F(FormatTest, BracedInitializerIndentWidth) { Style); // Aligning after open braces unaffected by BracedInitializerIndentWidth. - Style.AlignAfterOpenBracket = FormatStyle::BAS_Align; + Style.AlignAfterOpenBracket = true; + Style.BreakAfterOpenBracketBracedList = false; verifyFormat("SomeStruct s{\"xxxxxxxxxxxxx\", \"yyyyyyyyyyyyy\",\n" " \"zzzzzzzzzzzzz\"};", Style); @@ -7459,7 +7461,7 @@ TEST_F(FormatTest, ExpressionIndentationBreakingBeforeOperators) { Style.IndentWidth = 4; Style.TabWidth = 4; Style.UseTab = FormatStyle::UT_Always; - Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign; + Style.AlignAfterOpenBracket = false; Style.AlignOperands = FormatStyle::OAS_DontAlign; verifyFormat("return someVeryVeryLongConditionThatBarelyFitsOnALine\n" "\t&& (someOtherLongishConditionPart1\n" @@ -7470,7 +7472,7 @@ TEST_F(FormatTest, ExpressionIndentationBreakingBeforeOperators) { Style); Style = getLLVMStyleWithColumns(20); - Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + Style.BreakAfterOpenBracketFunction = true; Style.BinPackParameters = FormatStyle::BPPS_OnePerLine; Style.BreakBeforeBinaryOperators = FormatStyle::BOS_NonAssignment; Style.ContinuationIndentWidth = 2; @@ -7632,7 +7634,7 @@ TEST_F(FormatTest, NoOperandAlignment) { " * cccccccccccccccccccccccccccccccccccc;", Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign; + Style.AlignAfterOpenBracket = false; verifyFormat("return (a > b\n" " // comment1\n" " // comment2\n" @@ -11248,7 +11250,7 @@ TEST_F(FormatTest, BreakBeforeTemplateCloser) { TEST_F(FormatTest, WrapsTemplateParameters) { FormatStyle Style = getLLVMStyle(); - Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign; + Style.AlignAfterOpenBracket = 
false; Style.BreakBeforeBinaryOperators = FormatStyle::BOS_None; verifyFormat( "template struct q {};\n" @@ -11256,7 +11258,7 @@ TEST_F(FormatTest, WrapsTemplateParameters) { " aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaa>\n" " y;", Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign; + Style.AlignAfterOpenBracket = false; Style.BreakBeforeBinaryOperators = FormatStyle::BOS_All; verifyFormat( "template struct r {};\n" @@ -11264,7 +11266,7 @@ TEST_F(FormatTest, WrapsTemplateParameters) { " aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaa>\n" " y;", Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + Style.BreakAfterOpenBracketFunction = true; Style.BreakBeforeBinaryOperators = FormatStyle::BOS_None; verifyFormat("template struct s {};\n" "extern s<\n" @@ -11274,7 +11276,7 @@ TEST_F(FormatTest, WrapsTemplateParameters) { "aaaaaaaaaaaaaaaaaaaaaa>\n" " y;", Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + Style.BreakAfterOpenBracketFunction = true; Style.BreakBeforeBinaryOperators = FormatStyle::BOS_All; verifyFormat("template struct t {};\n" "extern t<\n" @@ -14302,7 +14304,7 @@ TEST_F(FormatTest, LayoutCxx11BraceInitializers) { "};", NoBinPacking); - NoBinPacking.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + NoBinPacking.BreakAfterOpenBracketBracedList = true; verifyFormat("static uint8 CddDp83848Reg[] = {\n" " CDDDP83848_BMCR_REGISTER,\n" " CDDDP83848_BMSR_REGISTER,\n" @@ -15972,13 +15974,14 @@ TEST_F(FormatTest, BreaksStringLiteralOperands) { // In a function call with two operands, with AlignAfterOpenBracket enabled, // the first must be broken with a line break before it. 
FormatStyle Style = getLLVMStyleWithColumns(25); - Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + Style.BreakAfterOpenBracketFunction = true; verifyFormat("someFunction(\n" " \"long long long \"\n" " \"long\",\n" " a);", "someFunction(\"long long long long\", a);", Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent; + Style.BreakAfterOpenBracketFunction = true; + Style.BreakBeforeCloseBracketFunction = true; verifyFormat("someFunction(\n" " \"long long long \"\n" " \"long\",\n" @@ -17773,7 +17776,7 @@ TEST_F(FormatTest, ConfigurableSpacesInParens) { Spaces.ColumnLimit = 80; Spaces.IndentWidth = 4; - Spaces.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + Spaces.BreakAfterOpenBracketFunction = true; verifyFormat("void foo( ) {\n" " size_t foo = (*(function))(\n" " Foooo, Barrrrr, Foooo, Barrrr, FoooooooooLooooong, " @@ -17798,7 +17801,8 @@ TEST_F(FormatTest, ConfigurableSpacesInParens) { "}", Spaces); - Spaces.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent; + Spaces.BreakAfterOpenBracketFunction = true; + Spaces.BreakBeforeCloseBracketFunction = true; verifyFormat("void foo( ) {\n" " size_t foo = (*(function))(\n" " Foooo, Barrrrr, Foooo, Barrrr, FoooooooooLooooong, " @@ -22827,7 +22831,7 @@ TEST_F(FormatTest, ConstructorInitializerIndentWidth) { ": aaaaaaaaaaaaa(aaaaaaaaaaaaaa), aaaaaaaaaaaaa(aaaaaaaaaaaaaa),\n" " aaaaaaaaaaaaa(aaaaaaaaaaaaaa) {}", Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + Style.BreakAfterOpenBracketFunction = true; verifyFormat( "SomeLongTemplateVariableName<\n" " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa>", @@ -24082,7 +24086,7 @@ TEST_F(FormatTest, FormatsLambdas) { " return aFunkyFunctionCall(qux);\n" " }} {}", Style); - Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + Style.BreakAfterOpenBracketFunction = true; // FIXME: The following test should pass, but fails at the time of writing. 
#if 0 // As long as all the non-lambda arguments fit on a single line, AlwaysBreak diff --git a/clang/unittests/Format/FormatTestJS.cpp b/clang/unittests/Format/FormatTestJS.cpp index 91577b9a49167..4847151c14b33 100644 --- a/clang/unittests/Format/FormatTestJS.cpp +++ b/clang/unittests/Format/FormatTestJS.cpp @@ -2883,7 +2883,7 @@ TEST_F(FormatTestJS, DontBreakFieldsAsGoToLabels) { TEST_F(FormatTestJS, BreakAfterOpenBracket) { auto Style = getGoogleStyle(FormatStyle::LK_JavaScript); - EXPECT_EQ(Style.AlignAfterOpenBracket, FormatStyle::BAS_AlwaysBreak); + EXPECT_EQ(Style.BreakAfterOpenBracketFunction, true); verifyFormat("ctrl.onCopy(/** @type {!WizEvent}*/ (\n" " {event, targetElement: {el: () => selectedElement}}));", Style); From f157f3e996eac95e2741df9bf9b2a5871cdd1a05 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 30 Oct 2025 12:29:09 -0700 Subject: [PATCH 245/539] [NSan] Make Tests work with Internal Shell There was one test that was using a subshell. This is not supported by lit's internal shell. Rewrite the test to use the readfile substitution. Reviewers: alexander-shaposhnikov, fmayer Reviewed By: alexander-shaposhnikov, fmayer Pull Request: https://github.com/llvm/llvm-project/pull/165142 --- compiler-rt/test/nsan/Posix/allocator_mapping.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler-rt/test/nsan/Posix/allocator_mapping.cpp b/compiler-rt/test/nsan/Posix/allocator_mapping.cpp index 3a3e655e259d0..a92962e16d9d2 100644 --- a/compiler-rt/test/nsan/Posix/allocator_mapping.cpp +++ b/compiler-rt/test/nsan/Posix/allocator_mapping.cpp @@ -2,7 +2,8 @@ /// Test that a module constructor can not map memory over the NSan heap /// (without MAP_FIXED, of course). 
// RUN: %clangxx_nsan -O0 %s -o %t_1 -// RUN: %clangxx_nsan -O0 -DHEAP_ADDRESS=$(%run %t_1) %s -o %t_2 && %run %t_2 +// RUN: %run %t_1 > %t.heap_address +// RUN: %clangxx_nsan -O0 -DHEAP_ADDRESS=%{readfile:%t.heap_address} %s -o %t_2 && %run %t_2 #include #include From 246c4c73767da75cd344f06e155cb83fcaee7dbe Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 30 Oct 2025 12:30:47 -0700 Subject: [PATCH 246/539] [XRay] Make Test Work with Internal Shell There was one test that set an environment variable without using env and also used a subshell. These are features the internal shell does not support. Rewrite the test to use readfile/env. Reviewers: fmayer, MaskRay Reviewed By: fmayer Pull Request: https://github.com/llvm/llvm-project/pull/165143 --- compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp b/compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp index b8803aedc8851..36a4e65988f9a 100644 --- a/compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp @@ -1,11 +1,12 @@ // RUN: %clangxx_xray -g -std=c++11 %s -o %t // RUN: rm -f fdr-logging-1thr-* -// RUN: XRAY_OPTIONS=XRAY_OPTIONS="verbosity=1 patch_premain=true \ +// RUN: env XRAY_OPTIONS=XRAY_OPTIONS="verbosity=1 patch_premain=true \ // RUN: xray_fdr_log=true \ // RUN: xray_fdr_log_func_duration_threshold_us=0 \ // RUN: xray_logfile_base=fdr-logging-1thr-" %run %t 2>&1 +// RUN: ls fdr-logging-1thr-* | head -n1 | tr -d '\n' > %t.xray_input // RUN: %llvm_xray convert --output-format=yaml --symbolize --instr_map=%t \ -// RUN: "`ls fdr-logging-1thr-* | head -n1`" | FileCheck %s +// RUN: "%{readfile:%t.xray_input}" | FileCheck %s // RUN: rm fdr-logging-1thr-* // UNSUPPORTED: target=arm{{.*}} From 04f65dc8c7ff3d9f954d6b182eb04688e41f7447 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Thu, 30 
Oct 2025 12:36:51 -0700 Subject: [PATCH 247/539] [libc] Fix strftime_test (#165770) A typo in #165711 caused sanitizer failures (the small buffer was used for the larger test). Renamed the variables to avoid the mistake in future. --- libc/test/src/time/strftime_test.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/libc/test/src/time/strftime_test.cpp b/libc/test/src/time/strftime_test.cpp index 38176f77804d5..5222152791905 100644 --- a/libc/test/src/time/strftime_test.cpp +++ b/libc/test/src/time/strftime_test.cpp @@ -2329,20 +2329,21 @@ TEST(LlvmLibcStrftimeTest, TimeFormatFullDateTime) { TEST(LlvmLibcStrftimeTest, BufferTooSmall) { struct tm time; - char buffer[1]; + char tiny_buffer[1]; time.tm_year = get_adjusted_year(2025); time.tm_mon = 10; time.tm_mday = 24; size_t written = - LIBC_NAMESPACE::strftime(buffer, sizeof(buffer), "%F", &time); + LIBC_NAMESPACE::strftime(tiny_buffer, sizeof(tiny_buffer), "%F", &time); EXPECT_EQ(written, size_t{0}); - char buffer2[10]; + char small_buffer[10]; // The string "2025-11-24" is 10 chars, // so strftime needs 10 + 1 bytes to write the string and the null terminator. - written = LIBC_NAMESPACE::strftime(buffer, sizeof(buffer2), "%F", &time); + written = + LIBC_NAMESPACE::strftime(small_buffer, sizeof(small_buffer), "%F", &time); EXPECT_EQ(written, size_t{0}); } From c5b88e6bd544ecc62c5a00e7f38c4c30eb24fb87 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 30 Oct 2025 12:42:27 -0700 Subject: [PATCH 248/539] [MSan] Make Test work with Internal Shell This test used a subshell which is not supported by lit's internal shell. Rewrite it to use the readfile substitution. 
Reviewers: thurstond, fmayer Reviewed By: thurstond, fmayer Pull Request: https://github.com/llvm/llvm-project/pull/165144 --- compiler-rt/test/msan/allocator_mapping.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler-rt/test/msan/allocator_mapping.cpp b/compiler-rt/test/msan/allocator_mapping.cpp index e7a12da489152..6eaba7e16a5be 100644 --- a/compiler-rt/test/msan/allocator_mapping.cpp +++ b/compiler-rt/test/msan/allocator_mapping.cpp @@ -3,7 +3,8 @@ // mapping the heap early, in __msan_init. // // RUN: %clangxx_msan -O0 %s -o %t_1 -// RUN: %clangxx_msan -O0 -DHEAP_ADDRESS=$(%run %t_1) %s -o %t_2 && %run %t_2 +// RUN: %run %t_1 > %t.heap_address +// RUN: %clangxx_msan -O0 -DHEAP_ADDRESS=%{readfile:%t.heap_address} %s -o %t_2 && %run %t_2 // // This test only makes sense for the 64-bit allocator. The 32-bit allocator // does not have a fixed mapping. Exclude platforms that use the 32-bit From 7282cb182aa37ce9ad2219045293d49ddc7c8473 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 30 Oct 2025 12:46:01 -0700 Subject: [PATCH 249/539] [Profile] Rewrite Test to work with Internal Shell There was one test that used subshells to read a file. Replace those subshells with the readfile substitution. 
Reviewers: fmayer, mingmingl-llvm Reviewed By: mingmingl-llvm, fmayer Pull Request: https://github.com/llvm/llvm-project/pull/165145 --- compiler-rt/test/profile/instrprof-hostname.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/test/profile/instrprof-hostname.c b/compiler-rt/test/profile/instrprof-hostname.c index b77cf8df158bd..c0b3426eeaa84 100644 --- a/compiler-rt/test/profile/instrprof-hostname.c +++ b/compiler-rt/test/profile/instrprof-hostname.c @@ -1,7 +1,7 @@ // RUN: %clang_profgen -o %t -O3 %s // RUN: env LLVM_PROFILE_FILE=%h.%t-%h.profraw_%h %run %t -// RUN: %run uname -n > %t.n -// RUN: llvm-profdata merge -o %t.profdata `cat %t.n`.%t-`cat %t.n`.profraw_`cat %t.n` +// RUN: %run uname -n | tr -d '\n' > %t.n +// RUN: llvm-profdata merge -o %t.profdata %{readfile:%t.n}.%t-%{readfile:%t.n}.profraw_%{readfile:%t.n} // RUN: %clang_profuse=%t.profdata -o - -S -emit-llvm %s | FileCheck %s // REQUIRES: shell From 0d7f1b7569bcd3b17c087c72240a42252111f1ec Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 30 Oct 2025 12:47:14 -0700 Subject: [PATCH 250/539] [MemProf] Make Test work with Internal Shell There is one test that uses a subshell to generate a long path name. Replace it with a python invocation and a readfile substitution. This helps move compiler-rt over to lit's internal shell. 
Reviewers: fmayer, snehasish, teresajohnson Reviewed By: fmayer, teresajohnson Pull Request: https://github.com/llvm/llvm-project/pull/165146 --- compiler-rt/test/memprof/TestCases/log_path_test.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler-rt/test/memprof/TestCases/log_path_test.cpp b/compiler-rt/test/memprof/TestCases/log_path_test.cpp index 664ab79393195..683ca67122c31 100644 --- a/compiler-rt/test/memprof/TestCases/log_path_test.cpp +++ b/compiler-rt/test/memprof/TestCases/log_path_test.cpp @@ -18,7 +18,8 @@ // RUN: %env_memprof_opts=print_text=true:log_path=/dev/null/INVALID not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-BAD-DIR --dump-input=always // Too long log_path. -// RUN: %env_memprof_opts=print_text=true:log_path=`for((i=0;i<10000;i++)); do echo -n $i; done` \ +// RUN: %python -c "for i in range(0, 10000): print(i, end='')" > %t.long_log_path +// RUN: %env_memprof_opts=print_text=true:log_path=%{readfile:%t.long_log_path} \ // RUN: not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-LONG --dump-input=always // Specifying the log name via the __memprof_profile_filename variable. From 8233c50df6dacf9789081f1aa2ccdca7c9110839 Mon Sep 17 00:00:00 2001 From: wdx727 Date: Fri, 31 Oct 2025 04:11:08 +0800 Subject: [PATCH 251/539] Adding Matching and Inference Functionality to Propeller-PR3: Read basic block hashes from propeller profile. (#164223) Adding Matching and Inference Functionality to Propeller. For detailed information, please refer to the following RFC: https://discourse.llvm.org/t/rfc-adding-matching-and-inference-functionality-to-propeller/86238. This is the third PR, which is used to read basic block hashes from the propeller profile. 
The associated PRs are: PR1: https://github.com/llvm/llvm-project/pull/160706 PR2: https://github.com/llvm/llvm-project/pull/162963 co-authors: lifengxiang1025 [lifengxiang@kuaishou.com](mailto:lifengxiang@kuaishou.com); zcfh [wuminghui03@kuaishou.com](mailto:wuminghui03@kuaishou.com) Co-authored-by: lifengxiang1025 Co-authored-by: zcfh --- .../CodeGen/BasicBlockSectionsProfileReader.h | 4 ++ .../BasicBlockSectionsProfileReader.cpp | 19 +++++++ .../X86/basic-block-sections-bb-hash.ll | 51 +++++++++++++++++++ .../basic-block-sections-clusters-error.ll | 14 +++++ 4 files changed, 88 insertions(+) create mode 100644 llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll diff --git a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h index 48650a6df22ff..823753021ff74 100644 --- a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h +++ b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h @@ -54,6 +54,10 @@ struct FunctionPathAndClusterInfo { DenseMap NodeCounts; // Edge counts for each edge, stored as a nested map. DenseMap> EdgeCounts; + // Hash for each basic block. The Hashes are stored for every original block + // (not cloned blocks), hence the map key being unsigned instead of + // UniqueBBID. + DenseMap BBHashes; }; class BasicBlockSectionsProfileReader { diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp index fbcd614b85d18..485b44ae4c4aa 100644 --- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp +++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp @@ -287,6 +287,25 @@ Error BasicBlockSectionsProfileReader::ReadV1Profile() { } continue; } + case 'h': { // Basic block hash specifier. + // Skip the profile when the profile iterator (FI) refers to the + // past-the-end element.
+ if (FI == ProgramPathAndClusterInfo.end()) + continue; + for (auto BBIDHashStr : Values) { + auto [BBIDStr, HashStr] = BBIDHashStr.split(':'); + unsigned long long BBID = 0, Hash = 0; + if (getAsUnsignedInteger(BBIDStr, 10, BBID)) + return createProfileParseError(Twine("unsigned integer expected: '") + + BBIDStr + "'"); + if (getAsUnsignedInteger(HashStr, 16, Hash)) + return createProfileParseError( + Twine("unsigned integer expected in hex format: '") + HashStr + + "'"); + FI->second.BBHashes[BBID] = Hash; + } + continue; + } default: return createProfileParseError(Twine("invalid specifier: '") + Twine(Specifier) + "'"); diff --git a/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll b/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll new file mode 100644 index 0000000000000..f46d6ed262b2c --- /dev/null +++ b/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll @@ -0,0 +1,51 @@ +; BB section test with basic block hashes. +; +; RUN: llc %s -O0 -mtriple=x86_64-pc-linux -function-sections -filetype=obj -basic-block-address-map -emit-bb-hash -o %t.o +; RUN: obj2yaml %t.o -o %t.yaml +; +;; Profile for version 1: +; RUN: echo 'v1' > %t +; RUN: echo 'f foo' >> %t +; RUN: echo 'g 0:10,1:9,2:1 1:8,3:8 2:2,3:2 3:11' >> %t +; RUN: echo 'c 0 2 3' >> %t + +; These commands read BB hashes from SHT_LLVM_BB_ADDR_MAP +; and put them into the basic blocks sections profile. 
+; RUN: grep -E '^\s+(- ID:|Hash:)' %t.yaml | \ +; RUN: grep -B1 'Hash:' | \ +; RUN: sed 's/^\s*//; s/^- ID: *//; s/Hash: *0x//' | \ +; RUN: paste -d: - - | \ +; RUN: tr '\n' ' ' | \ +; RUN: sed 's/ $/\n/; s/^/h /' >> %t +; +; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t | FileCheck %s +; +define void @foo(i1 zeroext) nounwind { + %2 = alloca i8, align 1 + %3 = zext i1 %0 to i8 + store i8 %3, ptr %2, align 1 + %4 = load i8, ptr %2, align 1 + %5 = trunc i8 %4 to i1 + br i1 %5, label %6, label %8 + +6: ; preds = %1 + %7 = call i32 @bar() + br label %10 + +8: ; preds = %1 + %9 = call i32 @baz() + br label %10 + +10: ; preds = %8, %6 + ret void +} + +declare i32 @bar() #1 + +declare i32 @baz() #1 + +; CHECK: .section .text.foo,"ax",@progbits +; CHECK: callq baz +; CHECK: retq +; CHECK: .section .text.split.foo,"ax",@progbits +; CHECK: callq bar diff --git a/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll b/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll index 751ab76722c07..eb0a14b2820b4 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll @@ -69,6 +69,20 @@ ; RUN: echo 'g 0:4,1:2:3' >> %t15 ; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t15 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR15 ; CHECK-ERROR15: LLVM ERROR: invalid profile {{.*}} at line 4: unsigned integer expected: '2:3' +; RUN: echo 'v1' > %t16 +; RUN: echo 'f dummy1' >> %t16 +; RUN: echo 'c 0 1' >> %t16 +; RUN: echo 'g 0:4,1:2' >> %t16 +; RUN: echo 'h a:1111111111111111 1:ffffffffffffffff' >> %t16 +; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t16 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR16 +; CHECK-ERROR16: LLVM ERROR: invalid profile {{.*}} at line 5: unsigned integer expected: 'a' +; RUN: echo 'v1' > %t17 +; RUN: echo 'f dummy1' >> %t17 +; RUN: echo 'c 0 1' 
>> %t17 +; RUN: echo 'g 0:4,1:2' >> %t17 +; RUN: echo 'h 0:111111111111111g 1:ffffffffffffffff' >> %t17 +; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t17 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR17 +; CHECK-ERROR17: LLVM ERROR: invalid profile {{.*}} at line 5: unsigned integer expected in hex format: '111111111111111g' define i32 @dummy1(i32 %x, i32 %y, i32 %z) { From 82034f52493c7b77d301c43d9821e21b472aadb0 Mon Sep 17 00:00:00 2001 From: Rahman Lavaee Date: Thu, 30 Oct 2025 13:12:06 -0700 Subject: [PATCH 252/539] [SHT_LLVM_BB_ADDR] Implement ELF and YAML support for Propeller CFG data in PGO analysis map. (#164914) This PR implements the ELF support for PostLink CFG in PGO analysis map as discussed in [RFC](https://discourse.llvm.org/t/rfc-extending-the-pgo-analysis-map-with-propeller-cfg-frequencies/88617/2). A later PR will implement the Codegen Support. --- llvm/include/llvm/Object/ELFTypes.h | 42 +++++---- llvm/include/llvm/ObjectYAML/ELFYAML.h | 4 +- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 2 +- llvm/lib/Object/ELF.cpp | 26 ++++-- llvm/lib/ObjectYAML/ELFEmitter.cpp | 18 +++- llvm/lib/ObjectYAML/ELFYAML.cpp | 4 +- .../ELF/bb-addr-map-feature-warning.test | 37 ++++++++ .../ELF/bb-addr-map-pgo-analysis-map.test | 17 ++-- .../ELF/bb-addr-map-pgo-analysis-map.yaml | 49 +++++----- .../ELF/bb-addr-map-pgo-analysis-map.yaml | 25 +++--- llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml | 4 +- llvm/tools/llvm-readobj/ELFDumper.cpp | 4 + llvm/tools/obj2yaml/elf2yaml.cpp | 11 ++- llvm/unittests/Object/ELFObjectFileTest.cpp | 89 ++++++++++--------- llvm/unittests/Object/ELFTypesTest.cpp | 38 ++++---- 15 files changed, 242 insertions(+), 128 deletions(-) create mode 100644 llvm/test/tools/llvm-readobj/ELF/bb-addr-map-feature-warning.test diff --git a/llvm/include/llvm/Object/ELFTypes.h b/llvm/include/llvm/Object/ELFTypes.h index e9a417d3d4fb3..467ab6fd3c1e9 100644 --- a/llvm/include/llvm/Object/ELFTypes.h +++ 
b/llvm/include/llvm/Object/ELFTypes.h @@ -834,30 +834,32 @@ struct BBAddrMap { bool OmitBBEntries : 1; bool CallsiteEndOffsets : 1; bool BBHash : 1; + bool PostLinkCfg : 1; bool hasPGOAnalysis() const { return FuncEntryCount || BBFreq || BrProb; } bool hasPGOAnalysisBBData() const { return BBFreq || BrProb; } // Encodes to minimum bit width representation. - uint8_t encode() const { - return (static_cast(FuncEntryCount) << 0) | - (static_cast(BBFreq) << 1) | - (static_cast(BrProb) << 2) | - (static_cast(MultiBBRange) << 3) | - (static_cast(OmitBBEntries) << 4) | - (static_cast(CallsiteEndOffsets) << 5) | - (static_cast(BBHash) << 6); + uint16_t encode() const { + return (static_cast(FuncEntryCount) << 0) | + (static_cast(BBFreq) << 1) | + (static_cast(BrProb) << 2) | + (static_cast(MultiBBRange) << 3) | + (static_cast(OmitBBEntries) << 4) | + (static_cast(CallsiteEndOffsets) << 5) | + (static_cast(BBHash) << 6) | + (static_cast(PostLinkCfg) << 7); } // Decodes from minimum bit width representation and validates no // unnecessary bits are used. 
- static Expected decode(uint8_t Val) { + static Expected decode(uint16_t Val) { Features Feat{ static_cast(Val & (1 << 0)), static_cast(Val & (1 << 1)), static_cast(Val & (1 << 2)), static_cast(Val & (1 << 3)), static_cast(Val & (1 << 4)), static_cast(Val & (1 << 5)), - static_cast(Val & (1 << 6))}; + static_cast(Val & (1 << 6)), static_cast(Val & (1 << 7))}; if (Feat.encode() != Val) return createStringError( std::error_code(), "invalid encoding for BBAddrMap::Features: 0x%x", @@ -867,10 +869,11 @@ struct BBAddrMap { bool operator==(const Features &Other) const { return std::tie(FuncEntryCount, BBFreq, BrProb, MultiBBRange, - OmitBBEntries, CallsiteEndOffsets, BBHash) == + OmitBBEntries, CallsiteEndOffsets, BBHash, PostLinkCfg) == std::tie(Other.FuncEntryCount, Other.BBFreq, Other.BrProb, Other.MultiBBRange, Other.OmitBBEntries, - Other.CallsiteEndOffsets, Other.BBHash); + Other.CallsiteEndOffsets, Other.BBHash, + Other.PostLinkCfg); } }; @@ -1010,23 +1013,30 @@ struct PGOAnalysisMap { /// probability associated with it. struct SuccessorEntry { /// Unique ID of this successor basic block. - uint32_t ID; + uint32_t ID = 0; /// Branch Probability of the edge to this successor taken from MBPI. BranchProbability Prob; + /// Raw edge count from the post link profile (e.g., from bolt or + /// propeller). + uint64_t PostLinkFreq = 0; bool operator==(const SuccessorEntry &Other) const { - return std::tie(ID, Prob) == std::tie(Other.ID, Other.Prob); + return std::tie(ID, Prob, PostLinkFreq) == + std::tie(Other.ID, Other.Prob, Other.PostLinkFreq); } }; /// Block frequency taken from MBFI BlockFrequency BlockFreq; + /// Raw block count taken from the post link profile (e.g., from bolt or + /// propeller). 
+ uint64_t PostLinkBlockFreq = 0; /// List of successors of the current block llvm::SmallVector Successors; bool operator==(const PGOBBEntry &Other) const { - return std::tie(BlockFreq, Successors) == - std::tie(Other.BlockFreq, Other.Successors); + return std::tie(BlockFreq, PostLinkBlockFreq, Successors) == + std::tie(Other.BlockFreq, PostLinkBlockFreq, Other.Successors); } }; diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h index a7c7c7c436dc2..a8236ca37b5ed 100644 --- a/llvm/include/llvm/ObjectYAML/ELFYAML.h +++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h @@ -166,7 +166,7 @@ struct BBAddrMapEntry { std::optional Hash; }; uint8_t Version; - llvm::yaml::Hex8 Feature; + llvm::yaml::Hex16 Feature; struct BBRangeEntry { llvm::yaml::Hex64 BaseAddress; @@ -203,8 +203,10 @@ struct PGOAnalysisMapEntry { struct SuccessorEntry { uint32_t ID; llvm::yaml::Hex32 BrProb; + std::optional PostLinkBrFreq; }; std::optional BBFreq; + std::optional PostLinkBBFreq; std::optional> Successors; }; std::optional FuncEntryCount; diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 8aa488f0efd8f..f65d88a669f13 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1443,7 +1443,7 @@ getBBAddrMapFeature(const MachineFunction &MF, int NumMBBSectionRanges, MF.hasBBSections() && NumMBBSectionRanges > 1, // Use static_cast to avoid breakage of tests on windows. 
static_cast(BBAddrMapSkipEmitBBEntries), HasCalls, - static_cast(EmitBBHash)}; + static_cast(EmitBBHash), false}; } void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp index 6da97f9b3755d..354c51d66419c 100644 --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -831,17 +831,17 @@ decodeBBAddrMapImpl(const ELFFile &EF, }; uint8_t Version = 0; - uint8_t Feature = 0; + uint16_t Feature = 0; BBAddrMap::Features FeatEnable{}; while (!ULEBSizeErr && !MetadataDecodeErr && Cur && Cur.tell() < Content.size()) { Version = Data.getU8(Cur); if (!Cur) break; - if (Version < 2 || Version > 4) + if (Version < 2 || Version > 5) return createError("unsupported SHT_LLVM_BB_ADDR_MAP version: " + Twine(static_cast(Version))); - Feature = Data.getU8(Cur); // Feature byte + Feature = Version < 5 ? Data.getU8(Cur) : Data.getU16(Cur); if (!Cur) break; auto FeatEnableOrErr = BBAddrMap::Features::decode(Feature); @@ -858,6 +858,11 @@ decodeBBAddrMapImpl(const ELFFile &EF, "basic block hash feature is enabled: version = " + Twine(static_cast(Version)) + " feature = " + Twine(static_cast(Feature))); + if (FeatEnable.PostLinkCfg && Version < 5) + return createError("version should be >= 5 for SHT_LLVM_BB_ADDR_MAP when " + "post link cfg feature is enabled: version = " + + Twine(static_cast(Version)) + + " feature = " + Twine(static_cast(Feature))); uint32_t NumBlocksInBBRange = 0; uint32_t NumBBRanges = 1; typename ELFFile::uintX_t RangeBaseAddress = 0; @@ -946,6 +951,10 @@ decodeBBAddrMapImpl(const ELFFile &EF, uint64_t BBF = FeatEnable.BBFreq ? readULEB128As(Data, Cur, ULEBSizeErr) : 0; + uint32_t PostLinkBBFreq = + FeatEnable.PostLinkCfg + ? 
readULEB128As(Data, Cur, ULEBSizeErr) + : 0; // Branch probability llvm::SmallVector @@ -955,13 +964,20 @@ decodeBBAddrMapImpl(const ELFFile &EF, for (uint64_t I = 0; I < SuccCount; ++I) { uint32_t BBID = readULEB128As(Data, Cur, ULEBSizeErr); uint32_t BrProb = readULEB128As(Data, Cur, ULEBSizeErr); + uint32_t PostLinkFreq = + FeatEnable.PostLinkCfg + ? readULEB128As(Data, Cur, ULEBSizeErr) + : 0; + if (PGOAnalyses) - Successors.push_back({BBID, BranchProbability::getRaw(BrProb)}); + Successors.push_back( + {BBID, BranchProbability::getRaw(BrProb), PostLinkFreq}); } } if (PGOAnalyses) - PGOBBEntries.push_back({BlockFrequency(BBF), std::move(Successors)}); + PGOBBEntries.push_back( + {BlockFrequency(BBF), PostLinkBBFreq, std::move(Successors)}); } if (PGOAnalyses) diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp index 8b75fbe8291f0..8530785d07c93 100644 --- a/llvm/lib/ObjectYAML/ELFEmitter.cpp +++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp @@ -1465,13 +1465,19 @@ void ELFState::writeSectionContent( for (const auto &[Idx, E] : llvm::enumerate(*Section.Entries)) { // Write version and feature values. 
if (Section.Type == llvm::ELF::SHT_LLVM_BB_ADDR_MAP) { - if (E.Version > 4) + if (E.Version > 5) WithColor::warning() << "unsupported SHT_LLVM_BB_ADDR_MAP version: " << static_cast(E.Version) << "; encoding using the most recent version"; CBA.write(E.Version); - CBA.write(E.Feature); - SHeader.sh_size += 2; + SHeader.sh_size += 1; + if (E.Version < 5) { + CBA.write(static_cast(E.Feature)); + SHeader.sh_size += 1; + } else { + CBA.write(E.Feature, ELFT::Endianness); + SHeader.sh_size += 2; + } } auto FeatureOrErr = llvm::object::BBAddrMap::Features::decode(E.Feature); bool MultiBBRangeFeatureEnabled = false; @@ -1556,11 +1562,15 @@ void ELFState::writeSectionContent( for (const auto &PGOBBE : PGOBBEntries) { if (PGOBBE.BBFreq) SHeader.sh_size += CBA.writeULEB128(*PGOBBE.BBFreq); + if (FeatureOrErr->PostLinkCfg || PGOBBE.PostLinkBBFreq.has_value()) + SHeader.sh_size += CBA.writeULEB128(PGOBBE.PostLinkBBFreq.value_or(0)); if (PGOBBE.Successors) { SHeader.sh_size += CBA.writeULEB128(PGOBBE.Successors->size()); - for (const auto &[ID, BrProb] : *PGOBBE.Successors) { + for (const auto &[ID, BrProb, PostLinkBrFreq] : *PGOBBE.Successors) { SHeader.sh_size += CBA.writeULEB128(ID); SHeader.sh_size += CBA.writeULEB128(BrProb); + if (FeatureOrErr->PostLinkCfg || PostLinkBrFreq.has_value()) + SHeader.sh_size += CBA.writeULEB128(PostLinkBrFreq.value_or(0)); } } } diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index f8a84b075b779..e5e5fc20728e8 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -1886,7 +1886,7 @@ void MappingTraits::mapping( IO &IO, ELFYAML::BBAddrMapEntry &E) { assert(IO.getContext() && "The IO context is not initialized"); IO.mapRequired("Version", E.Version); - IO.mapOptional("Feature", E.Feature, Hex8(0)); + IO.mapOptional("Feature", E.Feature, Hex16(0)); IO.mapOptional("NumBBRanges", E.NumBBRanges); IO.mapOptional("BBRanges", E.BBRanges); } @@ -1920,6 +1920,7 @@ void MappingTraits::mapping( 
IO &IO, ELFYAML::PGOAnalysisMapEntry::PGOBBEntry &E) { assert(IO.getContext() && "The IO context is not initialized"); IO.mapOptional("BBFreq", E.BBFreq); + IO.mapOptional("PostLinkBBFreq", E.PostLinkBBFreq); IO.mapOptional("Successors", E.Successors); } @@ -1929,6 +1930,7 @@ void MappingTraits:: assert(IO.getContext() && "The IO context is not initialized"); IO.mapRequired("ID", E.ID); IO.mapRequired("BrProb", E.BrProb); + IO.mapOptional("PostLinkBrFreq", E.PostLinkBrFreq); } void MappingTraits::mapping(IO &IO, diff --git a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-feature-warning.test b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-feature-warning.test new file mode 100644 index 0000000000000..24726c34d3509 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-feature-warning.test @@ -0,0 +1,37 @@ +## This test checks that we output a warning when the specified version is too old to support the given features. + +# RUN: yaml2obj %s -o %t +# RUN: llvm-readobj --bb-addr-map %t 2>&1 | FileCheck -DFILE=%t %s + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + +# CHECK: BBAddrMap [ +# CHECK-NEXT: warning: '[[FILE]]': unable to dump SHT_LLVM_BB_ADDR_MAP section with index 1: version should be >= 3 for SHT_LLVM_BB_ADDR_MAP when callsite offsets feature is enabled: version = 2 feature = 32 +Sections: + - Name: '.llvm_bb_addr_map (1)' + Type: SHT_LLVM_BB_ADDR_MAP + Entries: + - Version: 2 + Feature: 0x20 + +# CHECK: BBAddrMap [ +# CHECK-NEXT: warning: '[[FILE]]': unable to dump SHT_LLVM_BB_ADDR_MAP section with index 2: version should be >= 4 for SHT_LLVM_BB_ADDR_MAP when basic block hash feature is enabled: version = 3 feature = 64 + + - Name: '.llvm_bb_addr_map (2)' + Type: SHT_LLVM_BB_ADDR_MAP + Entries: + - Version: 3 + Feature: 0x40 + +# CHECK: BBAddrMap [ +# CHECK-NEXT: warning: '[[FILE]]': unable to dump SHT_LLVM_BB_ADDR_MAP section with index 3: version should be >= 5 for SHT_LLVM_BB_ADDR_MAP when post link cfg feature 
is enabled: version = 4 feature = 128 + + - Name: '.llvm_bb_addr_map (3)' + Type: SHT_LLVM_BB_ADDR_MAP + Entries: + - Version: 4 + Feature: 0x80 diff --git a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test index 5faafd4d83b2f..8e9d2271b8721 100644 --- a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test +++ b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test @@ -15,7 +15,7 @@ ## Check that a malformed section can be handled. # RUN: yaml2obj %s -DBITS=32 -DSIZE=24 -o %t2.o -# RUN: llvm-readobj %t2.o --bb-addr-map 2>&1 | FileCheck --match-full-lines %s -DOFFSET=0x00000018 -DFILE=%t2.o --check-prefix=TRUNCATED +# RUN: llvm-readobj %t2.o --bb-addr-map 2>&1 | FileCheck --match-full-lines %s -DOFFSET=0x00000015 -DFILE=%t2.o --check-prefix=TRUNCATED ## Check that missing features can be handled. # RUN: yaml2obj %s -DBITS=32 -DFEATURE=0x2 -o %t3.o @@ -59,17 +59,20 @@ # CHECK-NEXT: { # RAW-NEXT: Frequency: 100 # PRETTY-NEXT: Frequency: 1.0 +# CHECK-NEXT: PostLink Frequency: 10 # CHECK-NEXT: Successors [ # CHECK-NEXT: { # CHECK-NEXT: ID: 2 # RAW-NEXT: Probability: 0x80000000 # PRETTY-NEXT: Probability: 0x80000000 / 0x80000000 = 100.00% +# CHECK-NEXT: PostLink Probability: 7 # CHECK-NEXT: } # CHECK-NEXT: ] # CHECK-NEXT: } # CHECK-NEXT: { # RAW-NEXT: Frequency: 100 # PRETTY-NEXT: Frequency: 1.0 +# CHECK-NEXT: PostLink Frequency: 0 # CHECK-NEXT: Successors [ # CHECK-NEXT: ] # CHECK-NEXT: } @@ -172,8 +175,8 @@ Sections: ShSize: [[SIZE=]] Link: .text Entries: - - Version: 2 - Feature: 0x7 + - Version: 5 + Feature: 0x87 BBRanges: - BaseAddress: [[ADDR=0x11111]] BBEntries: @@ -197,10 +200,12 @@ Sections: PGOAnalyses: - FuncEntryCount: 100 PGOBBEntries: - - BBFreq: 100 + - BBFreq: 100 + PostLinkBBFreq: 10 Successors: - - ID: 2 - BrProb: 0x80000000 + - ID: 2 + BrProb: 0x80000000 + PostLinkBrFreq: 7 - BBFreq: 100 Successors: [] - FuncEntryCount: 8888 diff --git 
a/llvm/test/tools/obj2yaml/ELF/bb-addr-map-pgo-analysis-map.yaml b/llvm/test/tools/obj2yaml/ELF/bb-addr-map-pgo-analysis-map.yaml index 299bf463cf4bc..645507af080cb 100644 --- a/llvm/test/tools/obj2yaml/ELF/bb-addr-map-pgo-analysis-map.yaml +++ b/llvm/test/tools/obj2yaml/ELF/bb-addr-map-pgo-analysis-map.yaml @@ -15,7 +15,7 @@ # VALID-NEXT: Type: SHT_LLVM_BB_ADDR_MAP # VALID-NEXT: Entries: # VALID-NEXT: - Version: 2 -# VALID-NEXT: Feature: 0x7 +# VALID-NEXT: Feature: 0x87 ## The 'BaseAddress' field is omitted when it's zero. # VALID-NEXT: BBRanges: # VALID-NEXT: - BBEntries: @@ -43,17 +43,23 @@ # VALID-NEXT: PGOAnalyses: # VALID-NEXT: - FuncEntryCount: 100 # VALID-NEXT: PGOBBEntries: -# VALID-NEXT: - BBFreq: 100 +# VALID-NEXT: - BBFreq: 100 +# VALID-NEXT: PostLinkBBFreq: 10 # VALID-NEXT: Successors: -# VALID-NEXT: - ID: 2 -# VALID-NEXT: BrProb: 0x80000000 -# VALID-NEXT: - ID: 4 -# VALID-NEXT: BrProb: 0x80000000 -# VALID-NEXT: - BBFreq: 50 +# VALID-NEXT: - ID: 2 +# VALID-NEXT: BrProb: 0x80000000 +# VALID-NEXT: PostLinkBrFreq: 7 +# VALID-NEXT: - ID: 4 +# VALID-NEXT: BrProb: 0x80000000 +# VALID-NEXT: PostLinkBrFreq: 0 +# VALID-NEXT: - BBFreq: 50 +# VALID-NEXT: PostLinkBBFreq: 0 # VALID-NEXT: Successors: -# VALID-NEXT: - ID: 4 -# VALID-NEXT: BrProb: 0xFFFFFFFF -# VALID-NEXT: - BBFreq: 100 +# VALID-NEXT: - ID: 4 +# VALID-NEXT: BrProb: 0xFFFFFFFF +# VALID-NEXT: PostLinkBrFreq: 0 +# VALID-NEXT: - BBFreq: 100 +# VALID-NEXT: PostLinkBBFreq: 3 # VALID-NEXT: Successors: [] # VALID-NEXT: PGOBBEntries: # VALID-NEXT: - BBFreq: 20 @@ -69,7 +75,7 @@ Sections: ShSize: [[SIZE=]] Entries: - Version: 2 - Feature: 0x7 + Feature: 0x87 BBRanges: - BaseAddress: 0x0 BBEntries: @@ -97,17 +103,20 @@ Sections: PGOAnalyses: - FuncEntryCount: 100 PGOBBEntries: - - BBFreq: 100 + - BBFreq: 100 + PostLinkBBFreq: 10 Successors: - - ID: 2 - BrProb: 0x80000000 - - ID: 4 - BrProb: 0x80000000 - - BBFreq: 50 + - ID: 2 + BrProb: 0x80000000 + PostLinkBrFreq: 7 + - ID: 4 + BrProb: 0x80000000 + - BBFreq: 50 
Successors: - - ID: 4 - BrProb: 0xFFFFFFFF - - BBFreq: 100 + - ID: 4 + BrProb: 0xFFFFFFFF + - BBFreq: 100 + PostLinkBBFreq: 3 Successors: [] - PGOBBEntries: - BBFreq: 20 diff --git a/llvm/test/tools/yaml2obj/ELF/bb-addr-map-pgo-analysis-map.yaml b/llvm/test/tools/yaml2obj/ELF/bb-addr-map-pgo-analysis-map.yaml index a4cb572e6d993..ac9c8d402b0a6 100644 --- a/llvm/test/tools/yaml2obj/ELF/bb-addr-map-pgo-analysis-map.yaml +++ b/llvm/test/tools/yaml2obj/ELF/bb-addr-map-pgo-analysis-map.yaml @@ -6,8 +6,9 @@ # Case 4: Specify Entries. # CHECK: Name: .llvm_bb_addr_map (1) # CHECK: SectionData ( -# CHECK-NEXT: 0000: 02072000 00000000 0000010B 010203E8 -# CHECK-NEXT: 0010: 07E80702 0CEEDDBB F70E0D91 A2C48801 +# CHECK-NEXT: 0000: 02872000 00000000 0000010B 010203E8 +# CHECK-NEXT: 0010: 07E80764 020CEEDD BBF70E28 0D91A2C4 +# CHECK-NEXT: 0020: 880100 # CHECK-NEXT: ) # Case 7: Not including a field which is enabled in feature doesn't emit value @@ -26,12 +27,12 @@ Sections: ## Test the following cases: ## 1) We can produce an .llvm_bb_addr_map section from a description with -## Entries and PGO Analysis data. +## Entries and PGO Analysis and Post Link data. - Name: '.llvm_bb_addr_map (1)' Type: SHT_LLVM_BB_ADDR_MAP Entries: - Version: 2 - Feature: 0x7 + Feature: 0x87 BBRanges: - BaseAddress: 0x0000000000000020 BBEntries: @@ -42,12 +43,14 @@ Sections: PGOAnalyses: - FuncEntryCount: 1000 PGOBBEntries: - - BBFreq: 1000 + - BBFreq: 1000 + PostLinkBBFreq: 100 Successors: - - ID: 12 - BrProb: 0xeeeeeeee - - ID: 13 - BrProb: 0x11111111 + - ID: 12 + BrProb: 0xeeeeeeee + PostLinkBrFreq: 40 + - ID: 13 + BrProb: 0x11111111 ## 2) According to feature we have FuncEntryCount but none is provided in yaml - Name: '.llvm_bb_addr_map (2)' @@ -66,7 +69,7 @@ Sections: ## Check that yaml2obj generates a warning when we use unsupported feature. 
# RUN: yaml2obj --docnum=2 %s 2>&1 | FileCheck %s --check-prefix=INVALID-FEATURE -# INVALID-FEATURE: warning: invalid encoding for BBAddrMap::Features: 0xf0 +# INVALID-FEATURE: warning: invalid encoding for BBAddrMap::Features: 0x100 --- !ELF FileHeader: @@ -79,4 +82,4 @@ Sections: Entries: - Version: 2 ## Specify unsupported feature - Feature: 0xF0 + Feature: 0x100 diff --git a/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml b/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml index 339e419b39458..05d77d67e4468 100644 --- a/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml +++ b/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml @@ -220,7 +220,7 @@ Sections: ## Check that yaml2obj generates a warning when we use unsupported versions. # RUN: yaml2obj --docnum=3 %s 2>&1 | FileCheck %s --check-prefix=INVALID-VERSION -# INVALID-VERSION: warning: unsupported SHT_LLVM_BB_ADDR_MAP version: 5; encoding using the most recent version +# INVALID-VERSION: warning: unsupported SHT_LLVM_BB_ADDR_MAP version: 6; encoding using the most recent version --- !ELF FileHeader: @@ -232,4 +232,4 @@ Sections: Type: SHT_LLVM_BB_ADDR_MAP Entries: ## Specify unsupported version - - Version: 5 + - Version: 6 diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 423a11fd5b72a..6f09da5a4099f 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -8188,6 +8188,8 @@ void LLVMELFDumper::printBBAddrMaps(bool PrettyPGOAnalysis) { } else { W.printNumber("Frequency", PBBE.BlockFreq.getFrequency()); } + if (PAM.FeatEnable.PostLinkCfg) + W.printNumber("PostLink Frequency", PBBE.PostLinkBlockFreq); } if (PAM.FeatEnable.BrProb) { @@ -8200,6 +8202,8 @@ void LLVMELFDumper::printBBAddrMaps(bool PrettyPGOAnalysis) { } else { W.printHex("Probability", Succ.Prob.getNumerator()); } + if (PAM.FeatEnable.PostLinkCfg) + W.printNumber("PostLink Probability", Succ.PostLinkFreq); } } } diff --git a/llvm/tools/obj2yaml/elf2yaml.cpp 
b/llvm/tools/obj2yaml/elf2yaml.cpp index 68e18f6c79202..4364d15a8b455 100644 --- a/llvm/tools/obj2yaml/elf2yaml.cpp +++ b/llvm/tools/obj2yaml/elf2yaml.cpp @@ -895,7 +895,7 @@ ELFDumper::dumpBBAddrMapSection(const Elf_Shdr *Shdr) { std::vector PGOAnalyses; DataExtractor::Cursor Cur(0); uint8_t Version = 0; - uint8_t Feature = 0; + uint16_t Feature = 0; uint64_t Address = 0; while (Cur && Cur.tell() < Content.size()) { if (Shdr->sh_type == ELF::SHT_LLVM_BB_ADDR_MAP) { @@ -905,7 +905,7 @@ ELFDumper::dumpBBAddrMapSection(const Elf_Shdr *Shdr) { errc::invalid_argument, "invalid SHT_LLVM_BB_ADDR_MAP section version: " + Twine(static_cast(Version))); - Feature = Data.getU8(Cur); + Feature = Version < 5 ? Data.getU8(Cur) : Data.getU16(Cur); } uint64_t NumBBRanges = 1; uint64_t NumBlocks = 0; @@ -972,6 +972,8 @@ ELFDumper::dumpBBAddrMapSection(const Elf_Shdr *Shdr) { auto &PGOBBEntry = PGOBBEntries.emplace_back(); if (FeatureOrErr->BBFreq) { PGOBBEntry.BBFreq = Data.getULEB128(Cur); + if (FeatureOrErr->PostLinkCfg) + PGOBBEntry.PostLinkBBFreq = Data.getULEB128(Cur); if (!Cur) break; } @@ -982,7 +984,10 @@ ELFDumper::dumpBBAddrMapSection(const Elf_Shdr *Shdr) { for (uint64_t SuccIdx = 0; Cur && SuccIdx < SuccCount; ++SuccIdx) { uint32_t ID = Data.getULEB128(Cur); uint32_t BrProb = Data.getULEB128(Cur); - SuccEntries.push_back({ID, BrProb}); + std::optional PostLinkBrFreq; + if (FeatureOrErr->PostLinkCfg) + PostLinkBrFreq = Data.getULEB128(Cur); + SuccEntries.push_back({ID, BrProb, PostLinkBrFreq}); } } } diff --git a/llvm/unittests/Object/ELFObjectFileTest.cpp b/llvm/unittests/Object/ELFObjectFileTest.cpp index d6a3ca53b2154..1e2955ae40a66 100644 --- a/llvm/unittests/Object/ELFObjectFileTest.cpp +++ b/llvm/unittests/Object/ELFObjectFileTest.cpp @@ -531,7 +531,7 @@ TEST(ELFObjectFileTest, InvalidDecodeBBAddrMap) { // Check that we can detect unsupported versions. 
SmallString<128> UnsupportedVersionYamlString(CommonYamlString); UnsupportedVersionYamlString += R"( - - Version: 5 + - Version: 6 BBRanges: - BaseAddress: 0x11111 BBEntries: @@ -543,7 +543,7 @@ TEST(ELFObjectFileTest, InvalidDecodeBBAddrMap) { { SCOPED_TRACE("unsupported version"); DoCheck(UnsupportedVersionYamlString, - "unsupported SHT_LLVM_BB_ADDR_MAP version: 5"); + "unsupported SHT_LLVM_BB_ADDR_MAP version: 6"); } SmallString<128> ZeroBBRangesYamlString(CommonYamlString); @@ -1181,8 +1181,8 @@ TEST(ELFObjectFileTest, ReadPGOAnalysisMap) { Type: SHT_LLVM_BB_ADDR_MAP # Link: 0 (by default, can be overriden) Entries: - - Version: 2 - Feature: 0x7 + - Version: 5 + Feature: 0x87 BBRanges: - BaseAddress: 0x44444 BBEntries: @@ -1205,7 +1205,8 @@ TEST(ELFObjectFileTest, ReadPGOAnalysisMap) { PGOAnalyses: - FuncEntryCount: 1000 PGOBBEntries: - - BBFreq: 1000 + - BBFreq: 1000 + PostLinkBBFreq: 50 Successors: - ID: 1 BrProb: 0x22222222 @@ -1243,8 +1244,8 @@ TEST(ELFObjectFileTest, ReadPGOAnalysisMap) { Type: SHT_LLVM_BB_ADDR_MAP # Link: 0 (by default, can be overriden) Entries: - - Version: 2 - Feature: 0xc + - Version: 5 + Feature: 0x8c BBRanges: - BaseAddress: 0x66666 BBEntries: @@ -1265,8 +1266,9 @@ TEST(ELFObjectFileTest, ReadPGOAnalysisMap) { PGOAnalyses: - PGOBBEntries: - Successors: - - ID: 1 - BrProb: 0x22222222 + - ID: 1 + BrProb: 0x22222222 + PostLinkBrFreq: 7 - ID: 2 BrProb: 0xcccccccc - Successors: @@ -1278,59 +1280,66 @@ TEST(ELFObjectFileTest, ReadPGOAnalysisMap) { BBAddrMap E1 = { {{0x11111, {{1, 0x0, 0x1, {false, true, false, false, false}, {}, 0}}}}}; PGOAnalysisMap P1 = { - 892, {}, {true, false, false, false, false, false, false}}; + 892, {}, {true, false, false, false, false, false, false, false}}; BBAddrMap E2 = { {{0x22222, {{2, 0x0, 0x2, {false, false, true, false, false}, {}, 0}}}}}; PGOAnalysisMap P2 = {{}, - {{BlockFrequency(343), {}}}, - {false, true, false, false, false, false, false}}; + {{BlockFrequency(343), 0, {}}}, + {false, true, false, 
false, false, false, false, false}}; BBAddrMap E3 = { {{0x33333, {{0, 0x0, 0x3, {false, true, true, false, false}, {}, 0}, {1, 0x3, 0x3, {false, false, true, false, false}, {}, 0}, {2, 0x6, 0x3, {false, false, false, false, false}, {}, 0}}}}}; - PGOAnalysisMap P3 = {{}, - {{{}, - {{1, BranchProbability::getRaw(0x1111'1111)}, - {2, BranchProbability::getRaw(0xeeee'eeee)}}}, - {{}, {{2, BranchProbability::getRaw(0xffff'ffff)}}}, - {{}, {}}}, - {false, false, true, false, false, false, false}}; + PGOAnalysisMap P3 = { + {}, + {{{}, + 0, + {{1, BranchProbability::getRaw(0x1111'1111), 0}, + {2, BranchProbability::getRaw(0xeeee'eeee), 0}}}, + {{}, 0, {{2, BranchProbability::getRaw(0xffff'ffff), 0}}}, + {{}, 0, {}}}, + {false, false, true, false, false, false, false, false}}; BBAddrMap E4 = { {{0x44444, {{0, 0x0, 0x4, {false, false, false, true, true}, {}, 0}, {1, 0x4, 0x4, {false, false, false, false, false}, {}, 0}, {2, 0x8, 0x4, {false, false, false, false, false}, {}, 0}, {3, 0xc, 0x4, {false, false, false, false, false}, {}, 0}}}}}; - PGOAnalysisMap P4 = { - 1000, - {{BlockFrequency(1000), - {{1, BranchProbability::getRaw(0x2222'2222)}, - {2, BranchProbability::getRaw(0x3333'3333)}, - {3, BranchProbability::getRaw(0xaaaa'aaaa)}}}, - {BlockFrequency(133), - {{2, BranchProbability::getRaw(0x1111'1111)}, - {3, BranchProbability::getRaw(0xeeee'eeee)}}}, - {BlockFrequency(18), {{3, BranchProbability::getRaw(0xffff'ffff)}}}, - {BlockFrequency(1000), {}}}, - {true, true, true, false, false, false, false}}; + PGOAnalysisMap P4 = {1000, + {{BlockFrequency(1000), + 50, + {{1, BranchProbability::getRaw(0x2222'2222), 0}, + {2, BranchProbability::getRaw(0x3333'3333), 0}, + {3, BranchProbability::getRaw(0xaaaa'aaaa), 0}}}, + {BlockFrequency(133), + 0, + {{2, BranchProbability::getRaw(0x1111'1111), 0}, + {3, BranchProbability::getRaw(0xeeee'eeee), 0}}}, + {BlockFrequency(18), + 0, + {{3, BranchProbability::getRaw(0xffff'ffff), 0}}}, + {BlockFrequency(1000), 0, {}}}, + {true, true, 
true, false, false, false, false, true}}; BBAddrMap E5 = { {{0x55555, {{2, 0x0, 0x2, {false, false, true, false, false}, {}, 0}}}}}; PGOAnalysisMap P5 = { - {}, {}, {false, false, false, false, false, false, false}}; + {}, {}, {false, false, false, false, false, false, false, false}}; BBAddrMap E6 = { {{0x66666, {{0, 0x0, 0x6, {false, true, true, false, false}, {}, 0}, {1, 0x6, 0x6, {false, false, true, false, false}, {}, 0}}}, {0x666661, {{2, 0x0, 0x6, {false, false, false, false, false}, {}, 0}}}}}; - PGOAnalysisMap P6 = {{}, - {{{}, - {{1, BranchProbability::getRaw(0x2222'2222)}, - {2, BranchProbability::getRaw(0xcccc'cccc)}}}, - {{}, {{2, BranchProbability::getRaw(0x8888'8888)}}}, - {{}, {}}}, - {false, false, true, true, false, false, false}}; + PGOAnalysisMap P6 = { + {}, + {{{}, + 0, + {{1, BranchProbability::getRaw(0x2222'2222), 7}, + {2, BranchProbability::getRaw(0xcccc'cccc), 0}}}, + {{}, 0, {{2, BranchProbability::getRaw(0x8888'8888), 0}}}, + {{}, 0, {}}}, + {false, false, true, true, false, false, false, true}}; std::vector Section0BBAddrMaps = {E4, E5, E6}; std::vector Section1BBAddrMaps = {E3}; @@ -1465,7 +1474,7 @@ TEST(ELFObjectFileTest, ReadPGOAnalysisMap) { DoCheckFails( TruncatedYamlString, /*TextSectionIndex=*/std::nullopt, "unable to read SHT_LLVM_BB_ADDR_MAP section with index 6: " - "unexpected end of data at offset 0xa while reading [0x3, 0xb)"); + "unexpected end of data at offset 0xa while reading [0x4, 0xc)"); // Check that we can read the other section's bb-address-maps which are // valid. 
DoCheckSucceeds(TruncatedYamlString, /*TextSectionIndex=*/2, diff --git a/llvm/unittests/Object/ELFTypesTest.cpp b/llvm/unittests/Object/ELFTypesTest.cpp index 1765e15003963..9e99b4a6d7bf3 100644 --- a/llvm/unittests/Object/ELFTypesTest.cpp +++ b/llvm/unittests/Object/ELFTypesTest.cpp @@ -101,22 +101,24 @@ static_assert( "PGOAnalysisMap should use the same type for basic block ID as BBAddrMap"); TEST(ELFTypesTest, BBAddrMapFeaturesEncodingTest) { - const std::array Decoded = { - {{false, false, false, false, false, false, false}, - {true, false, false, false, false, false, false}, - {false, true, false, false, false, false, false}, - {false, false, true, false, false, false, false}, - {false, false, false, true, false, false, false}, - {true, true, false, false, false, false, false}, - {false, true, true, false, false, false, false}, - {false, true, true, true, false, false, false}, - {true, true, true, true, false, false, false}, - {false, false, false, false, true, false, false}, - {false, false, false, false, false, true, false}, - {false, false, false, false, false, false, true}}}; - const std::array Encoded = { + const std::array Decoded = { + {{false, false, false, false, false, false, false, false}, + {true, false, false, false, false, false, false, false}, + {false, true, false, false, false, false, false, false}, + {false, false, true, false, false, false, false, false}, + {false, false, false, true, false, false, false, false}, + {true, true, false, false, false, false, false, false}, + {false, true, true, false, false, false, false, false}, + {false, true, true, true, false, false, false, false}, + {true, true, true, true, false, false, false, false}, + {false, false, false, false, true, false, false, false}, + {false, false, false, false, false, true, false, false}, + {false, false, false, false, false, false, true, false}, + {false, false, false, false, false, false, false, true}, + {false, false, false, false, false, false, true, true}}}; + const 
std::array Encoded = { {0b0000, 0b0001, 0b0010, 0b0100, 0b1000, 0b0011, 0b0110, 0b1110, 0b1111, - 0b1'0000, 0b10'0000, 0b100'0000}}; + 0b1'0000, 0b10'0000, 0b100'0000, 0b1000'0000, 0b1100'0000}}; for (const auto &[Feat, EncodedVal] : llvm::zip(Decoded, Encoded)) EXPECT_EQ(Feat.encode(), EncodedVal); for (const auto &[Feat, EncodedVal] : llvm::zip(Decoded, Encoded)) { @@ -129,9 +131,9 @@ TEST(ELFTypesTest, BBAddrMapFeaturesEncodingTest) { TEST(ELFTypesTest, BBAddrMapFeaturesInvalidEncodingTest) { const std::array Errors = { - "invalid encoding for BBAddrMap::Features: 0x80", - "invalid encoding for BBAddrMap::Features: 0xf0"}; - const std::array Values = {{0b1000'0000, 0b1111'0000}}; + "invalid encoding for BBAddrMap::Features: 0x100", + "invalid encoding for BBAddrMap::Features: 0x1000"}; + const std::array Values = {{0b1'0000'0000, 0b1'0000'0000'0000}}; for (const auto &[Val, Error] : llvm::zip(Values, Errors)) { EXPECT_THAT_ERROR(BBAddrMap::Features::decode(Val).takeError(), FailedWithMessage(Error)); From d1197c842eaa5506ca970c07e9eee9523070911c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 30 Oct 2025 20:15:53 +0000 Subject: [PATCH 253/539] [X86] narrowBitOpRMW - add tests showing failure to fold to BTC/BTR/BTS RMW patterns (#165758) Failure to fold if the store's chain doesn't directly touch the RMW load source (we should be using reachesChainWithoutSideEffects to avoid this). 
Failure to fold if the stored value has additional uses (we could update other uses of the whole stored value to reload after the new narrow store) --- llvm/test/CodeGen/X86/bittest-big-integer.ll | 288 +++++++++++++++++++ 1 file changed, 288 insertions(+) diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index cc3dcf32ac0eb..06e7d4773c58d 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -1676,3 +1676,291 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { %cmp = icmp ne i4096 %test, 0 ret i1 %cmp } + +; Special Cases + +; Multiple uses of the stored value +define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind { +; X86-LABEL: complement_cmpz_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %esi +; X86-NEXT: movl 36(%esp,%esi), %eax +; X86-NEXT: movl 40(%esp,%esi), %edi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl 32(%esp,%esi), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%esp,%esi), %esi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %eax +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: 
xorl 12(%ecx), %esi +; X86-NEXT: xorl 8(%ecx), %edx +; X86-NEXT: xorl 4(%ecx), %eax +; X86-NEXT: xorl (%ecx), %edi +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, (%ecx) +; X86-NEXT: movl %eax, 4(%ecx) +; X86-NEXT: orl %esi, %eax +; X86-NEXT: orl %edx, %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: setne %al +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: complement_cmpz_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %eax +; SSE-NEXT: xorl %edx, %edx +; SSE-NEXT: shldq %cl, %rax, %rdx +; SSE-NEXT: shlq %cl, %rax +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: testb $64, %cl +; SSE-NEXT: cmovneq %rax, %rdx +; SSE-NEXT: cmovneq %rsi, %rax +; SSE-NEXT: xorq 8(%rdi), %rdx +; SSE-NEXT: xorq (%rdi), %rax +; SSE-NEXT: movq %rax, (%rdi) +; SSE-NEXT: movq %rdx, 8(%rdi) +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: setne %al +; SSE-NEXT: retq +; +; AVX2-LABEL: complement_cmpz_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: movl $1, %eax +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: shldq %cl, %rax, %rdx +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shlxq %rcx, %rax, %rax +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %rax, %rdx +; AVX2-NEXT: cmovneq %rsi, %rax +; AVX2-NEXT: xorq 8(%rdi), %rdx +; AVX2-NEXT: xorq (%rdi), %rax +; AVX2-NEXT: movq %rax, (%rdi) +; AVX2-NEXT: movq %rdx, 8(%rdi) +; AVX2-NEXT: orq %rdx, %rax +; AVX2-NEXT: setne %al +; AVX2-NEXT: retq +; +; AVX512-LABEL: complement_cmpz_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movl $1, %edx +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: shldq %cl, %rdx, %rsi +; AVX512-NEXT: shlxq %rcx, %rdx, %rdx +; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %rdx, %rsi +; AVX512-NEXT: cmovneq %rax, %rdx +; AVX512-NEXT: xorq 8(%rdi), %rsi +; AVX512-NEXT: xorq (%rdi), %rdx +; 
AVX512-NEXT: movq %rdx, (%rdi) +; AVX512-NEXT: movq %rsi, 8(%rdi) +; AVX512-NEXT: orq %rsi, %rdx +; AVX512-NEXT: setne %al +; AVX512-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %ld = load i128, ptr %word + %res = xor i128 %ld, %bit + store i128 %res, ptr %word + %cmp = icmp ne i128 %res, 0 + ret i1 %cmp +} + +; Multiple loads in store chain +define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { +; X86-LABEL: reset_multiload_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %edi +; X86-NEXT: movl 36(%esp,%edi), %edx +; X86-NEXT: movl 40(%esp,%edi), %ebx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl 32(%esp,%edi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%esp,%edi), %edi +; X86-NEXT: shldl %cl, %ebx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: andl $96, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl (%ecx,%eax), %eax +; X86-NEXT: andl %ebx, (%ecx) +; X86-NEXT: movl 12(%ebp), %ecx 
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: notl %edx +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: andl %edx, 4(%ebx) +; X86-NEXT: notl %esi +; X86-NEXT: andl %esi, 8(%ebx) +; X86-NEXT: notl %edi +; X86-NEXT: andl %edi, 12(%ebx) +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: jae .LBB22_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: .LBB22_2: +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: reset_multiload_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %esi +; SSE-NEXT: xorl %r8d, %r8d +; SSE-NEXT: shldq %cl, %rsi, %r8 +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: shlq %cl, %rsi +; SSE-NEXT: testb $64, %cl +; SSE-NEXT: cmovneq %rsi, %r8 +; SSE-NEXT: cmovneq %rax, %rsi +; SSE-NEXT: notq %r8 +; SSE-NEXT: notq %rsi +; SSE-NEXT: movl %ecx, %r9d +; SSE-NEXT: andl $96, %r9d +; SSE-NEXT: shrl $3, %r9d +; SSE-NEXT: movl (%rdi,%r9), %r9d +; SSE-NEXT: btl %ecx, %r9d +; SSE-NEXT: jb .LBB22_2 +; SSE-NEXT: # %bb.1: +; SSE-NEXT: movl (%rdx), %eax +; SSE-NEXT: .LBB22_2: +; SSE-NEXT: andq %r8, 8(%rdi) +; SSE-NEXT: andq %rsi, (%rdi) +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: reset_multiload_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: movl $1, %r8d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shldq %cl, %r8, %rsi +; AVX2-NEXT: shlxq %rcx, %r8, %r8 +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %r8, %rsi +; AVX2-NEXT: cmovneq %rax, %r8 +; AVX2-NEXT: notq %rsi +; AVX2-NEXT: notq %r8 +; AVX2-NEXT: movl %ecx, %r9d +; AVX2-NEXT: andl $96, %r9d +; AVX2-NEXT: shrl $3, %r9d +; AVX2-NEXT: movl (%rdi,%r9), %r9d +; AVX2-NEXT: btl %ecx, %r9d +; AVX2-NEXT: jb .LBB22_2 +; AVX2-NEXT: # %bb.1: +; AVX2-NEXT: movl (%rdx), 
%eax +; AVX2-NEXT: .LBB22_2: +; AVX2-NEXT: andq %rsi, 8(%rdi) +; AVX2-NEXT: andq %r8, (%rdi) +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: reset_multiload_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: movl $1, %r8d +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: shldq %cl, %r8, %rsi +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: shlxq %rcx, %r8, %r8 +; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %r8, %rsi +; AVX512-NEXT: cmovneq %rax, %r8 +; AVX512-NEXT: notq %rsi +; AVX512-NEXT: notq %r8 +; AVX512-NEXT: movl %ecx, %r9d +; AVX512-NEXT: andl $96, %r9d +; AVX512-NEXT: shrl $3, %r9d +; AVX512-NEXT: movl (%rdi,%r9), %r9d +; AVX512-NEXT: btl %ecx, %r9d +; AVX512-NEXT: jb .LBB22_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: movl (%rdx), %eax +; AVX512-NEXT: .LBB22_2: +; AVX512-NEXT: andq %rsi, 8(%rdi) +; AVX512-NEXT: andq %r8, (%rdi) +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %mask = xor i128 %bit, -1 + %ld = load i128, ptr %word + %sel = load i32, ptr %p + %test = and i128 %ld, %bit + %res = and i128 %ld, %mask + %cmp = icmp eq i128 %test, 0 + store i128 %res, ptr %word + %ret = select i1 %cmp, i32 %sel, i32 0 + ret i32 %ret +} From 65c26f01dcf7b18fdfaef21740e7b98307f4ac1d Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 30 Oct 2025 13:19:44 -0700 Subject: [PATCH 254/539] [TSan] Make Test work with Internal Shell This test was using subshells to setup LD_LIBRARY_PATH properly. Use a python script and readfile substitutions to preserve the same behavior. 
Reviewers: vitalybuka, fmayer, thurstond Reviewed By: thurstond Pull Request: https://github.com/llvm/llvm-project/pull/165147 --- compiler-rt/test/tsan/ignore_lib0.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/compiler-rt/test/tsan/ignore_lib0.cpp b/compiler-rt/test/tsan/ignore_lib0.cpp index cba58c6177038..1673e8df6c50d 100644 --- a/compiler-rt/test/tsan/ignore_lib0.cpp +++ b/compiler-rt/test/tsan/ignore_lib0.cpp @@ -4,11 +4,13 @@ // RUN: %clangxx_tsan -O1 -fno-builtin %s -DLIB -fPIC -fno-sanitize=thread -shared -o %t-dir/libignore_lib0.so // RUN: %clangxx_tsan -O1 %s -L%t-dir -lignore_lib0 %link_libcxx_tsan -o %t // RUN: echo running w/o suppressions: -// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %deflake %run %t | FileCheck %s --check-prefix=CHECK-NOSUPP +// RUN: echo -n %t-dir > %t.ld_library_path +// RUN: python -c "if 'LD_LIBRARY_PATH' in __import__('os').environ: print(':' + __import__('os').environ['LD_LIBRARY_PATH'], end='')" >> %t.ld_library_path +// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %deflake %run %t | FileCheck %s --check-prefix=CHECK-NOSUPP // RUN: echo running with suppressions: -// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %env_tsan_opts=suppressions='%s.supp' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP +// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %env_tsan_opts=suppressions='%s.supp' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP // RUN: echo running with generic suppression of noninstrumented code: -// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %env_tsan_opts=ignore_noninstrumented_modules=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP +// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %env_tsan_opts=ignore_noninstrumented_modules=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP // Tests that interceptors coming from a library specified in 
called_from_lib // suppression are ignored. From 5f4bd93c1a2c2707a796f16f73fb359020517aab Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Thu, 30 Oct 2025 15:26:13 -0500 Subject: [PATCH 255/539] [libclang/python] Add isFunctionInlined support (#162882) `cindex.py` was missing support for [isFunctionInlined](https://clang.llvm.org/doxygen/group__CINDEX__TYPES.html#ga963097b9aecabf5dce7554dff18b061d), this PR add it. --------- Co-authored-by: Vlad Serebrennikov --- clang/bindings/python/clang/cindex.py | 8 ++++++++ clang/bindings/python/tests/cindex/test_cursor.py | 15 +++++++++++++++ clang/docs/ReleaseNotes.rst | 1 + 3 files changed, 24 insertions(+) diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py index 2786add27f5e8..c48bc9c2eb7de 100644 --- a/clang/bindings/python/clang/cindex.py +++ b/clang/bindings/python/clang/cindex.py @@ -2362,6 +2362,13 @@ def get_bitfield_width(self) -> int: """ return conf.lib.clang_getFieldDeclBitWidth(self) # type: ignore [no-any-return] + @cursor_null_guard + def is_function_inlined(self) -> bool: + """ + Check if the function is inlined. 
+ """ + return bool(conf.lib.clang_Cursor_isFunctionInlined(self)) + @cursor_null_guard def has_attrs(self) -> bool: """ @@ -4310,6 +4317,7 @@ def set_property(self, property, value): ("clang_Cursor_isAnonymous", [Cursor], bool), ("clang_Cursor_isAnonymousRecordDecl", [Cursor], bool), ("clang_Cursor_isBitField", [Cursor], bool), + ("clang_Cursor_isFunctionInlined", [Cursor], c_uint), ("clang_Location_isInSystemHeader", [SourceLocation], bool), ("clang_PrintingPolicy_dispose", [PrintingPolicy]), ("clang_PrintingPolicy_getProperty", [PrintingPolicy, c_int], c_uint), diff --git a/clang/bindings/python/tests/cindex/test_cursor.py b/clang/bindings/python/tests/cindex/test_cursor.py index eb0d1d50601a6..7cb616a7ef148 100644 --- a/clang/bindings/python/tests/cindex/test_cursor.py +++ b/clang/bindings/python/tests/cindex/test_cursor.py @@ -784,6 +784,21 @@ def test_storage_class(self): cursor = get_cursor(tu, "reg") self.assertEqual(cursor.storage_class, StorageClass.REGISTER) + def test_function_inlined(self): + tu = get_tu( + """ +inline void f_inline(void); +void f_noninline(void); +int d_noninline; +""" + ) + cursor = get_cursor(tu, "f_inline") + self.assertEqual(cursor.is_function_inlined(), True) + cursor = get_cursor(tu, "f_noninline") + self.assertEqual(cursor.is_function_inlined(), False) + cursor = get_cursor(tu, "d_noninline") + self.assertEqual(cursor.is_function_inlined(), False) + def test_availability(self): tu = get_tu("class A { A(A const&) = delete; };", lang="cpp") diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index ba737b9efb003..73aaaad8b32e5 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -659,6 +659,7 @@ Sanitizers Python Binding Changes ---------------------- +- Exposed ``clang_Cursor_isFunctionInlined``. - Exposed ``clang_getCursorLanguage`` via ``Cursor.language``. 
- Add all missing ``CursorKind``s, ``TypeKind``s and ``ExceptionSpecificationKind``s from ``Index.h`` From f915e4ab8c1cffc8a52cf99f34142570d7ffddb2 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 30 Oct 2025 20:56:13 +0000 Subject: [PATCH 256/539] Revert "[TSan] Make Test work with Internal Shell" This reverts commit 87616939190b1c0d322f0f3c1d69ba3626d18582. This broke a buildbot. Reverting so I can ensure I'm committing with the proper fix given this didn't reproduce locally on my Linux box. https://lab.llvm.org/buildbot/#/builders/174/builds/26760 --- compiler-rt/test/tsan/ignore_lib0.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/compiler-rt/test/tsan/ignore_lib0.cpp b/compiler-rt/test/tsan/ignore_lib0.cpp index 1673e8df6c50d..cba58c6177038 100644 --- a/compiler-rt/test/tsan/ignore_lib0.cpp +++ b/compiler-rt/test/tsan/ignore_lib0.cpp @@ -4,13 +4,11 @@ // RUN: %clangxx_tsan -O1 -fno-builtin %s -DLIB -fPIC -fno-sanitize=thread -shared -o %t-dir/libignore_lib0.so // RUN: %clangxx_tsan -O1 %s -L%t-dir -lignore_lib0 %link_libcxx_tsan -o %t // RUN: echo running w/o suppressions: -// RUN: echo -n %t-dir > %t.ld_library_path -// RUN: python -c "if 'LD_LIBRARY_PATH' in __import__('os').environ: print(':' + __import__('os').environ['LD_LIBRARY_PATH'], end='')" >> %t.ld_library_path -// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %deflake %run %t | FileCheck %s --check-prefix=CHECK-NOSUPP +// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %deflake %run %t | FileCheck %s --check-prefix=CHECK-NOSUPP // RUN: echo running with suppressions: -// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %env_tsan_opts=suppressions='%s.supp' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP +// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %env_tsan_opts=suppressions='%s.supp' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP // RUN: echo running with generic suppression 
of noninstrumented code: -// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %env_tsan_opts=ignore_noninstrumented_modules=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP +// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %env_tsan_opts=ignore_noninstrumented_modules=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP // Tests that interceptors coming from a library specified in called_from_lib // suppression are ignored. From 016c008a4ff775fa02f1d1b9f475919141c3e874 Mon Sep 17 00:00:00 2001 From: Greg Clayton Date: Thu, 30 Oct 2025 14:08:44 -0700 Subject: [PATCH 257/539] Enable LLDB to load large dSYM files. (#164471) llvm-dsymutil can produce mach-o files where some sections in __DWARF exceed the 4GB barrier and subsequent sections in the dSYM will be inaccessible because the mach-o section_64 structure only has a 32 bit file offset. This patch enables LLDB to load a large dSYM file by figuring out when this happens and properly adjusting the file offset of the LLDB sections. I was unable to add a test as obj2yaml and yaml2obj are broken for mach-o files and they can't convert a yaml file back into a valid mach-o object file. Any suggestions for adding a test would be appreciated. 
--- .../ObjectFile/Mach-O/ObjectFileMachO.cpp | 30 +++++++++++++----- .../MachO/Inputs/section-overflow-binary | Bin 0 -> 344 bytes .../MachO/section-overflow-binary.test | 13 ++++++++ 3 files changed, 35 insertions(+), 8 deletions(-) create mode 100644 lldb/test/Shell/ObjectFile/MachO/Inputs/section-overflow-binary create mode 100644 lldb/test/Shell/ObjectFile/MachO/section-overflow-binary.test diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index 9cdb8467bfc60..c8e520d687f67 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -1674,6 +1674,10 @@ void ObjectFileMachO::ProcessSegmentCommand( uint32_t segment_sect_idx; const lldb::user_id_t first_segment_sectID = context.NextSectionIdx + 1; + // 64 bit mach-o files have sections with 32 bit file offsets. If any section + // data end will exceed UINT32_MAX, then we need to do some bookkeeping to + // ensure we can access this data correctly. + uint64_t section_offset_adjust = 0; const uint32_t num_u32s = load_cmd.cmd == LC_SEGMENT ? 7 : 8; for (segment_sect_idx = 0; segment_sect_idx < load_cmd.nsects; ++segment_sect_idx) { @@ -1697,6 +1701,16 @@ void ObjectFileMachO::ProcessSegmentCommand( // isn't stored in the abstracted Sections. m_mach_sections.push_back(sect64); + // Make sure we can load sections in mach-o files where some sections cross + // a 4GB boundary. llvm::MachO::section_64 have only 32 bit file offsets + // for the file offset of the section contents, so we need to track and + // sections that overflow and adjust the offsets accordingly. 
+ const uint64_t section_file_offset = + (uint64_t)sect64.offset + section_offset_adjust; + const uint64_t end_section_offset = (uint64_t)sect64.offset + sect64.size; + if (end_section_offset >= UINT32_MAX) + section_offset_adjust += end_section_offset & 0xFFFFFFFF00000000ull; + if (add_section) { ConstString section_name( sect64.sectname, strnlen(sect64.sectname, sizeof(sect64.sectname))); @@ -1736,13 +1750,13 @@ void ObjectFileMachO::ProcessSegmentCommand( } // Grow the section size as needed. - if (sect64.offset) { + if (section_file_offset) { const lldb::addr_t segment_min_file_offset = segment->GetFileOffset(); const lldb::addr_t segment_max_file_offset = segment_min_file_offset + segment->GetFileSize(); - const lldb::addr_t section_min_file_offset = sect64.offset; + const lldb::addr_t section_min_file_offset = section_file_offset; const lldb::addr_t section_max_file_offset = section_min_file_offset + sect64.size; const lldb::addr_t new_file_offset = @@ -1769,10 +1783,10 @@ void ObjectFileMachO::ProcessSegmentCommand( // other sections. sect64.addr, // File VM address == addresses as they are // found in the object file - sect64.size, // VM size in bytes of this section - sect64.offset, // Offset to the data for this section in + sect64.size, // VM size in bytes of this section + section_file_offset, // Offset to the data for this section in // the file - sect64.offset ? sect64.size : 0, // Size in bytes of + section_file_offset ? sect64.size : 0, // Size in bytes of // this section as // found in the file sect64.align, @@ -1792,14 +1806,14 @@ void ObjectFileMachO::ProcessSegmentCommand( SectionSP section_sp(new Section( segment_sp, module_sp, this, ++context.NextSectionIdx, section_name, sect_type, sect64.addr - segment_sp->GetFileAddress(), sect64.size, - sect64.offset, sect64.offset == 0 ? 0 : sect64.size, sect64.align, - sect64.flags)); + section_file_offset, section_file_offset == 0 ? 
0 : sect64.size, + sect64.align, sect64.flags)); // Set the section to be encrypted to match the segment bool section_is_encrypted = false; if (!segment_is_encrypted && load_cmd.filesize != 0) section_is_encrypted = context.EncryptedRanges.FindEntryThatContains( - sect64.offset) != nullptr; + section_file_offset) != nullptr; section_sp->SetIsEncrypted(segment_is_encrypted || section_is_encrypted); section_sp->SetPermissions(segment_permissions); diff --git a/lldb/test/Shell/ObjectFile/MachO/Inputs/section-overflow-binary b/lldb/test/Shell/ObjectFile/MachO/Inputs/section-overflow-binary new file mode 100644 index 0000000000000000000000000000000000000000..19dc2f4ac9ffe55c414b1f37817099f07846ad00 GIT binary patch literal 344 zcmX^A>+L@t1_nk3Am9RG5W@mUv@$3FSqu!4Kn&u?$Ge0(2DyR7K*Wdt|Np}{Oh|kO zs31F#W&>hoC_g?vB{iuuJw7ohsVKD!w|NRs^&s;>=E4BTUC01tUS?ieK1dwheF8u+ em>!ThAU-kX Date: Thu, 30 Oct 2025 23:15:19 +0200 Subject: [PATCH 258/539] [NFCI][lldb][test] Add missing includes (#165772) `std::ref()` is provided in `` and with recent libc++ changes it no longer seems to be included transitively. Fix by including explicitly. 
--- lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp | 1 + lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp b/lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp index c5f571fc1d2c4..0d2869c0c577c 100644 --- a/lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp +++ b/lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp @@ -1,4 +1,5 @@ #include +#include #include #include diff --git a/lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp b/lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp index 320f9e938e5bf..1f4e91acc4c03 100644 --- a/lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp +++ b/lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include From 2d990e1156c803169142a2761fe7161de9d5ffb7 Mon Sep 17 00:00:00 2001 From: Raul Tambre Date: Thu, 30 Oct 2025 23:15:36 +0200 Subject: [PATCH 259/539] [NFCI][lldb] Omit redundant member initializer list (#164451) These all have member initializers of the same value so they're redundant. 
Fixes: 47b9aadb3215e914119d0c45827ea58cb7499204 --- lldb/include/lldb/Target/Process.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index 8f5892e16cedf..c1f9785e76f90 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -127,10 +127,7 @@ class ProcessAttachInfo : public ProcessInstanceInfo { public: ProcessAttachInfo() = default; - ProcessAttachInfo(const ProcessLaunchInfo &launch_info) - : m_resume_count(0), m_wait_for_launch(false), m_ignore_existing(true), - m_continue_once_attached(false), m_detach_on_error(true), - m_async(false) { + ProcessAttachInfo(const ProcessLaunchInfo &launch_info) { ProcessInfo::operator=(launch_info); SetProcessPluginName(launch_info.GetProcessPluginName()); SetResumeCount(launch_info.GetResumeCount()); From 4cdc3f34ce7c3bd7da615b181fd559e9dfdd4946 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 30 Oct 2025 14:17:34 -0700 Subject: [PATCH 260/539] [GitHub] Move Copilot instructions for LLVM (#165763) GitHub allows specifying custom instructions for the GitHub Copilot reviewer [1]. Currently, we have a top level file, but GitHub supports having different instructions for different files, which requires creating an `instructions` subdirectory with multiple files and a patch it applies to. This PR moves the top level file into a new `instructions` directory, and make it apply to the `llvm/` subdirectory. I spoke with Mircea at the Dev Meeting and that should match his original intent. 
[1] https://docs.github.com/en/copilot/how-tos/use-copilot-agents/request-a-code-review/use-code-review#customizing-copilots-reviews-with-custom-instructions --- .../llvm.instructions.md} | 4 ++++ 1 file changed, 4 insertions(+) rename .github/{copilot-instructions.md => instructions/llvm.instructions.md} (90%) diff --git a/.github/copilot-instructions.md b/.github/instructions/llvm.instructions.md similarity index 90% rename from .github/copilot-instructions.md rename to .github/instructions/llvm.instructions.md index 03748938700e3..3f1308f51e676 100644 --- a/.github/copilot-instructions.md +++ b/.github/instructions/llvm.instructions.md @@ -1,3 +1,7 @@ +--- +applyTo: llvm/**/* +--- + When performing a code review, pay close attention to code modifying a function's control flow. Could the change result in the corruption of performance profile data? Could the change result in invalid debug information, in particular for From c95fc8d73725b4525c87a9bceea5af6bb4cecdfb Mon Sep 17 00:00:00 2001 From: Raul Tambre Date: Thu, 30 Oct 2025 23:18:32 +0200 Subject: [PATCH 261/539] [NFCI][lldb][test] Fix mismatched C/C++ substitutions (#165773) Most of the cases were where a C++ file was being compiled with the C substitution. There were a few cases of the opposite though. LLDB seems to be the only real culprit in the LLVM codebase for these mismatches. Rest of the LLVM presumably sticks at least language-specific options in the common substitutions making the mistakes immediately apparent. I found these by using Clang frontend configuration files containing language-specific options for both C and C++ (e.g. `-std=c2y` and `-std=c++26`). 
--- .../breakpoint/same_cu_name/Makefile | 8 ++++---- .../Breakpoint/jit-loader_jitlink_elf.test | 4 ++-- .../Breakpoint/jit-loader_rtdyld_elf.test | 4 ++-- .../command-image-dump-ast-colored.test | 2 +- .../Shell/Commands/command-image-dump-ast.test | 2 +- lldb/test/Shell/Commands/list-header.test | 8 ++++---- lldb/test/Shell/Error/cleanup.cpp | 2 +- lldb/test/Shell/Expr/TestExprLanguageNote.test | 2 +- lldb/test/Shell/Expr/TestLambdaExprImport.test | 2 +- lldb/test/Shell/ObjectFile/ELF/elf-memory.test | 2 +- ...verbose_trap-in-stl-callback-user-leaf.test | 2 +- .../verbose_trap-in-stl-callback.test | 2 +- .../verbose_trap-in-stl-max-depth.test | 2 +- .../Recognizer/verbose_trap-in-stl-nested.test | 2 +- .../Shell/Recognizer/verbose_trap-in-stl.test | 2 +- lldb/test/Shell/Recognizer/verbose_trap.test | 8 ++++---- .../Settings/TestChildCountTruncation.test | 2 +- .../Settings/TestChildDepthTruncation.test | 2 +- .../Shell/Settings/TestCxxFrameFormat.test | 2 +- .../Settings/TestCxxFrameFormatEmpty.test | 2 +- .../TestCxxFrameFormatMixedLanguages.test | 12 ++++++------ .../TestCxxFrameFormatPartialFailure.test | 2 +- .../TestFrameFormatFunctionBasename.test | 4 ++-- ...tFrameFormatFunctionFormattedArguments.test | 4 ++-- .../TestFrameFormatFunctionQualifiers.test | 4 ++-- .../TestFrameFormatFunctionReturn.test | 4 ++-- .../Settings/TestFrameFormatFunctionScope.test | 4 ++-- .../TestFrameFormatFunctionSuffix.test | 2 +- ...stFrameFormatFunctionTemplateArguments.test | 4 ++-- .../Settings/TestFrameFunctionInlined.test | 2 +- .../DWARF/split-dwarf-expression-eval-bug.cpp | 8 ++++---- .../DWARF/x86/apple-index-is-used.cpp | 2 +- .../DWARF/x86/debug-names-compressed.cpp | 2 +- .../DWARF/x86/debug-types-debug-names.cpp | 2 +- .../x86/debug-types-dwo-cross-reference.cpp | 4 ++-- .../DWARF/x86/dwarf5-index-is-used.cpp | 2 +- .../DWARF/x86/dwarf5-partial-index.cpp | 4 ++-- .../DWARF/x86/dwo-not-found-warning.cpp | 2 +- .../DWARF/x86/dwp-foreign-type-units.cpp | 4 ++-- 
.../SymbolFile/DWARF/x86/dwp-index-cache.cpp | 8 ++++---- .../DWARF/x86/dwp-separate-debug-file.cpp | 6 +++--- .../DWARF/x86/find-basic-function.cpp | 6 +++--- .../DWARF/x86/find-basic-namespace.cpp | 6 +++--- .../SymbolFile/DWARF/x86/find-basic-type.cpp | 6 +++--- .../DWARF/x86/find-basic-variable.cpp | 6 +++--- .../DWARF/x86/find-function-regex.cpp | 6 +++--- .../DWARF/x86/find-method-local-struct.cpp | 2 +- .../Shell/SymbolFile/DWARF/x86/find-method.cpp | 6 +++--- .../DWARF/x86/find-qualified-variable.cpp | 2 +- .../SymbolFile/DWARF/x86/find-variable-dwo.cpp | 4 ++-- .../DWARF/x86/find-variable-file.cpp | 18 +++++++++--------- .../SymbolFile/DWARF/x86/member-pointers.cpp | 2 +- .../SymbolFile/DWARF/x86/module-ownership.mm | 2 +- .../x86/no_unique_address-with-bitfields.cpp | 2 +- .../DWARF/x86/type-definition-search.cpp | 12 ++++++------ .../DWARF/x86/type-unit-same-basename.cpp | 4 ++-- 56 files changed, 116 insertions(+), 116 deletions(-) diff --git a/lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile b/lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile index b19e7818601eb..b508da24c6828 100644 --- a/lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile +++ b/lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile @@ -4,16 +4,16 @@ LD_EXTRAS := ns1.o ns2.o ns3.o ns4.o a.out: main.o ns1.o ns2.o ns3.o ns4.o ns1.o: common.cpp - $(CC) -gdwarf -c -DNAMESPACE=ns1 -o $@ $< + $(CXX) -gdwarf -c -DNAMESPACE=ns1 -o $@ $< ns2.o: common.cpp - $(CC) -gdwarf -c -DNAMESPACE=ns2 -o $@ $< + $(CXX) -gdwarf -c -DNAMESPACE=ns2 -o $@ $< ns3.o: common.cpp - $(CC) -gdwarf -c -DNAMESPACE=ns3 -o $@ $< + $(CXX) -gdwarf -c -DNAMESPACE=ns3 -o $@ $< ns4.o: common.cpp - $(CC) -gdwarf -c -DNAMESPACE=ns4 -o $@ $< + $(CXX) -gdwarf -c -DNAMESPACE=ns4 -o $@ $< include Makefile.rules diff --git a/lldb/test/Shell/Breakpoint/jit-loader_jitlink_elf.test b/lldb/test/Shell/Breakpoint/jit-loader_jitlink_elf.test index 52c86fa5530bf..9a972f1f1ece7 100644 --- 
a/lldb/test/Shell/Breakpoint/jit-loader_jitlink_elf.test +++ b/lldb/test/Shell/Breakpoint/jit-loader_jitlink_elf.test @@ -3,8 +3,8 @@ # JITLink is the Orc-specific JIT linker implementation. # -# RUN: %clang -g -S -emit-llvm -fPIC --target=x86_64-unknown-unknown-elf \ -# RUN: -o %t.ll %p/Inputs/jitbp.cpp +# RUN: %clangxx -g -S -emit-llvm -fPIC --target=x86_64-unknown-unknown-elf \ +# RUN: -o %t.ll %p/Inputs/jitbp.cpp # RUN: %lldb -b -o 'settings set plugin.jit-loader.gdb.enable on' -o 'b jitbp' \ # RUN: -o 'run --jit-linker=jitlink %t.ll' lli | FileCheck %s diff --git a/lldb/test/Shell/Breakpoint/jit-loader_rtdyld_elf.test b/lldb/test/Shell/Breakpoint/jit-loader_rtdyld_elf.test index b34a5673936f5..ae9402a519494 100644 --- a/lldb/test/Shell/Breakpoint/jit-loader_rtdyld_elf.test +++ b/lldb/test/Shell/Breakpoint/jit-loader_rtdyld_elf.test @@ -3,8 +3,8 @@ # RuntimeDyld can be used to link and load emitted code for both, MCJIT and Orc. # -# RUN: %clang -g -S -emit-llvm --target=x86_64-unknown-unknown-elf \ -# RUN: -o %t.ll %p/Inputs/jitbp.cpp +# RUN: %clangxx -g -S -emit-llvm --target=x86_64-unknown-unknown-elf \ +# RUN: -o %t.ll %p/Inputs/jitbp.cpp # # RUN: %lldb -b -o 'settings set plugin.jit-loader.gdb.enable on' -o 'b jitbp' \ # RUN: -o 'run --jit-kind=mcjit %t.ll' lli | FileCheck %s diff --git a/lldb/test/Shell/Commands/command-image-dump-ast-colored.test b/lldb/test/Shell/Commands/command-image-dump-ast-colored.test index 355ef6bb1d199..7fd70d234fbd4 100644 --- a/lldb/test/Shell/Commands/command-image-dump-ast-colored.test +++ b/lldb/test/Shell/Commands/command-image-dump-ast-colored.test @@ -1,7 +1,7 @@ # Test AST dumping with and without color. 
# RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Commands/command-image-dump-ast.test b/lldb/test/Shell/Commands/command-image-dump-ast.test index 3204022418cb8..86fe1836a2c6c 100644 --- a/lldb/test/Shell/Commands/command-image-dump-ast.test +++ b/lldb/test/Shell/Commands/command-image-dump-ast.test @@ -5,7 +5,7 @@ # UNSUPPORTED: system-windows # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Commands/list-header.test b/lldb/test/Shell/Commands/list-header.test index 53c4b786f1810..27eaa1a4f29c2 100644 --- a/lldb/test/Shell/Commands/list-header.test +++ b/lldb/test/Shell/Commands/list-header.test @@ -3,11 +3,11 @@ # XFAIL: target-windows ## Test that `list header.h:` works correctly when header is available. 
-## +## # RUN: split-file %s %t -# RUN: %clang_host -g %t/main_with_inlined.cc %t/foo.cc -o %t/main_with_inlined.out -# RUN: %clang_host -g %t/main_no_inlined.cc %t/foo.cc -o %t/main_no_inlined.out +# RUN: %clangxx_host -g %t/main_with_inlined.cc %t/foo.cc -o %t/main_with_inlined.out +# RUN: %clangxx_host -g %t/main_no_inlined.cc %t/foo.cc -o %t/main_no_inlined.out # RUN: %lldb %t/main_with_inlined.out -o "list foo.h:2" -o "exit" 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-INLINED @@ -19,7 +19,7 @@ # CHECK-INLINED: 2 extern int* ptr; # CHECK-INLINED: 3 void f(int x); -# CHECK-INLINED: 4 +# CHECK-INLINED: 4 # CHECK-INLINED: 5 inline void g(int x) { # CHECK-INLINED: 6 *ptr = x; // should crash here # CHECK-INLINED: 7 } diff --git a/lldb/test/Shell/Error/cleanup.cpp b/lldb/test/Shell/Error/cleanup.cpp index 6abc62dc4af99..1e83478a83337 100644 --- a/lldb/test/Shell/Error/cleanup.cpp +++ b/lldb/test/Shell/Error/cleanup.cpp @@ -1,5 +1,5 @@ // Test CommandObject is cleaned up even after commands fail due to not taking any argument. 
-// RUN: %clang_host -g %s -o %t +// RUN: %clangxx_host -g %s -o %t // RUN: %lldb -f %t -o "settings set interpreter.stop-command-source-on-error false" -s \ // RUN: %S/Inputs/cleanup.lldbinit int main() { return 0; } diff --git a/lldb/test/Shell/Expr/TestExprLanguageNote.test b/lldb/test/Shell/Expr/TestExprLanguageNote.test index e8e4e1399e451..e7da30816319e 100644 --- a/lldb/test/Shell/Expr/TestExprLanguageNote.test +++ b/lldb/test/Shell/Expr/TestExprLanguageNote.test @@ -1,5 +1,5 @@ # RUN: split-file %s %t -# RUN: %clang_host -g %t/main.cpp -o %t.out +# RUN: %clangxx_host -g %t/main.cpp -o %t.out # # RUN: %lldb -x -b -o "settings set interpreter.stop-command-source-on-error false" \ # RUN: -s %t/no-target.input 2>&1 | FileCheck %s --check-prefix=CHECK-NO-TARGET diff --git a/lldb/test/Shell/Expr/TestLambdaExprImport.test b/lldb/test/Shell/Expr/TestLambdaExprImport.test index c57ce06453fe2..b49a38036e566 100644 --- a/lldb/test/Shell/Expr/TestLambdaExprImport.test +++ b/lldb/test/Shell/Expr/TestLambdaExprImport.test @@ -3,7 +3,7 @@ # uses always). # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -o "settings set interpreter.stop-command-source-on-error false" \ # RUN: -x -b -s %t/commands.input %t.out 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/ObjectFile/ELF/elf-memory.test b/lldb/test/Shell/ObjectFile/ELF/elf-memory.test index 75a68edd2d349..170dc7682aab0 100644 --- a/lldb/test/Shell/ObjectFile/ELF/elf-memory.test +++ b/lldb/test/Shell/ObjectFile/ELF/elf-memory.test @@ -11,7 +11,7 @@ // - verify that "image dump objfile" will dump the dynamic section of the // memory elf file and find the .dynamic string table. 
-// RUN: %clang_host %p/Inputs/memory-elf.cpp -g -O0 -o %t +// RUN: %clangxx_host %p/Inputs/memory-elf.cpp -g -O0 -o %t // RUN: %lldb %t -b \ // RUN: -o "b main" \ diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test index 5a84c163453cc..32b4095d9addd 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test @@ -12,7 +12,7 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback-user-leaf.cpp -o %t.out +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback-user-leaf.cpp -o %t.out # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK run diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test index b15bcb3a384f9..c8c433c0a819a 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test @@ -11,7 +11,7 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback.cpp -o %t.out +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback.cpp -o %t.out # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK run diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test index 2ea6594643c9c..d0789ac7dc67a 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test @@ -4,7 +4,7 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-max-depth.cpp -o %t.out +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl-max-depth.cpp -o %t.out # RUN: %lldb -b -s %s %t.out | FileCheck %s 
--check-prefixes=CHECK run diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test index 81a492d1ed579..68a4ea612c0d1 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test @@ -3,7 +3,7 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-nested.cpp -o %t.out +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl-nested.cpp -o %t.out # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK run diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test index dd08290174e3a..bd4851146b40d 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test @@ -3,7 +3,7 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl.cpp -o %t.out +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl.cpp -o %t.out # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK run diff --git a/lldb/test/Shell/Recognizer/verbose_trap.test b/lldb/test/Shell/Recognizer/verbose_trap.test index dafab7bdea688..ab0df082cc032 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap.test +++ b/lldb/test/Shell/Recognizer/verbose_trap.test @@ -1,15 +1,15 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\" +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\" # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-BOTH # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\" +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap.cpp -o 
%t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\" # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-MESSAGE_ONLY # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\" +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\" # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-CATEGORY_ONLY # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\" +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\" # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-NONE run diff --git a/lldb/test/Shell/Settings/TestChildCountTruncation.test b/lldb/test/Shell/Settings/TestChildCountTruncation.test index da6436cb5ca20..b66d0df983069 100644 --- a/lldb/test/Shell/Settings/TestChildCountTruncation.test +++ b/lldb/test/Shell/Settings/TestChildCountTruncation.test @@ -2,7 +2,7 @@ # when target.max-children-count wasn't explicitly set. # RUN: split-file %s %t -# RUN: %clang_host -g %t/main.cpp -o %t.out +# RUN: %clangxx_host -g %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s --check-prefix=DWIM # diff --git a/lldb/test/Shell/Settings/TestChildDepthTruncation.test b/lldb/test/Shell/Settings/TestChildDepthTruncation.test index 12f5661600ae7..7e4fbbef9e458 100644 --- a/lldb/test/Shell/Settings/TestChildDepthTruncation.test +++ b/lldb/test/Shell/Settings/TestChildDepthTruncation.test @@ -2,7 +2,7 @@ # when target.max-children-depth wasn't explicitly set. 
# RUN: split-file %s %t -# RUN: %clang_host -g %t/main.cpp -o %t.out +# RUN: %clangxx_host -g %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s --check-prefix=DWIM # diff --git a/lldb/test/Shell/Settings/TestCxxFrameFormat.test b/lldb/test/Shell/Settings/TestCxxFrameFormat.test index d70db582e9750..3ee92d53492fb 100644 --- a/lldb/test/Shell/Settings/TestCxxFrameFormat.test +++ b/lldb/test/Shell/Settings/TestCxxFrameFormat.test @@ -3,7 +3,7 @@ # Test the plugin.cplusplus.display.function-name-format setting. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestCxxFrameFormatEmpty.test b/lldb/test/Shell/Settings/TestCxxFrameFormatEmpty.test index 0a6d2723ded34..a0550b733d781 100644 --- a/lldb/test/Shell/Settings/TestCxxFrameFormatEmpty.test +++ b/lldb/test/Shell/Settings/TestCxxFrameFormatEmpty.test @@ -5,7 +5,7 @@ # ${function.name-with-args}. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestCxxFrameFormatMixedLanguages.test b/lldb/test/Shell/Settings/TestCxxFrameFormatMixedLanguages.test index bafd36f5ae177..679d6e4d5abe4 100644 --- a/lldb/test/Shell/Settings/TestCxxFrameFormatMixedLanguages.test +++ b/lldb/test/Shell/Settings/TestCxxFrameFormatMixedLanguages.test @@ -4,9 +4,9 @@ # when interoperating multiple languages. 
# RUN: split-file %s %t -# RUN: %clangxx_host -x c -c -g %t/lib.c -o %t.clib.o +# RUN: %clang_host -x c -c -g %t/lib.c -o %t.clib.o # RUN: %clangxx_host -c -g %t/lib.cpp -o %t.cxxlib.o -# RUN: %clangxx_host %t/main.m %t.cxxlib.o %t.clib.o -o %t.out +# RUN: %clang_host %t/main.m %t.cxxlib.o %t.clib.o -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 | FileCheck %s #--- lib.c @@ -47,7 +47,7 @@ break set -n method run bt -# CHECK: custom-frame 'this affects C++ only' -# CHECK: custom-frame 'this affects C++ only' -# CHECK: custom-frame 'func' -# CHECK: custom-frame 'main' +# CHECK: custom-frame 'this affects C++ only' +# CHECK: custom-frame 'this affects C++ only' +# CHECK: custom-frame 'func' +# CHECK: custom-frame 'main' diff --git a/lldb/test/Shell/Settings/TestCxxFrameFormatPartialFailure.test b/lldb/test/Shell/Settings/TestCxxFrameFormatPartialFailure.test index e914ff7a010dd..f279f07afcda2 100644 --- a/lldb/test/Shell/Settings/TestCxxFrameFormatPartialFailure.test +++ b/lldb/test/Shell/Settings/TestCxxFrameFormatPartialFailure.test @@ -5,7 +5,7 @@ # were successful. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionBasename.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionBasename.test index c0008e50927b1..56ec09e2f951d 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionBasename.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionBasename.test @@ -3,11 +3,11 @@ # Test the ${function.basename} frame-format variable. 
# RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionFormattedArguments.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionFormattedArguments.test index 04f51701a2a2d..f20fc8ca77aeb 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionFormattedArguments.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionFormattedArguments.test @@ -3,11 +3,11 @@ # Test the ${function.formatted-arguments} frame-format variable. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-NODEBUG diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionQualifiers.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionQualifiers.test index b1dfe834c1deb..d05e60b0e8d10 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionQualifiers.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionQualifiers.test @@ -3,11 +3,11 @@ # Test the ${function.qualifiers} frame-format variable. 
# RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionReturn.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionReturn.test index f913162a1aa66..bb78258aba753 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionReturn.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionReturn.test @@ -4,11 +4,11 @@ # frame-format variables. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionScope.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionScope.test index a28c16f95a9e2..f4a17661c3602 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionScope.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionScope.test @@ -3,11 +3,11 @@ # Test the ${function.scope} frame-format variable. 
# RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionSuffix.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionSuffix.test index 4609a0412a0ab..5883c722f3336 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionSuffix.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionSuffix.test @@ -3,7 +3,7 @@ # Test the ${function.suffix} frame-format variable. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionTemplateArguments.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionTemplateArguments.test index ac8a32820c888..a09a9610f48db 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionTemplateArguments.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionTemplateArguments.test @@ -3,11 +3,11 @@ # Test the ${function.template-arguments} frame-format variable. 
# RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFunctionInlined.test b/lldb/test/Shell/Settings/TestFrameFunctionInlined.test index 5db34b4160850..1bb7ab486bcf5 100644 --- a/lldb/test/Shell/Settings/TestFrameFunctionInlined.test +++ b/lldb/test/Shell/Settings/TestFrameFunctionInlined.test @@ -6,7 +6,7 @@ # REQUIRES: (system-windows && lld) || !system-windows # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out %if system-windows %{-fuse-ld=lld%} +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out %if system-windows %{-fuse-ld=lld%} # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/split-dwarf-expression-eval-bug.cpp b/lldb/test/Shell/SymbolFile/DWARF/split-dwarf-expression-eval-bug.cpp index 4a8004ddd287f..b02eea6bbc4f8 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/split-dwarf-expression-eval-bug.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/split-dwarf-expression-eval-bug.cpp @@ -7,10 +7,10 @@ // UNSUPPORTED: system-darwin, system-windows -// RUN: %clang_host -c -gsplit-dwarf -g %s -o %t1.o -DONE -// RUN: %clang_host -c -gsplit-dwarf -g %s -o %t2.o -DTWO -// RUN: %clang_host -c -gsplit-dwarf -g %s -o %t3.o -DTHREE -// RUN: %clang_host %t1.o %t2.o %t3.o -o %t +// RUN: %clangxx_host -c -gsplit-dwarf -g %s -o %t1.o -DONE +// RUN: %clangxx_host -c -gsplit-dwarf -g %s -o %t2.o -DTWO +// RUN: %clangxx_host -c -gsplit-dwarf -g %s -o %t3.o -DTHREE +// RUN: %clangxx_host %t1.o %t2.o %t3.o -o %t // RUN: %lldb %t -o "br set -n foo" -o run -o "expression 
bool_in_first_cu" -o exit \ // RUN: | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/apple-index-is-used.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/apple-index-is-used.cpp index 5bcb2cbcbbe29..8ef2e56ba3d4d 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/apple-index-is-used.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/apple-index-is-used.cpp @@ -1,5 +1,5 @@ // Test that we use the apple indexes. -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx -gdwarf-4 +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx -gdwarf-4 // RUN: lldb-test symbols %t | FileCheck %s // CHECK: .apple_names index present diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-compressed.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-compressed.cpp index 4dcbb47152203..53c3d3daa40c5 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-compressed.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-compressed.cpp @@ -3,7 +3,7 @@ // REQUIRES: lld, zlib -// RUN: %clang -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %s +// RUN: %clangxx -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %s // RUN: ld.lld %t.o -o %t --compress-debug-sections=zlib // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --find=variable --name=foo %t | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-debug-names.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-debug-names.cpp index 2b7a928c89a8f..acc34dd41688b 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-debug-names.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-debug-names.cpp @@ -6,7 +6,7 @@ // REQUIRES: lld -// RUN: %clang %s -target x86_64-pc-linux -gdwarf-5 -fdebug-types-section \ +// RUN: %clangxx %s -target x86_64-pc-linux -gdwarf-5 -fdebug-types-section \ // RUN: -gpubnames -fno-limit-debug-info -c -o %t.o // RUN: ld.lld %t.o -o %t // RUN: %lldb %t -o "type lookup stype" -b | 
FileCheck %s --check-prefix=BASE diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-dwo-cross-reference.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-dwo-cross-reference.cpp index 0e29cb3e7f16e..bc863fb64a9cc 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-dwo-cross-reference.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-dwo-cross-reference.cpp @@ -3,9 +3,9 @@ // REQUIRES: lld -// RUN: %clang %s -target x86_64-pc-linux -fno-standalone-debug -g \ +// RUN: %clangxx %s -target x86_64-pc-linux -fno-standalone-debug -g \ // RUN: -fdebug-types-section -gsplit-dwarf -c -o %t1.o -DONE -// RUN: %clang %s -target x86_64-pc-linux -fno-standalone-debug -g \ +// RUN: %clangxx %s -target x86_64-pc-linux -fno-standalone-debug -g \ // RUN: -fdebug-types-section -gsplit-dwarf -c -o %t2.o -DTWO // RUN: llvm-dwarfdump %t1.dwo -debug-types -debug-info | FileCheck --check-prefix=ONEUNIT %s // RUN: llvm-dwarfdump %t2.dwo -debug-types -debug-info | FileCheck --check-prefix=ONEUNIT %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-index-is-used.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-index-is-used.cpp index d6ac23716f6ce..2fdb1d8d7ca7d 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-index-is-used.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-index-is-used.cpp @@ -2,7 +2,7 @@ // REQUIRES: lld -// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols %t | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-partial-index.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-partial-index.cpp index ab84415f61b27..a739dfde48aaf 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-partial-index.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-partial-index.cpp @@ -3,9 +3,9 @@ // REQUIRES: lld -// RUN: %clang %s -c -o %t-1.o 
--target=x86_64-pc-linux -DONE -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t-1.o --target=x86_64-pc-linux -DONE -gdwarf-5 -gpubnames // RUN: llvm-readobj --sections %t-1.o | FileCheck %s --check-prefix NAMES -// RUN: %clang %s -c -o %t-2.o --target=x86_64-pc-linux -DTWO -gdwarf-5 -gno-pubnames +// RUN: %clangxx %s -c -o %t-2.o --target=x86_64-pc-linux -DTWO -gdwarf-5 -gno-pubnames // RUN: ld.lld %t-1.o %t-2.o -o %t // RUN: lldb-test symbols --find=variable --name=foo %t | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwo-not-found-warning.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwo-not-found-warning.cpp index 929e11f80e34e..36eb299f06630 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwo-not-found-warning.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwo-not-found-warning.cpp @@ -1,4 +1,4 @@ -// RUN: %clang --target=x86_64-pc-linux -g -gsplit-dwarf -c %s -o %t.o +// RUN: %clangxx --target=x86_64-pc-linux -g -gsplit-dwarf -c %s -o %t.o // RUN: rm %t.dwo // RUN: %lldb %t.o -o "br set -n main" -o exit 2>&1 | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-foreign-type-units.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-foreign-type-units.cpp index 9251930d7d13c..7fbc4f98e7976 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-foreign-type-units.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-foreign-type-units.cpp @@ -16,9 +16,9 @@ // type unit comes from by looking at the DW_AT_dwo_name attribute in the // DW_TAG_type_unit. 
-// RUN: %clang -target x86_64-pc-linux -gdwarf-5 -gsplit-dwarf \ +// RUN: %clangxx -target x86_64-pc-linux -gdwarf-5 -gsplit-dwarf \ // RUN: -fdebug-types-section -gpubnames -c %s -o %t.main.o -// RUN: %clang -target x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -DVARIANT \ +// RUN: %clangxx -target x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -DVARIANT \ // RUN: -fdebug-types-section -gpubnames -c %s -o %t.foo.o // RUN: ld.lld %t.main.o %t.foo.o -o %t diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp index 3e97c3fb1ebc2..3edcd8f180a15 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp @@ -14,8 +14,8 @@ // complete DWARF index. // Test that if we don't have .debug_names, that we save a full DWARF index. -// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o -// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o // RUN: ld.lld %t.main.o %t.foo.o -o %t.nonames // RUN: llvm-dwp %t.main.dwo %t.foo.dwo -o %t.nonames.dwp // RUN: rm %t.main.dwo %t.foo.dwo @@ -35,8 +35,8 @@ // Test that if we have one .o file with .debug_names and one without, that we // save a partial DWARF index. 
-// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o -gpubnames -// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o -gpubnames +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o // RUN: ld.lld %t.main.o %t.foo.o -o %t.somenames // RUN: llvm-dwp %t.main.dwo %t.foo.dwo -o %t.somenames.dwp // RUN: rm %t.main.dwo %t.foo.dwo diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp index 888e96bbb10af..f625fda2087db 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp @@ -1,7 +1,7 @@ // REQUIRES: lld, python // Now test with DWARF5 -// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.dwarf5.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.dwarf5.o // RUN: ld.lld %t.dwarf5.o -o %t.dwarf5 // RUN: llvm-dwp %t.dwarf5.dwo -o %t.dwarf5.dwp // RUN: rm %t.dwarf5.dwo @@ -64,7 +64,7 @@ // RUN: -b %t.dwarf5.debug 2>&1 | FileCheck %s -check-prefix=NODWP // Now test with DWARF4 -// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-4 -c %s -o %t.dwarf4.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-4 -c %s -o %t.dwarf4.o // RUN: ld.lld %t.dwarf4.o -o %t.dwarf4 // RUN: llvm-dwp %t.dwarf4.dwo -o %t.dwarf4.dwp // RUN: rm %t.dwarf4.dwo @@ -128,7 +128,7 @@ // Test if we have a GNU build ID in our main executable and in our debug file, // and we have a .dwp file that doesn't, that we can still load our .dwp file. 
-// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.o // RUN: ld.lld %t.o --build-id=md5 -o %t // RUN: llvm-dwp %t.dwo -o %t.dwp // RUN: rm %t.dwo diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-function.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-function.cpp index c42f9fe0b8b52..a00b2bd9506ef 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-function.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-function.cpp @@ -1,6 +1,6 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=foo --find=function --function-flags=base %t | \ // RUN: FileCheck --check-prefix=BASE %s @@ -19,7 +19,7 @@ // RUN: lldb-test symbols --name=not_there --find=function %t | \ // RUN: FileCheck --check-prefix=EMPTY %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=function --function-flags=base %t | \ // RUN: FileCheck --check-prefix=BASE %s // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \ @@ -39,7 +39,7 @@ // RUN: lldb-test symbols --name=not_there --find=function %t | \ // RUN: FileCheck --check-prefix=EMPTY %s -// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=foo --find=function --function-flags=base %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-namespace.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-namespace.cpp index 13d50af7ef601..14c73c3e82efb 100644 --- 
a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-namespace.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-namespace.cpp @@ -1,6 +1,6 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=foo --find=namespace %t | \ // RUN: FileCheck --check-prefix=FOO %s @@ -9,7 +9,7 @@ // RUN: lldb-test symbols --name=not_there --find=namespace %t | \ // RUN: FileCheck --check-prefix=EMPTY %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=namespace %t | \ // RUN: FileCheck --check-prefix=FOO %s // RUN: lldb-test symbols --name=foo --find=namespace --context=context %t | \ @@ -17,7 +17,7 @@ // RUN: lldb-test symbols --name=not_there --find=namespace %t | \ // RUN: FileCheck --check-prefix=EMPTY %s -// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=foo --find=namespace %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp index af49206608723..315fab344dfee 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp @@ -1,6 +1,6 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=foo --find=type %t | \ // RUN: FileCheck --check-prefix=NAME %s @@ -11,7 +11,7 @@ // RUN: lldb-test symbols --name=not_there --find=type %t | \ // RUN: 
FileCheck --check-prefix=EMPTY %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=type %t | \ // RUN: FileCheck --check-prefix=NAME %s // RUN: lldb-test symbols --name=::foo --find=type %t | \ @@ -21,7 +21,7 @@ // RUN: lldb-test symbols --name=not_there --find=type %t | \ // RUN: FileCheck --check-prefix=EMPTY %s -// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=foo --find=type %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-variable.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-variable.cpp index e46fa14489d32..b6e2252c28402 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-variable.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-variable.cpp @@ -1,6 +1,6 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=foo --find=variable --context=context %t | \ // RUN: FileCheck --check-prefix=CONTEXT %s @@ -11,7 +11,7 @@ // RUN: lldb-test symbols --name=not_there --find=variable %t | \ // RUN: FileCheck --check-prefix=EMPTY %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=variable --context=context %t | \ // RUN: FileCheck --check-prefix=CONTEXT %s // RUN: lldb-test symbols --name=foo --find=variable %t | \ @@ -21,7 +21,7 @@ // RUN: lldb-test symbols --name=not_there --find=variable %t | \ // RUN: FileCheck --check-prefix=EMPTY %s // -// RUN: %clang %s -g -c -o %t.o 
--target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=foo --find=variable --context=context %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-function-regex.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-function-regex.cpp index be267596fb372..5c7ad844f6603 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-function-regex.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-function-regex.cpp @@ -1,13 +1,13 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=f.o --regex --find=function %t | FileCheck %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=f.o --regex --find=function %t | FileCheck %s -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=f.o --regex --find=function %t | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-method-local-struct.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-method-local-struct.cpp index 3da4a4a23f8a8..46553a83081e4 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-method-local-struct.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-method-local-struct.cpp @@ -1,4 +1,4 @@ -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \ // 
RUN: FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-method.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-method.cpp index 9f8b3df2f31a7..26faf8907b4a9 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-method.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-method.cpp @@ -1,15 +1,15 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \ // RUN: FileCheck %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \ // RUN: FileCheck %s -// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-qualified-variable.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-qualified-variable.cpp index 1ad3e7fbadf51..e3f9ce308b75c 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-qualified-variable.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-qualified-variable.cpp @@ -1,4 +1,4 @@ -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=A::foo --find=variable %t | FileCheck %s // CHECK: Found 1 variables: diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-dwo.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-dwo.cpp index b5d35e4f7883f..250b34377acda 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-dwo.cpp +++ 
b/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-dwo.cpp @@ -1,9 +1,9 @@ // REQUIRES: lld -// RUN: %clang %s -gdwarf-5 -gpubnames -gsplit-dwarf -c -emit-llvm -o - --target=x86_64-pc-linux -DONE | \ +// RUN: %clangxx %s -gdwarf-5 -gpubnames -gsplit-dwarf -c -emit-llvm -o - --target=x86_64-pc-linux -DONE | \ // RUN: llc -filetype=obj -split-dwarf-file=%t-1.dwo -o %t-1.o // RUN: llvm-objcopy --split-dwo=%t-1.dwo %t-1.o -// RUN: %clang %s -gdwarf-5 -gpubnames -gsplit-dwarf -c -emit-llvm -o - --target=x86_64-pc-linux -DTWO | \ +// RUN: %clangxx %s -gdwarf-5 -gpubnames -gsplit-dwarf -c -emit-llvm -o - --target=x86_64-pc-linux -DTWO | \ // RUN: llc -filetype=obj -split-dwarf-file=%t-2.dwo -o %t-2.o // RUN: llvm-objcopy --split-dwo=%t-2.dwo %t-2.o // RUN: ld.lld %t-1.o %t-2.o -o %t diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-file.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-file.cpp index f1a9a4eb12d07..3a8cf89ac367b 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-file.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-file.cpp @@ -1,7 +1,7 @@ // REQUIRES: lld -// RUN: %clang -g -c -o %t-1.o --target=x86_64-pc-linux -gno-pubnames %s -// RUN: %clang -g -c -o %t-2.o --target=x86_64-pc-linux -gno-pubnames %S/Inputs/find-variable-file-2.cpp +// RUN: %clangxx -g -c -o %t-1.o --target=x86_64-pc-linux -gno-pubnames %s +// RUN: %clangxx -g -c -o %t-2.o --target=x86_64-pc-linux -gno-pubnames %S/Inputs/find-variable-file-2.cpp // RUN: ld.lld %t-1.o %t-2.o -o %t // RUN: lldb-test symbols --file=find-variable-file.cpp --find=variable %t | \ // RUN: FileCheck --check-prefix=ONE %s @@ -10,16 +10,16 @@ // Run the same test with split-dwarf. This is interesting because the two // split compile units will have the same offset (0). 
-// RUN: %clang -g -c -o %t-1.o --target=x86_64-pc-linux -gsplit-dwarf %s -// RUN: %clang -g -c -o %t-2.o --target=x86_64-pc-linux -gsplit-dwarf %S/Inputs/find-variable-file-2.cpp +// RUN: %clangxx -g -c -o %t-1.o --target=x86_64-pc-linux -gsplit-dwarf %s +// RUN: %clangxx -g -c -o %t-2.o --target=x86_64-pc-linux -gsplit-dwarf %S/Inputs/find-variable-file-2.cpp // RUN: ld.lld %t-1.o %t-2.o -o %t // RUN: lldb-test symbols --file=find-variable-file.cpp --find=variable %t | \ // RUN: FileCheck --check-prefix=ONE %s // RUN: lldb-test symbols --file=find-variable-file-2.cpp --find=variable %t | \ // RUN: FileCheck --check-prefix=TWO %s -// RUN: %clang -c -o %t-1.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %s -// RUN: %clang -c -o %t-2.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %S/Inputs/find-variable-file-2.cpp +// RUN: %clangxx -c -o %t-1.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %s +// RUN: %clangxx -c -o %t-2.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %S/Inputs/find-variable-file-2.cpp // RUN: ld.lld %t-1.o %t-2.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --file=find-variable-file.cpp --find=variable %t | \ @@ -29,9 +29,9 @@ // Run the same test with split dwarf and pubnames to check whether we can find // the compile unit using the name index if it is split. 
-// RUN: %clang -c -o %t-1.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %s -// RUN: %clang -c -o %t-2.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %S/Inputs/find-variable-file-2.cpp -// RUN: %clang -c -o %t-3.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %S/Inputs/find-variable-file-3.cpp +// RUN: %clangxx -c -o %t-1.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %s +// RUN: %clangxx -c -o %t-2.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %S/Inputs/find-variable-file-2.cpp +// RUN: %clangxx -c -o %t-3.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %S/Inputs/find-variable-file-3.cpp // RUN: ld.lld %t-1.o %t-2.o %t-3.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --file=find-variable-file.cpp --find=variable %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/member-pointers.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/member-pointers.cpp index a12892305798a..00805770af11e 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/member-pointers.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/member-pointers.cpp @@ -1,7 +1,7 @@ // REQUIRES: lld // Itanium ABI: -// RUN: %clang --target=x86_64-pc-linux -gdwarf -c -o %t_linux.o %s +// RUN: %clangxx --target=x86_64-pc-linux -gdwarf -c -o %t_linux.o %s // RUN: %lldb -f %t_linux.o -b -o "target variable s1 s2 m1 m2 v1 v2 v3 v4" | FileCheck --check-prefix=CHECK-GNU %s // // CHECK-GNU: (void (Single1::*)()) s1 = 0x00000000000000000000000000000000 diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/module-ownership.mm b/lldb/test/Shell/SymbolFile/DWARF/x86/module-ownership.mm index 2dec109a781ca..27aa1365ab54c 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/module-ownership.mm +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/module-ownership.mm @@ -1,5 +1,5 @@ // RUN: rm -rf %t.cache -// RUN: %clang --target=x86_64-apple-macosx -g -gmodules -Wno-objc-root-class \ +// RUN: %clangxx 
--target=x86_64-apple-macosx -g -gmodules -Wno-objc-root-class \ // RUN: -fmodules -fmodules-cache-path=%t.cache \ // RUN: -c -o %t.o %s -I%S/Inputs // RUN: lldb-test symbols -dump-clang-ast %t.o | FileCheck --check-prefix CHECK-ANON-S1 %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp index 297fb82caee5f..8f530c803a40c 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp @@ -1,4 +1,4 @@ -// RUN: %clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s +// RUN: %clangxx --target=x86_64-apple-macosx -c -gdwarf -o %t %s // RUN: %lldb %t \ // RUN: -o "target var global" \ // RUN: -o "target var global2" \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp index 5a40a6e0fbc27..5ab45eefd2211 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp @@ -4,18 +4,18 @@ // REQUIRES: lld -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-n-a.o -g -gsimple-template-names -DFILE_A -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-n-b.o -g -gsimple-template-names -DFILE_B +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-n-a.o -g -gsimple-template-names -DFILE_A +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-n-b.o -g -gsimple-template-names -DFILE_B // RUN: ld.lld %t-n-a.o %t-n-b.o -o %t-n // RUN: %lldb %t-n -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-t-a.o -g -fdebug-types-section -DFILE_A -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-t-b.o -g -fdebug-types-section -DFILE_B +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-t-a.o -g -fdebug-types-section 
-DFILE_A +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-t-b.o -g -fdebug-types-section -DFILE_B // RUN: ld.lld %t-t-a.o %t-t-b.o -o %t-t // RUN: %lldb %t-t -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-tn-a.o -g -fdebug-types-section -gsimple-template-names -DFILE_A -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-tn-b.o -g -fdebug-types-section -gsimple-template-names -DFILE_B +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-tn-a.o -g -fdebug-types-section -gsimple-template-names -DFILE_A +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-tn-b.o -g -fdebug-types-section -gsimple-template-names -DFILE_B // RUN: ld.lld %t-tn-a.o %t-tn-b.o -o %t-tn // RUN: %lldb %t-tn -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/type-unit-same-basename.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/type-unit-same-basename.cpp index f7f5a30aaba9e..f9fd5b5e52250 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/type-unit-same-basename.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/type-unit-same-basename.cpp @@ -5,8 +5,8 @@ // REQUIRES: lld -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-a.o -g -fdebug-types-section -flimit-debug-info -DFILE_A -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-b.o -g -fdebug-types-section -flimit-debug-info -DFILE_B +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-a.o -g -fdebug-types-section -flimit-debug-info -DFILE_A +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-b.o -g -fdebug-types-section -flimit-debug-info -DFILE_B // RUN: ld.lld -z undefs %t-a.o %t-b.o -o %t // RUN: %lldb %t -o "target variable x" -o exit | FileCheck %s From 911d75ae3aadb723118849c0d01a4aaa90fb44bb Mon Sep 17 00:00:00 2001 From: Rahman Lavaee Date: Thu, 30 Oct 2025 21:05:45 +0000 Subject: [PATCH 262/539] Simplify the basic-block-sections-bb-hash.ll test. 
The original test which uses grep,sed,tr commands fails on darwin: https://github.com/llvm/llvm-project/issues/165781 --- .../X86/basic-block-sections-bb-hash.ll | 20 ++++--------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll b/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll index f46d6ed262b2c..293b48d7dc5dd 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll @@ -1,23 +1,11 @@ -; BB section test with basic block hashes. -; -; RUN: llc %s -O0 -mtriple=x86_64-pc-linux -function-sections -filetype=obj -basic-block-address-map -emit-bb-hash -o %t.o -; RUN: obj2yaml %t.o -o %t.yaml -; -;; Profile for version 1: +;; BB section test with basic block hashes. + +;; basic block sections Profile with bb hashes ; RUN: echo 'v1' > %t ; RUN: echo 'f foo' >> %t ; RUN: echo 'g 0:10,1:9,2:1 1:8,3:8 2:2,3:2 3:11' >> %t ; RUN: echo 'c 0 2 3' >> %t - -; These commands read BB hashes from SHT_LLVM_BB_ADDR_MAP -; and put them into the basic blocks sections profile. 
-; RUN: grep -E '^\s+(- ID:|Hash:)' %t.yaml | \ -; RUN: grep -B1 'Hash:' | \ -; RUN: sed 's/^\s*//; s/^- ID: *//; s/Hash: *0x//' | \ -; RUN: paste -d: - - | \ -; RUN: tr '\n' ' ' | \ -; RUN: sed 's/ $/\n/; s/^/h /' >> %t -; +; RUN: echo 'h 0:64863A11B5CA0000 1:54F1E80D6B270006 2:54F1F4E66B270008 3:C8BC6041A2CB0009' >> %t ; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t | FileCheck %s ; define void @foo(i1 zeroext) nounwind { From 7b9927f58fb826964526b7d3d83aadd5ea249a44 Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Fri, 31 Oct 2025 00:42:31 +0300 Subject: [PATCH 263/539] [CI] Remove unused variable in code-format job (#165454) `comments` were never used plus generated pylint error --- llvm/utils/git/code-format-helper.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py index dff7f78ce64a2..f6b28f480b8a2 100755 --- a/llvm/utils/git/code-format-helper.py +++ b/llvm/utils/git/code-format-helper.py @@ -486,8 +486,6 @@ def hook_main(): if fmt.has_tool(): if not fmt.run(args.changed_files, args): failed_fmts.append(fmt.name) - if fmt.comment: - comments.append(fmt.comment) else: print(f"Couldn't find {fmt.name}, can't check " + fmt.friendly_name.lower()) From a21263757b6928c470d9b540696a45606fe847f4 Mon Sep 17 00:00:00 2001 From: Ebuka Ezike Date: Thu, 30 Oct 2025 21:43:53 +0000 Subject: [PATCH 264/539] [lldb] Add alternative SBThread::GetStopDescription (#165379) the function signature for `GetStopDescription` is `lldb::SBThread::GetStopDescription(char *dst_or_null, size_t len)`. To get a description you need to call the function first time to get the buffer size. a second time to get the description. This is little worse from the python size as the signature is `lldb.SBThread.GetStopDescription(int: len) -> list[str]` the user has to pass the max size as possible with no way of checking if it is enough. 
This patch adds a new api `lldb.SBThread.GetStopDescription(desc: lldb.SBStream()) -> bool` `bool lldb::SBThread::GetStopDescription(lldb::SBStream &description)` which handles this case. Adds new Test case for lua. --- lldb/bindings/lua/lua-typemaps.swig | 20 ++++++++++- lldb/bindings/python/python-typemaps.swig | 18 ++++++++++ lldb/include/lldb/API/SBThread.h | 8 +++++ lldb/source/API/SBThread.cpp | 35 +++++++++++++++---- lldb/test/API/lua_api/TestThreadAPI.lua | 25 +++++++++++++ .../default-constructor/sb_thread.py | 1 + .../API/python_api/thread/TestThreadAPI.py | 5 +++ 7 files changed, 105 insertions(+), 7 deletions(-) create mode 100644 lldb/test/API/lua_api/TestThreadAPI.lua diff --git a/lldb/bindings/lua/lua-typemaps.swig b/lldb/bindings/lua/lua-typemaps.swig index 56756936a532c..f2a7401419368 100644 --- a/lldb/bindings/lua/lua-typemaps.swig +++ b/lldb/bindings/lua/lua-typemaps.swig @@ -121,9 +121,27 @@ LLDB_NUMBER_TYPEMAP(enum SWIGTYPE); $1 = (char *)malloc($2); } +// Disable default type checking for this method to avoid SWIG dispatch issues. +// +// Problem: SBThread::GetStopDescription has two overloads: +// 1. GetStopDescription(char* dst_or_null, size_t dst_len) +// 2. GetStopDescription(lldb::SBStream& stream) +// +// SWIG generates a dispatch function to select the correct overload based on argument types. +// see https://www.swig.org/Doc4.0/SWIGDocumentation.html#Typemaps_overloading. +// However, this dispatcher doesn't consider typemaps that transform function signatures. +// +// In lua, our typemap converts GetStopDescription(char*, size_t) to GetStopDescription(int). +// The dispatcher still checks against the original (char*, size_t) signature instead of +// the transformed (int) signature, causing type matching to fail. +// This only affects SBThread::GetStopDescription since the type check also matches +// the argument name, which is unique to this function. 
+%typemap(typecheck, precedence=SWIG_TYPECHECK_POINTER) (char *dst_or_null, size_t dst_len) "" + %typemap(argout) (char *dst_or_null, size_t dst_len) { lua_pop(L, 1); // Blow away the previous result - lua_pushlstring(L, (const char *)$1, $result); + llvm::StringRef ref($1); + lua_pushlstring(L, (const char *)$1, ref.size()); free($1); // SWIG_arg was already incremented } diff --git a/lldb/bindings/python/python-typemaps.swig b/lldb/bindings/python/python-typemaps.swig index 715914fe745f8..4d3a95768f2f3 100644 --- a/lldb/bindings/python/python-typemaps.swig +++ b/lldb/bindings/python/python-typemaps.swig @@ -224,6 +224,24 @@ AND call SWIG_fail at the same time, because it will result in a double free. } $1 = (char *)malloc($2); } + +// Disable default type checking for this method to avoid SWIG dispatch issues. +// +// Problem: SBThread::GetStopDescription has two overloads: +// 1. GetStopDescription(char* dst_or_null, size_t dst_len) +// 2. GetStopDescription(lldb::SBStream& stream) +// +// SWIG generates a dispatch function to select the correct overload based on argument types. +// see https://www.swig.org/Doc4.0/SWIGDocumentation.html#Typemaps_overloading. +// However, this dispatcher doesn't consider typemaps that transform function signatures. +// +// In Python, our typemap converts GetStopDescription(char*, size_t) to GetStopDescription(int). +// The dispatcher still checks against the original (char*, size_t) signature instead of +// the transformed (int) signature, causing type matching to fail. +// This only affects SBThread::GetStopDescription since the type check also matches +// the argument name, which is unique to this function. 
+%typemap(typecheck, precedence=SWIG_TYPECHECK_POINTER) (char *dst_or_null, size_t dst_len) "" + %typemap(argout) (char *dst_or_null, size_t dst_len) { Py_XDECREF($result); /* Blow away any previous result */ llvm::StringRef ref($1); diff --git a/lldb/include/lldb/API/SBThread.h b/lldb/include/lldb/API/SBThread.h index e9fe5858d125e..2411dfd376519 100644 --- a/lldb/include/lldb/API/SBThread.h +++ b/lldb/include/lldb/API/SBThread.h @@ -81,6 +81,14 @@ class LLDB_API SBThread { SBThreadCollection GetStopReasonExtendedBacktraces(InstrumentationRuntimeType type); + /// Gets a human-readable description of why the thread stopped. + /// + /// \param stream Output stream to receive the stop description text + /// \return + /// true if obtained and written to the stream, + // false if there was an error retrieving the description. + bool GetStopDescription(lldb::SBStream &stream) const; + size_t GetStopDescription(char *dst_or_null, size_t dst_len); SBValue GetStopReturnValue(); diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index 4e4aa48bc9a2e..f58a1b52afa91 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -239,11 +239,34 @@ SBThread::GetStopReasonExtendedBacktraces(InstrumentationRuntimeType type) { return threads; } -size_t SBThread::GetStopDescription(char *dst, size_t dst_len) { - LLDB_INSTRUMENT_VA(this, dst, dst_len); +bool SBThread::GetStopDescription(lldb::SBStream &stream) const { + LLDB_INSTRUMENT_VA(this, stream); + + if (!m_opaque_sp) + return false; + + llvm::Expected exe_ctx = + GetStoppedExecutionContext(m_opaque_sp); + if (!exe_ctx) { + LLDB_LOG_ERROR(GetLog(LLDBLog::API), exe_ctx.takeError(), "{0}"); + return false; + } + + if (!exe_ctx->HasThreadScope()) + return false; + + Stream &strm = stream.ref(); + const std::string stop_desc = exe_ctx->GetThreadPtr()->GetStopDescription(); + strm.PutCString(stop_desc); + + return true; +} + +size_t SBThread::GetStopDescription(char *dst_or_null, size_t 
dst_len) { + LLDB_INSTRUMENT_VA(this, dst_or_null, dst_len); - if (dst) - *dst = 0; + if (dst_or_null) + *dst_or_null = 0; llvm::Expected exe_ctx = GetStoppedExecutionContext(m_opaque_sp); @@ -259,8 +282,8 @@ size_t SBThread::GetStopDescription(char *dst, size_t dst_len) { if (thread_stop_desc.empty()) return 0; - if (dst) - return ::snprintf(dst, dst_len, "%s", thread_stop_desc.c_str()) + 1; + if (dst_or_null) + return ::snprintf(dst_or_null, dst_len, "%s", thread_stop_desc.c_str()) + 1; // NULL dst passed in, return the length needed to contain the // description. diff --git a/lldb/test/API/lua_api/TestThreadAPI.lua b/lldb/test/API/lua_api/TestThreadAPI.lua new file mode 100644 index 0000000000000..5a38d0ba9192f --- /dev/null +++ b/lldb/test/API/lua_api/TestThreadAPI.lua @@ -0,0 +1,25 @@ +_T = require('lua_lldb_test').create_test('TestThreadAPI') + +function _T:TestGetStopDescription() + local target = self:create_target() + local breakpoint = target:BreakpointCreateByName("main", "a.out") + assertTrue(breakpoint:IsValid() and breakpoint:GetNumLocations() == 1) + + local process = target:LaunchSimple({ 'arg1', 'arg2' }, nil, nil) + local thread = get_stopped_thread(process, lldb.eStopReasonBreakpoint) + assertNotNil(thread) + assertTrue(thread:IsValid()) + + assertEqual("breakpoint", thread:GetStopDescription(string.len("breakpoint") + 1)) + assertEqual("break", thread:GetStopDescription(string.len("break") + 1)) + assertEqual("b", thread:GetStopDescription(string.len("b") + 1)) + assertEqual("breakpoint 1.1", thread:GetStopDescription(string.len("breakpoint 1.1") + 100)) + + -- Test stream variation + local stream = lldb.SBStream() + assertTrue(thread:GetStopDescription(stream)) + assertNotNil(stream) + assertEqual("breakpoint 1.1", stream:GetData()) +end + +os.exit(_T:run()) diff --git a/lldb/test/API/python_api/default-constructor/sb_thread.py b/lldb/test/API/python_api/default-constructor/sb_thread.py index 34eb3db852c38..4252fa0321fff 100644 --- 
a/lldb/test/API/python_api/default-constructor/sb_thread.py +++ b/lldb/test/API/python_api/default-constructor/sb_thread.py @@ -10,6 +10,7 @@ def fuzz_obj(obj): obj.GetStopReasonDataCount() obj.GetStopReasonDataAtIndex(100) obj.GetStopDescription(256) + obj.GetStopDescription(lldb.SBStream()) obj.GetThreadID() obj.GetIndexID() obj.GetName() diff --git a/lldb/test/API/python_api/thread/TestThreadAPI.py b/lldb/test/API/python_api/thread/TestThreadAPI.py index 5583434a742a9..acad7583eec19 100644 --- a/lldb/test/API/python_api/thread/TestThreadAPI.py +++ b/lldb/test/API/python_api/thread/TestThreadAPI.py @@ -138,6 +138,11 @@ def get_stop_description(self): "breakpoint 1.1", thread.GetStopDescription(len("breakpoint 1.1") + 100) ) + # Test the stream variation + stream = lldb.SBStream() + self.assertTrue(thread.GetStopDescription(stream)) + self.assertEqual("breakpoint 1.1", stream.GetData()) + def step_out_of_malloc_into_function_b(self, exe_name): """Test Python SBThread.StepOut() API to step out of a malloc call where the call site is at function b().""" exe = self.getBuildArtifact(exe_name) From f2cceda4b77ef767489feecff3f98372b4d09373 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 30 Oct 2025 15:46:19 -0700 Subject: [PATCH 265/539] [VPlan] Extend getSCEVForVPV, use to compute VPReplicateRecipe cost. (#161276) Update getSCEVExprForVPValue to handle more complex expressions, to use it in VPReplicateRecipe::comptueCost. In particular, it supports construction SCEV expressions for GetElementPtr VPReplicateRecipes, with operands that are VPScalarIVStepsRecipe, VPDerivedIVRecipe and VPCanonicalIVRecipe. If we hit a sub-expression we don't support yet, we return SCEVCouldNotCompute. Note that the SCEV expression is valid VF = 1: we only support construction AddRecs for VPCanonicalIVRecipe, which is an AddRec starting at 0 and stepping by 1. 
The returned SCEV expressions could be converted to a VF specific one, by rewriting the AddRecs to ones with the appropriate step. Note that the logic for constructing SCEVs for GetElementPtr was directly ported from ScalarEvolution.cpp. Another thing to note is that we construct SCEV expression purely by looking at the operation of the recipe and its translated operands, w/o accessing the underlying IR (the exception being getting the source element type for GEPs). PR: https://github.com/llvm/llvm-project/pull/161276 --- .../Transforms/Vectorize/LoopVectorize.cpp | 14 +++--- llvm/lib/Transforms/Vectorize/VPlanHelpers.h | 5 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 23 +++++---- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 49 ++++++++++++++++++- llvm/lib/Transforms/Vectorize/VPlanUtils.h | 3 +- 5 files changed, 74 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 505fb435e91e6..25bf49db0e073 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3908,7 +3908,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( continue; VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { @@ -4166,7 +4166,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Add on other costs that are modelled in VPlan, but not in the legacy // cost model. 
VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); VPRegionBlock *VectorRegion = P->getVectorLoopRegion(); assert(VectorRegion && "Expected to have a vector region!"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( @@ -6876,7 +6876,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF) const { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE()); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE(), + OrigLoop); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. @@ -7110,12 +7111,13 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // case, don't trigger the assertion, as the extra simplifications may cause a // different VF to be picked by the VPlan-based cost model. VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); // Verify that the VPlan-based and legacy cost models agree, except for VPlans // with early exits and plans with additional VPlan simplifications. The // legacy cost model doesn't properly model costs for such loops. assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() || + !Legal->getLAI()->getSymbolicStrides().empty() || planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) || @@ -8441,7 +8443,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // and mulacc-reduction are implemented. 
if (!CM.foldTailWithEVL()) { VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -9911,7 +9913,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, - CM.CostKind, *CM.PSE.getSE()); + CM.CostKind, *CM.PSE.getSE(), L); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index 2aaabd9ebdd04..965426f86ff21 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -350,13 +350,14 @@ struct VPCostContext { SmallPtrSet SkipCostComputation; TargetTransformInfo::TargetCostKind CostKind; ScalarEvolution &SE; + const Loop *L; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const VPlan &Plan, LoopVectorizationCostModel &CM, TargetTransformInfo::TargetCostKind CostKind, - ScalarEvolution &SE) + ScalarEvolution &SE, const Loop *L) : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), - CostKind(CostKind), SE(SE) {} + CostKind(CostKind), SE(SE), L(L) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 9a63c802047ea..bde62dd6dd4bc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3167,26 +3167,30 @@ bool VPReplicateRecipe::shouldPack() const { }); } -/// Returns true if \p Ptr is a pointer computation for which the legacy cost -/// model computes a SCEV expression when computing the address cost. -static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) { +/// Returns a SCEV expression for \p Ptr if it is a pointer computation for +/// which the legacy cost model computes a SCEV expression when computing the +/// address cost. Computing SCEVs for VPValues is incomplete and returns +/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In +/// those cases we fall back to the legacy cost model. Otherwise return nullptr. +static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE, + const Loop *L) { auto *PtrR = Ptr->getDefiningRecipe(); if (!PtrR || !((isa(PtrR) && cast(PtrR)->getOpcode() == Instruction::GetElementPtr) || isa(PtrR) || match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue())))) - return false; + return nullptr; // We are looking for a GEP where all indices are either loop invariant or // inductions. for (VPValue *Opd : drop_begin(PtrR->operands())) { if (!Opd->isDefinedOutsideLoopRegions() && !isa(Opd)) - return false; + return nullptr; } - return true; + return vputils::getSCEVExprForVPValue(Ptr, SE, L); } /// Returns true if \p V is used as part of the address of another load or @@ -3354,9 +3358,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, bool IsLoad = UI->getOpcode() == Instruction::Load; const VPValue *PtrOp = getOperand(!IsLoad); - // TODO: Handle cases where we need to pass a SCEV to - // getAddressComputationCost. 
- if (shouldUseAddressAccessSCEV(PtrOp)) + const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L); + if (isa_and_nonnull(PtrSCEV)) break; Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); @@ -3374,7 +3377,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, InstructionCost ScalarCost = ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE, - nullptr, Ctx.CostKind); + PtrSCEV, Ctx.CostKind); if (isSingleScalar()) return ScalarCost; diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 4db92e7def3ed..54348c6e34488 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -75,7 +75,8 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { B == Plan.getBackedgeTakenCount(); } -const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { +const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V, + ScalarEvolution &SE, const Loop *L) { if (V->isLiveIn()) { if (Value *LiveIn = V->getLiveInIRValue()) return SE.getSCEV(LiveIn); @@ -86,6 +87,52 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { return TypeSwitch(V->getDefiningRecipe()) .Case( [](const VPExpandSCEVRecipe *R) { return R->getSCEV(); }) + .Case([&SE, L](const VPCanonicalIVPHIRecipe *R) { + if (!L) + return SE.getCouldNotCompute(); + const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L); + return SE.getAddRecExpr(Start, SE.getOne(Start->getType()), L, + SCEV::FlagAnyWrap); + }) + .Case([&SE, L](const VPDerivedIVRecipe *R) { + const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L); + const SCEV *IV = getSCEVExprForVPValue(R->getOperand(1), SE, L); + const SCEV *Scale = getSCEVExprForVPValue(R->getOperand(2), SE, L); + if (any_of(ArrayRef({Start, IV, Scale}), IsaPred)) + return SE.getCouldNotCompute(); + + return 
SE.getAddExpr(SE.getTruncateOrSignExtend(Start, IV->getType()), + SE.getMulExpr(IV, SE.getTruncateOrSignExtend( + Scale, IV->getType()))); + }) + .Case([&SE, L](const VPScalarIVStepsRecipe *R) { + const SCEV *IV = getSCEVExprForVPValue(R->getOperand(0), SE, L); + const SCEV *Step = getSCEVExprForVPValue(R->getOperand(1), SE, L); + if (isa(IV) || isa(Step)) + return SE.getCouldNotCompute(); + return SE.getMulExpr(SE.getTruncateOrSignExtend(IV, Step->getType()), + Step); + }) + .Case([&SE, L](const VPReplicateRecipe *R) { + if (R->getOpcode() != Instruction::GetElementPtr) + return SE.getCouldNotCompute(); + + const SCEV *Base = getSCEVExprForVPValue(R->getOperand(0), SE, L); + if (isa(Base)) + return SE.getCouldNotCompute(); + + SmallVector IndexExprs; + for (VPValue *Index : drop_begin(R->operands())) { + const SCEV *IndexExpr = getSCEVExprForVPValue(Index, SE, L); + if (isa(IndexExpr)) + return SE.getCouldNotCompute(); + IndexExprs.push_back(IndexExpr); + } + + Type *SrcElementTy = cast(R->getUnderlyingInstr()) + ->getSourceElementType(); + return SE.getGEPExpr(Base, IndexExprs, SrcElementTy); + }) .Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); }); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 37cd413da9079..c21a0e70c1392 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -37,7 +37,8 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr); /// Return the SCEV expression for \p V. Returns SCEVCouldNotCompute if no /// SCEV expression could be constructed. -const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE); +const SCEV *getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, + const Loop *L = nullptr); /// Returns true if \p VPV is a single scalar, either because it produces the /// same value for all lanes or only has its first lane used. 
From 52fa592c436925ef4db3a98bdb371cd17549e5de Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Thu, 30 Oct 2025 15:50:45 -0700 Subject: [PATCH 266/539] Add tests for CWG issues 6, 212, 232, 2823. (#165633) Unfortunately this adds two more "no"s to cxx_dr_status for 232 and 2823. --------- Co-authored-by: Vlad Serebrennikov --- clang/test/CXX/drs/cwg0xx.cpp | 2 ++ clang/test/CXX/drs/cwg28xx.cpp | 18 ++++++++++++ clang/test/CXX/drs/cwg2xx.cpp | 35 +++++++++++++++++++++++ clang/test/CXX/drs/cwg6.cpp | 51 ++++++++++++++++++++++++++++++++++ clang/www/cxx_dr_status.html | 8 +++--- 5 files changed, 110 insertions(+), 4 deletions(-) create mode 100644 clang/test/CXX/drs/cwg6.cpp diff --git a/clang/test/CXX/drs/cwg0xx.cpp b/clang/test/CXX/drs/cwg0xx.cpp index 805be67f2dc1a..10a4f1d6add3a 100644 --- a/clang/test/CXX/drs/cwg0xx.cpp +++ b/clang/test/CXX/drs/cwg0xx.cpp @@ -90,6 +90,8 @@ namespace cwg5 { // cwg5: 3.1 const C c = e; } // namespace cwg5 +// cwg6 is in cwg6.cpp + namespace cwg7 { // cwg7: 3.4 class A { public: ~A(); }; class B : virtual private A {}; // #cwg7-B diff --git a/clang/test/CXX/drs/cwg28xx.cpp b/clang/test/CXX/drs/cwg28xx.cpp index a6b2b99e0c3f1..d0ee191ef23d8 100644 --- a/clang/test/CXX/drs/cwg28xx.cpp +++ b/clang/test/CXX/drs/cwg28xx.cpp @@ -61,6 +61,24 @@ namespace cwg2819 { // cwg2819: 19 c++26 #endif } // namespace cwg2819 +namespace cwg2823 { // cwg2823: no +#if __cplusplus >= 201103L + constexpr int *p = 0; + constexpr int *q1 = &*p; + // expected-error@-1 {{constexpr variable 'q1' must be initialized by a constant expression}} + // expected-note@-2 {{dereferencing a null pointer is not allowed in a constant expression}} + // FIXME: invalid: dereferencing a null pointer. + constexpr int *q2 = &p[0]; + + int arr[32]; + constexpr int *r = arr; + // FIXME: invalid: dereferencing a past-the-end pointer. + constexpr int *s1 = &*(r + 32); + // FIXME: invalid: dereferencing a past-the-end pointer. 
+ constexpr int *s2 = &r[32]; +#endif +} + namespace cwg2847 { // cwg2847: 19 review 2024-03-01 #if __cplusplus >= 202002L diff --git a/clang/test/CXX/drs/cwg2xx.cpp b/clang/test/CXX/drs/cwg2xx.cpp index 37186e3c3f205..a4995ddc2c588 100644 --- a/clang/test/CXX/drs/cwg2xx.cpp +++ b/clang/test/CXX/drs/cwg2xx.cpp @@ -230,6 +230,38 @@ namespace cwg211 { // cwg211: 2.7 }; } // namespace cwg211 +namespace cwg212 { // cwg212: 2.7 + template struct Base; + template struct Derived; + + int *overload(void*); + float *overload(Base*); + double *overload(Base*); + + void f(Derived *p) { + // OK, calls void* overload. + int *a = overload(p); + + Base *q = p; + // expected-error@-1 {{cannot initialize a variable of type 'Base *' with an lvalue of type 'Derived *'}} + } + + template struct Base {}; + template struct Derived : Base {}; + + void g(Derived *p) { + // OK, instantiates and calls Base* overlod. + double *b = overload(p); + (void)b; + } + + void h(Derived *p) { + // OK, instantiates and converts. + Base *q = p; + (void)q; + } +} + namespace cwg213 { // cwg213: 2.7 template struct A : T { void h(T t) { @@ -593,6 +625,9 @@ namespace cwg231 { // cwg231: 2.7 } } // namespace cwg231 +// 232 is NAD; the desired behavior is described in 2823. 
+// cwg232: dup 2823 + // cwg234: na // cwg235: na diff --git a/clang/test/CXX/drs/cwg6.cpp b/clang/test/CXX/drs/cwg6.cpp new file mode 100644 index 0000000000000..4752e72034c78 --- /dev/null +++ b/clang/test/CXX/drs/cwg6.cpp @@ -0,0 +1,51 @@ +// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK +// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK +// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK +// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK +// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK +// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK +// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK + +#if __cplusplus == 199711L +#define static_assert(expr) __extension__ _Static_assert(expr) +#define noexcept throw() +#endif + +namespace cwg6 { // cwg6: 2.7 +#if __cplusplus >= 201103L +struct Counter { + int copies; + constexpr Counter(int copies) : copies(copies) {} + constexpr Counter(const Counter& other) : copies(other.copies + 1) {} +}; + +// Passing an lvalue by value makes a non-elidable copy. 
+constexpr int PassByValue(Counter c) { return c.copies; } +constexpr int PassByValue2(Counter c) { return PassByValue(c); } +constexpr int PassByValue3(Counter c) { return PassByValue2(c); } +static_assert(PassByValue(Counter(0)) == 0, "expect no copies"); +static_assert(PassByValue2(Counter(0)) == 1, "expect 1 copy"); +static_assert(PassByValue3(Counter(0)) == 2, "expect 2 copies"); +#endif + +struct A { + A() noexcept; + A(const A&) noexcept; + ~A() noexcept; +}; + +inline void f(A a) noexcept {} + +// CHECK-LABEL: define {{.*}} @_ZN4cwg64callEv +void call() { + A a; + // We copy the parameter here, even though object is not mutated by f and + // otherwise satisfies the criteria for the proposed CWG6 optimization. + // CHECK: call {{.*}} @_ZN4cwg61AC1ERKS0_( + // CHECK: call {{.*}} @_ZN4cwg61fENS_1AE( + f(a); + // CHECK: call {{.*}} @_ZN4cwg61AD1Ev( + // CHECK: call {{.*}} @_ZN4cwg61AD1Ev( +} + +} // namespace cwg6 diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index ae9b28ee625cd..0312c9dfc0665 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -81,7 +81,7 @@

C++ defect report implementation status

6 NAD Should the optimization that allows a class object to alias another object also allow the case of a parameter in an inline function to alias its argument? - Unknown + Yes 7 @@ -1318,7 +1318,7 @@

C++ defect report implementation status

212 CD4 Implicit instantiation is not described clearly enough - Unknown + Yes 213 @@ -1438,7 +1438,7 @@

C++ defect report implementation status

232 NAD Is indirection through a null pointer undefined behavior? - Unknown + Duplicate of 2823 233 @@ -16790,7 +16790,7 @@

C++ defect report implementation status

2823 CD7 Implicit undefined behavior when dereferencing pointers - Unknown + No 2824 From df49c272930cf47566ad65fae449b83b71b0ac91 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Thu, 30 Oct 2025 15:52:50 -0700 Subject: [PATCH 267/539] [libc] Remove optimization flags on entrypoints (#165782) Optimization flags are now handled through a common flag. These are no longer necessary. Fixes #112409 --- libc/src/fenv/CMakeLists.txt | 32 ------------- libc/src/math/amdgpu/CMakeLists.txt | 70 ---------------------------- libc/src/math/generic/CMakeLists.txt | 28 +---------- libc/src/math/nvptx/CMakeLists.txt | 58 ----------------------- 4 files changed, 1 insertion(+), 187 deletions(-) diff --git a/libc/src/fenv/CMakeLists.txt b/libc/src/fenv/CMakeLists.txt index c5431b1b9d55e..f368845977964 100644 --- a/libc/src/fenv/CMakeLists.txt +++ b/libc/src/fenv/CMakeLists.txt @@ -6,8 +6,6 @@ add_entrypoint_object( fegetround.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -18,8 +16,6 @@ add_entrypoint_object( fesetround.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -30,8 +26,6 @@ add_entrypoint_object( feclearexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -42,8 +36,6 @@ add_entrypoint_object( feraiseexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -54,8 +46,6 @@ add_entrypoint_object( fetestexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -67,8 +57,6 @@ add_entrypoint_object( DEPENDS libc.hdr.types.fexcept_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -80,8 +68,6 @@ add_entrypoint_object( DEPENDS libc.hdr.types.fenv_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -93,8 +79,6 @@ add_entrypoint_object( DEPENDS 
libc.hdr.types.fenv_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -107,8 +91,6 @@ add_entrypoint_object( libc.hdr.fenv_macros libc.hdr.types.fexcept_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -119,8 +101,6 @@ add_entrypoint_object( fesetexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -133,8 +113,6 @@ add_entrypoint_object( libc.hdr.fenv_macros libc.hdr.types.fexcept_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -147,8 +125,6 @@ add_entrypoint_object( libc.hdr.fenv_macros libc.hdr.types.fenv_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -161,8 +137,6 @@ add_entrypoint_object( libc.hdr.fenv_macros libc.hdr.types.fenv_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -173,8 +147,6 @@ add_entrypoint_object( feenableexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -185,8 +157,6 @@ add_entrypoint_object( fedisableexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -197,6 +167,4 @@ add_entrypoint_object( fegetexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) diff --git a/libc/src/math/amdgpu/CMakeLists.txt b/libc/src/math/amdgpu/CMakeLists.txt index e2cd3b99c3037..d05d519b74b4f 100644 --- a/libc/src/math/amdgpu/CMakeLists.txt +++ b/libc/src/math/amdgpu/CMakeLists.txt @@ -4,8 +4,6 @@ add_entrypoint_object( ceil.cpp HDRS ../ceil.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -14,8 +12,6 @@ add_entrypoint_object( ceilf.cpp HDRS ../ceilf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -24,8 +20,6 @@ add_entrypoint_object( copysign.cpp HDRS ../copysign.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -34,8 +28,6 @@ add_entrypoint_object( copysignf.cpp 
HDRS ../copysignf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -44,8 +36,6 @@ add_entrypoint_object( fabs.cpp HDRS ../fabs.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -54,8 +44,6 @@ add_entrypoint_object( fabsf.cpp HDRS ../fabsf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -64,8 +52,6 @@ add_entrypoint_object( floor.cpp HDRS ../floor.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -74,8 +60,6 @@ add_entrypoint_object( floorf.cpp HDRS ../floorf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -84,8 +68,6 @@ add_entrypoint_object( fma.cpp HDRS ../fma.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -94,8 +76,6 @@ add_entrypoint_object( fmaf.cpp HDRS ../fmaf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -104,8 +84,6 @@ add_entrypoint_object( fmax.cpp HDRS ../fmax.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -114,8 +92,6 @@ add_entrypoint_object( fmaxf.cpp HDRS ../fmaxf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -124,8 +100,6 @@ add_entrypoint_object( fmin.cpp HDRS ../fmin.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -134,8 +108,6 @@ add_entrypoint_object( fminf.cpp HDRS ../fminf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -144,8 +116,6 @@ add_entrypoint_object( fmod.cpp HDRS ../fmod.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -154,8 +124,6 @@ add_entrypoint_object( fmodf.cpp HDRS ../fmodf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -164,8 +132,6 @@ add_entrypoint_object( nearbyint.cpp HDRS ../nearbyint.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -174,8 +140,6 @@ add_entrypoint_object( nearbyintf.cpp HDRS ../nearbyintf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -184,8 +148,6 @@ add_entrypoint_object( remainder.cpp HDRS ../remainder.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -194,8 +156,6 @@ add_entrypoint_object( remainderf.cpp HDRS ../remainderf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -204,8 +164,6 @@ 
add_entrypoint_object( rint.cpp HDRS ../rint.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -214,8 +172,6 @@ add_entrypoint_object( rintf.cpp HDRS ../rintf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -224,8 +180,6 @@ add_entrypoint_object( round.cpp HDRS ../round.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -234,8 +188,6 @@ add_entrypoint_object( sqrt.cpp HDRS ../sqrt.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -244,8 +196,6 @@ add_entrypoint_object( sqrtf.cpp HDRS ../sqrtf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -254,8 +204,6 @@ add_entrypoint_object( trunc.cpp HDRS ../trunc.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -264,8 +212,6 @@ add_entrypoint_object( truncf.cpp HDRS ../truncf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -274,8 +220,6 @@ add_entrypoint_object( frexp.cpp HDRS ../frexp.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -284,8 +228,6 @@ add_entrypoint_object( frexpf.cpp HDRS ../frexpf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -294,8 +236,6 @@ add_entrypoint_object( scalbn.cpp HDRS ../scalbn.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -304,8 +244,6 @@ add_entrypoint_object( scalbnf.cpp HDRS ../scalbnf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -314,8 +252,6 @@ add_entrypoint_object( ldexp.cpp HDRS ../ldexp.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -324,8 +260,6 @@ add_entrypoint_object( ldexpf.cpp HDRS ../ldexpf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -336,7 +270,6 @@ add_entrypoint_object( ../tgamma.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -347,7 +280,6 @@ add_entrypoint_object( ../tgammaf.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -358,7 +290,6 @@ add_entrypoint_object( ../lgamma.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -369,5 +300,4 @@ add_entrypoint_object( ../lgamma_r.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) 
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 6068c36e558ef..c048a64db6bc2 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -2662,8 +2662,6 @@ add_entrypoint_object( ../fmaximum_mag.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2674,8 +2672,6 @@ add_entrypoint_object( ../fmaximum_magf.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2686,8 +2682,6 @@ add_entrypoint_object( ../fmaximum_magl.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2735,8 +2729,6 @@ add_entrypoint_object( ../fmaximum_mag_num.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2747,8 +2739,6 @@ add_entrypoint_object( ../fmaximum_mag_numf.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2759,8 +2749,6 @@ add_entrypoint_object( ../fmaximum_mag_numl.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2954,8 +2942,6 @@ add_entrypoint_object( ../fminimum_mag.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2966,8 +2952,6 @@ add_entrypoint_object( ../fminimum_magf.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2978,8 +2962,6 @@ add_entrypoint_object( ../fminimum_magl.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -3027,8 +3009,6 @@ add_entrypoint_object( ../fminimum_mag_num.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -3039,8 +3019,6 @@ add_entrypoint_object( ../fminimum_mag_numf.h DEPENDS libc.src.__support.FPUtil.basic_operations - 
COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -3051,8 +3029,6 @@ add_entrypoint_object( ../fminimum_mag_numl.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -4306,7 +4282,7 @@ add_entrypoint_object( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.__support.FPUtil.except_value_utils - libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.rounding_mode libc.src.__support.macros.optimization @@ -4546,8 +4522,6 @@ add_entrypoint_object( atan.cpp HDRS ../atan.h - COMPILE_OPTIONS - -O3 DEPENDS libc.src.__support.math.atan ) diff --git a/libc/src/math/nvptx/CMakeLists.txt b/libc/src/math/nvptx/CMakeLists.txt index fcb2870b4bb1c..e27c316ff20ca 100644 --- a/libc/src/math/nvptx/CMakeLists.txt +++ b/libc/src/math/nvptx/CMakeLists.txt @@ -4,8 +4,6 @@ add_entrypoint_object( ceil.cpp HDRS ../ceil.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -14,8 +12,6 @@ add_entrypoint_object( ceilf.cpp HDRS ../ceilf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -24,8 +20,6 @@ add_entrypoint_object( copysign.cpp HDRS ../copysign.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -34,8 +28,6 @@ add_entrypoint_object( copysignf.cpp HDRS ../copysignf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -44,8 +36,6 @@ add_entrypoint_object( fabs.cpp HDRS ../fabs.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -54,8 +44,6 @@ add_entrypoint_object( fabsf.cpp HDRS ../fabsf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -64,8 +52,6 @@ add_entrypoint_object( floor.cpp HDRS ../floor.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -74,8 +60,6 @@ add_entrypoint_object( floorf.cpp HDRS ../floorf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -84,8 +68,6 @@ add_entrypoint_object( fma.cpp HDRS ../fma.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -94,8 +76,6 @@ add_entrypoint_object( fmaf.cpp HDRS ../fmaf.h - 
COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -104,8 +84,6 @@ add_entrypoint_object( fmax.cpp HDRS ../fmax.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -114,8 +92,6 @@ add_entrypoint_object( fmaxf.cpp HDRS ../fmaxf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -124,8 +100,6 @@ add_entrypoint_object( fmin.cpp HDRS ../fmin.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -134,8 +108,6 @@ add_entrypoint_object( fminf.cpp HDRS ../fminf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -144,8 +116,6 @@ add_entrypoint_object( fmod.cpp HDRS ../fmod.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -154,8 +124,6 @@ add_entrypoint_object( fmodf.cpp HDRS ../fmodf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -164,8 +132,6 @@ add_entrypoint_object( nearbyint.cpp HDRS ../nearbyint.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -174,8 +140,6 @@ add_entrypoint_object( nearbyintf.cpp HDRS ../nearbyintf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -184,8 +148,6 @@ add_entrypoint_object( remainder.cpp HDRS ../remainder.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -194,8 +156,6 @@ add_entrypoint_object( remainderf.cpp HDRS ../remainderf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -204,8 +164,6 @@ add_entrypoint_object( rint.cpp HDRS ../rint.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -214,8 +172,6 @@ add_entrypoint_object( rintf.cpp HDRS ../rintf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -224,8 +180,6 @@ add_entrypoint_object( round.cpp HDRS ../round.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -234,8 +188,6 @@ add_entrypoint_object( sqrt.cpp HDRS ../sqrt.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -244,8 +196,6 @@ add_entrypoint_object( sqrtf.cpp HDRS ../sqrtf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -254,8 +204,6 @@ add_entrypoint_object( trunc.cpp HDRS ../trunc.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -264,8 +212,6 @@ 
add_entrypoint_object( truncf.cpp HDRS ../truncf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -276,7 +222,6 @@ add_entrypoint_object( ../tgamma.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -287,7 +232,6 @@ add_entrypoint_object( ../tgammaf.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -298,7 +242,6 @@ add_entrypoint_object( ../lgamma.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -309,5 +252,4 @@ add_entrypoint_object( ../lgamma_r.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) From 497be6477b7e4f81c861e4a150f05820f092e5e9 Mon Sep 17 00:00:00 2001 From: Razvan Lupusoru Date: Thu, 30 Oct 2025 16:17:00 -0700 Subject: [PATCH 268/539] [acc][flang] Define hasUnknownDimensions in MappableType (#165794) The MappableType interface currently defines a `generateAccBounds` method which examines a variable and generates `acc.bounds` operations that encode its dimensions. The implementation can extract bounds information in various ways: either from the MLIR type itself or by analyzing the IR to find dimension information from defining operations. However, we need to distinguish between cases where dimensional information is not directly available from the type itself. This new `hasUnknownDimensions` API returns true when the MLIR type does not encode dimensional information and there is no associated descriptor or metadata that would make this information extractable from the visible ssa value the represents the variable. The expected use case is calling `generateAccBounds` only when this returns true, as it indicates that bounds must be extracted from the IR (by walking back from current variable to its defining spots or its descriptor). This supports cases such as raw references to arrays with non-constant bounds (e.g., explicit-shape arrays in Fortran where bounds are passed as arguments). This functionality could also be leveraged for CIR VLA support in the future. 
For FIR types: - Box types return false (descriptor encodes dimensions) - Reference types check if the pointee has dynamic size using fir::hasDynamicSize() --- .../Support/FIROpenACCTypeInterfaces.h | 2 ++ .../Support/FIROpenACCTypeInterfaces.cpp | 22 +++++++++++++++++++ flang/test/Fir/OpenACC/openacc-mappable.fir | 5 +++++ .../lib/OpenACC/TestOpenACCInterfaces.cpp | 4 ++++ .../Dialect/OpenACC/OpenACCTypeInterfaces.td | 12 ++++++++++ .../Dialect/OpenACC/OpenACCOpsTest.cpp | 4 +++- 6 files changed, 48 insertions(+), 1 deletion(-) diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h index 4817ed933ba06..3167c554abbdd 100644 --- a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h +++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h @@ -60,6 +60,8 @@ struct OpenACCMappableModel getOffsetInBytes(mlir::Type type, mlir::Value var, mlir::ValueRange accBounds, const mlir::DataLayout &dataLayout) const; + bool hasUnknownDimensions(mlir::Type type) const; + llvm::SmallVector generateAccBounds(mlir::Type type, mlir::Value var, mlir::OpBuilder &builder) const; diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp index ed9e41c743754..ae0f5fb8197fa 100644 --- a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp @@ -193,6 +193,28 @@ OpenACCMappableModel::getOffsetInBytes( mlir::Type type, mlir::Value var, mlir::ValueRange accBounds, const mlir::DataLayout &dataLayout) const; +template +bool OpenACCMappableModel::hasUnknownDimensions(mlir::Type type) const { + assert(fir::isa_ref_type(type) && "expected FIR reference type"); + return fir::hasDynamicSize(fir::unwrapRefType(type)); +} + +template bool OpenACCMappableModel::hasUnknownDimensions( + 
mlir::Type type) const; + +template bool OpenACCMappableModel::hasUnknownDimensions( + mlir::Type type) const; + +template bool OpenACCMappableModel::hasUnknownDimensions( + mlir::Type type) const; + +template <> +bool OpenACCMappableModel::hasUnknownDimensions( + mlir::Type type) const { + // Descriptor-based entities have dimensions encoded. + return false; +} + static llvm::SmallVector generateSeqTyAccBounds(fir::SequenceType seqType, mlir::Value var, mlir::OpBuilder &builder) { diff --git a/flang/test/Fir/OpenACC/openacc-mappable.fir b/flang/test/Fir/OpenACC/openacc-mappable.fir index 05df35a482907..00fe2574da62a 100644 --- a/flang/test/Fir/OpenACC/openacc-mappable.fir +++ b/flang/test/Fir/OpenACC/openacc-mappable.fir @@ -21,11 +21,13 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, // CHECK: Mappable: !fir.box> // CHECK: Type category: array // CHECK: Size: 40 + // CHECK: Has unknown dimensions: false // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref>) -> !fir.ref> {name = "arr", structured = false} // CHECK: Pointer-like and Mappable: !fir.ref> // CHECK: Type category: array // CHECK: Size: 40 + // CHECK: Has unknown dimensions: false // This second test exercises argument of explicit-shape arrays in following forms: // `real :: arr1(nn), arr2(2:nn), arr3(10)` @@ -62,6 +64,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref>) -> !fir.ref> {name = "arr1", structured = false} // CHECK: Pointer-like and Mappable: !fir.ref> // CHECK: Type category: array + // CHECK: Has unknown dimensions: true // CHECK: Shape: %{{.*}} = fir.shape %[[EXTENT1:.*]] : (index) -> !fir.shape<1> // CHECK: Bound[0]: %{{.*}} = acc.bounds lowerbound(%[[LB1:.*]] : index) upperbound(%[[UB1:.*]] : index) extent(%{{.*}} : index) stride(%c1{{.*}} : index) startIdx(%c1{{.*}} : index) // CHECK: Lower bound: %[[LB1]] = arith.constant 0 : index @@ -70,6 +73,7 @@ module 
attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref>) -> !fir.ref> {name = "arr2", structured = false} // CHECK: Pointer-like and Mappable: !fir.ref> // CHECK: Type category: array + // CHECK: Has unknown dimensions: true // CHECK: Shape: %{{.*}} = fir.shape_shift %c2{{.*}}, %[[EXTENT2:.*]] : (index, index) -> !fir.shapeshift<1> // CHECK: Bound[0]: %{{.*}} = acc.bounds lowerbound(%[[LB2:.*]] : index) upperbound(%[[UB2:.*]] : index) extent(%{{.*}} : index) stride(%c1{{.*}} : index) startIdx(%c2{{.*}} : index) // CHECK: Lower bound: %[[LB2]] = arith.constant 0 : index @@ -80,6 +84,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, // CHECK: Type category: array // CHECK: Size: 40 // CHECK: Offset: 0 + // CHECK: Has unknown dimensions: false // CHECK: Shape: %{{.*}} = fir.shape %[[EXTENT3:.*]] : (index) -> !fir.shape<1> // CHECK: Bound[0]: %{{.*}} = acc.bounds lowerbound(%[[LB3:.*]] : index) upperbound(%[[UB3:.*]] : index) extent(%c10{{.*}} : index) stride(%c1{{.*}} : index) startIdx(%c1{{.*}} : index) // CHECK: Lower bound: %[[LB3]] = arith.constant 0 : index diff --git a/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp b/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp index 9a80e3b1a9aee..072aee5ba269f 100644 --- a/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp +++ b/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp @@ -100,6 +100,10 @@ struct TestFIROpenACCInterfaces } } + llvm::errs() << "\t\tHas unknown dimensions: " + << (mappableTy.hasUnknownDimensions() ? 
"true" : "false") + << "\n"; + if (auto declareOp = dyn_cast_if_present(var.getDefiningOp())) { llvm::errs() << "\t\tShape: " << declareOp.getShape() << "\n"; diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td index 93e9e3d0689f7..d1bbc7f206ce6 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td @@ -259,6 +259,18 @@ def OpenACC_MappableTypeInterface : TypeInterface<"MappableType"> { return {}; }] >, + InterfaceMethod< + /*description=*/[{ + Returns true if the dimensions of this type are not known. This occurs + when the MLIR type does not encode dimensional information and there is + no associated descriptor or metadata in the current entity that would + make this information extractable. For example, an opaque pointer type + pointing to an array without dimension information would have unknown + dimensions. + }], + /*retTy=*/"bool", + /*methodName=*/"hasUnknownDimensions" + >, InterfaceMethod< /*description=*/[{ Returns explicit `acc.bounds` operations that envelop the whole diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp index 6ac9a873e6154..d6203b97e00d7 100644 --- a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp +++ b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp @@ -766,7 +766,9 @@ void testShortDataEntryOpBuildersMappableVar(OpBuilder &b, MLIRContext &context, struct IntegerOpenACCMappableModel : public mlir::acc::MappableType::ExternalModel {}; + IntegerType> { + bool hasUnknownDimensions(mlir::Type type) const { return false; } +}; TEST_F(OpenACCOpsTest, mappableTypeBuilderDataEntry) { // First, set up the test by attaching MappableInterface to IntegerType. 
From c1550dc8022f150e5110db51c2c926a83b59699f Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Thu, 30 Oct 2025 16:20:54 -0700 Subject: [PATCH 269/539] workflows/release-binaries: Drop x86_64 Mac OS builds (#165645) We don't have the resources to test this and the builds are very expensive. If someone is interested in providing x86_64 macOS they can submit this as third-party binaries or provide resources to test the builds. --- .github/workflows/release-binaries-all.yml | 1 - .github/workflows/release-binaries.yml | 11 ----------- 2 files changed, 12 deletions(-) diff --git a/.github/workflows/release-binaries-all.yml b/.github/workflows/release-binaries-all.yml index 0b52a08202f1a..eef49b5e3625d 100644 --- a/.github/workflows/release-binaries-all.yml +++ b/.github/workflows/release-binaries-all.yml @@ -90,7 +90,6 @@ jobs: runs-on: - ubuntu-22.04 - ubuntu-22.04-arm - - macos-13 - macos-14 uses: ./.github/workflows/release-binaries.yml diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index 8145926265256..fa73b9d9fe8d0 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -21,7 +21,6 @@ on: options: - ubuntu-22.04 - ubuntu-22.04-arm - - macos-13 - macos-14 workflow_call: @@ -130,8 +129,6 @@ jobs: target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_BOOTSTRAP_COMPILER_RT_ENABLE_IOS=OFF" if [ "$RUNNER_ARCH" = "ARM64" ]; then arches=arm64 - else - arches=x86_64 fi target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_BOOTSTRAP_DARWIN_osx_ARCHS=$arches -DBOOTSTRAP_BOOTSTRAP_DARWIN_osx_BUILTIN_ARCHS=$arches" fi @@ -147,14 +144,6 @@ jobs: build_runs_on="depot-${{ inputs.runs-on }}-16" test_runs_on=$build_runs_on ;; - macos-13) - if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then - build_runs_on="${{ inputs.runs-on }}" - else - build_runs_on="macos-13-large" - fi - test_runs_on="${{ inputs.runs-on }}" - ;; macos-14) if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then build_runs_on="${{ 
inputs.runs-on }}" From a60d1989a4125ce6e9a60898686b47ef666b729a Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Thu, 30 Oct 2025 16:37:00 -0700 Subject: [PATCH 270/539] Add to llvm-libc-types headers that need it. (#165798) We need `` to support having "bool" members inside pthread structs that may get included through `` from C code prior to C23. --- libc/include/llvm-libc-types/__barrier_type.h | 2 ++ libc/include/llvm-libc-types/pthread_barrierattr_t.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/libc/include/llvm-libc-types/__barrier_type.h b/libc/include/llvm-libc-types/__barrier_type.h index 59712619e917d..5752f832f04b9 100644 --- a/libc/include/llvm-libc-types/__barrier_type.h +++ b/libc/include/llvm-libc-types/__barrier_type.h @@ -9,6 +9,8 @@ #ifndef LLVM_LIBC_TYPES__BARRIER_TYPE_H #define LLVM_LIBC_TYPES__BARRIER_TYPE_H +#include + typedef struct __attribute__((aligned(8 /* alignof (Barrier) */))) { unsigned expected; unsigned waiting; diff --git a/libc/include/llvm-libc-types/pthread_barrierattr_t.h b/libc/include/llvm-libc-types/pthread_barrierattr_t.h index 064be5bfb6721..b62fdc0f72e12 100644 --- a/libc/include/llvm-libc-types/pthread_barrierattr_t.h +++ b/libc/include/llvm-libc-types/pthread_barrierattr_t.h @@ -9,6 +9,8 @@ #ifndef LLVM_LIBC_TYPES_PTHREAD_BARRIERATTR_T_H #define LLVM_LIBC_TYPES_PTHREAD_BARRIERATTR_T_H +#include + typedef struct { bool pshared; } pthread_barrierattr_t; From e5ee376b861f9bbc945cfcb1025d8e5c33bb83d5 Mon Sep 17 00:00:00 2001 From: agozillon Date: Fri, 31 Oct 2025 00:54:31 +0100 Subject: [PATCH 271/539] [MLIR][OpenMP] Fix and simplify bounds offset calculation for 1-D GEP offsets (#165486) Currently this is being calculated incorrectly and will result in incorrect index offsets in more complicated array slices. This PR tries to address it by refactoring and changing the calculation to be more correct. 
--- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 55 ++++++----------- ...rget-record-type-with-ptr-member-host.mlir | 3 +- .../fortran/descriptor-array-slice-map.f90 | 61 +++++++++++++++++++ 3 files changed, 81 insertions(+), 38 deletions(-) create mode 100644 offload/test/offloading/fortran/descriptor-array-slice-map.f90 diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index f28454075f1d3..8edec990eaaba 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -4084,12 +4084,13 @@ static omp::MapInfoOp getFirstOrLastMappedMemberPtr(omp::MapInfoOp mapInfo, /// /// Fortran /// map(tofrom: array(2:5, 3:2)) -/// or -/// C++ -/// map(tofrom: array[1:4][2:3]) +/// /// We must calculate the initial pointer offset to pass across, this function /// performs this using bounds. /// +/// TODO/WARNING: This only supports Fortran's column major indexing currently +/// as is noted in the note below and comments in the function, we must extend +/// this function when we add a C++ frontend. /// NOTE: which while specified in row-major order it currently needs to be /// flipped for Fortran's column order array allocation and access (as /// opposed to C++'s row-major, hence the backwards processing where order is @@ -4125,46 +4126,28 @@ calculateBoundsOffset(LLVM::ModuleTranslation &moduleTranslation, // with a pointer that's being treated like an array and we have the // underlying type e.g. an i32, or f64 etc, e.g. a fortran descriptor base // address (pointer pointing to the actual data) so we must caclulate the - // offset using a single index which the following two loops attempts to - // compute. - - // Calculates the size offset we need to make per row e.g. 
first row or - // column only needs to be offset by one, but the next would have to be - // the previous row/column offset multiplied by the extent of current row. + // offset using a single index which the following loop attempts to + // compute using the standard column-major algorithm e.g for a 3D array: // - // For example ([1][10][100]): + // ((((c_idx * b_len) + b_idx) * a_len) + a_idx) // - // - First row/column we move by 1 for each index increment - // - Second row/column we move by 1 (first row/column) * 10 (extent/size of - // current) for 10 for each index increment - // - Third row/column we would move by 10 (second row/column) * - // (extent/size of current) 100 for 1000 for each index increment - std::vector dimensionIndexSizeOffset{builder.getInt64(1)}; - for (size_t i = 1; i < bounds.size(); ++i) { - if (auto boundOp = dyn_cast_if_present( - bounds[i].getDefiningOp())) { - dimensionIndexSizeOffset.push_back(builder.CreateMul( - moduleTranslation.lookupValue(boundOp.getExtent()), - dimensionIndexSizeOffset[i - 1])); - } - } - - // Now that we have calculated how much we move by per index, we must - // multiply each lower bound offset in indexes by the size offset we - // have calculated in the previous and accumulate the results to get - // our final resulting offset. + // It is of note that it's doing column-major rather than row-major at the + // moment, but having a way for the frontend to indicate which major format + // to use or standardizing/canonicalizing the order of the bounds to compute + // the offset may be useful in the future when there's other frontends with + // different formats. 
+ std::vector dimensionIndexSizeOffset; for (int i = bounds.size() - 1; i >= 0; --i) { if (auto boundOp = dyn_cast_if_present( bounds[i].getDefiningOp())) { - if (idx.empty()) - idx.emplace_back(builder.CreateMul( - moduleTranslation.lookupValue(boundOp.getLowerBound()), - dimensionIndexSizeOffset[i])); + if (i == ((int)bounds.size() - 1)) + idx.emplace_back( + moduleTranslation.lookupValue(boundOp.getLowerBound())); else idx.back() = builder.CreateAdd( - idx.back(), builder.CreateMul(moduleTranslation.lookupValue( - boundOp.getLowerBound()), - dimensionIndexSizeOffset[i])); + builder.CreateMul(idx.back(), moduleTranslation.lookupValue( + boundOp.getExtent())), + moduleTranslation.lookupValue(boundOp.getLowerBound())); } } } diff --git a/mlir/test/Target/LLVMIR/omptarget-record-type-with-ptr-member-host.mlir b/mlir/test/Target/LLVMIR/omptarget-record-type-with-ptr-member-host.mlir index a1e415c35e4b6..9640f03311af7 100644 --- a/mlir/test/Target/LLVMIR/omptarget-record-type-with-ptr-member-host.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-record-type-with-ptr-member-host.mlir @@ -81,9 +81,8 @@ module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-a // CHECK: %[[ARR_SECT_SIZE:.*]] = mul i64 %[[ARR_SECT_SIZE1]], 4 // CHECK: %[[LFULL_ARR:.*]] = load ptr, ptr @full_arr, align 8 // CHECK: %[[FULL_ARR_PTR:.*]] = getelementptr inbounds float, ptr %[[LFULL_ARR]], i64 0 -// CHECK: %[[ARR_SECT_OFFSET1:.*]] = mul i64 %[[ARR_SECT_OFFSET2]], 1 // CHECK: %[[LARR_SECT:.*]] = load ptr, ptr @sect_arr, align 8 -// CHECK: %[[ARR_SECT_PTR:.*]] = getelementptr inbounds i32, ptr %[[LARR_SECT]], i64 %[[ARR_SECT_OFFSET1]] +// CHECK: %[[ARR_SECT_PTR:.*]] = getelementptr inbounds i32, ptr %[[LARR_SECT]], i64 %[[ARR_SECT_OFFSET2]] // CHECK: %[[SCALAR_PTR_LOAD:.*]] = load ptr, ptr %[[SCALAR_BASE]], align 8 // CHECK: %[[FULL_ARR_DESC_SIZE:.*]] = sdiv exact i64 48, ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) // CHECK: %[[FULL_ARR_SIZE_CMP:.*]] = icmp eq ptr 
%[[FULL_ARR_PTR]], null diff --git a/offload/test/offloading/fortran/descriptor-array-slice-map.f90 b/offload/test/offloading/fortran/descriptor-array-slice-map.f90 new file mode 100644 index 0000000000000..69abb320adc35 --- /dev/null +++ b/offload/test/offloading/fortran/descriptor-array-slice-map.f90 @@ -0,0 +1,61 @@ +! Offloading test which aims to test that an allocatable/descriptor type map +! will allow the appropriate slicing behaviour. +! REQUIRES: flang, amdgpu + +subroutine slice_writer(n, a, b, c) + implicit none + integer, intent(in) :: n + real(8), intent(in) :: a(n) + real(8), intent(in) :: b(n) + real(8), intent(out) :: c(n) + integer :: i + + !$omp target teams distribute parallel do + do i=1,n + c(i) = b(i) + a(i) + end do +end subroutine slice_writer + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +program main + implicit none + real(kind=8), allocatable :: a(:,:,:) + integer :: i, j, k, idx, idx1, idx2, idx3 + + i=50 + j=100 + k=2 + + allocate(a(1:i,1:j,1:k)) + + do idx1=1, i + do idx2=1, j + do idx3=1, k + a(idx1,idx2,idx3) = idx2 + end do + end do + end do + + do idx=1,k + !$omp target enter data map(alloc: a(1:i,:, idx)) + + !$omp target update to(a(1:i, 1:30, idx), & + !$omp& a(1:i, 61:100, idx)) + + call slice_writer(i, a(:, 1, idx), a(:, 61, idx), a(:, 31, idx)) + call slice_writer(i, a(:, 30, idx), a(:, 100, idx), a(:, 60, idx)) + + !$omp target update from(a(1:i, 31:60, idx)) + !$omp target exit data map(delete: a(1:i, :, idx)) + + print *, a(1, 31, idx), a(2, 31, idx), a(i, 31, idx) + print *, a(1, 60, idx), a(2, 60, idx), a(i, 60, idx) + enddo + + deallocate(a) +end program + +! CHECK: 62. 62. 62. +! CHECK: 130. 130. 130. +! CHECK: 62. 62. 62. +! CHECK: 130. 130. 130. 
From e52b5518b93db5ea5018403d3e15322808ed98ab Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 31 Oct 2025 00:01:50 +0000 Subject: [PATCH 272/539] [lit] Mark ulimit test as unsupported on Solaris 160058fc19a9bcb70feb442a755229838b4dbc7a broke the Solaris bots because they do not support RLIMIT_FSIZE despite it being in POSIX 2004. Disable it there for now as the loss of test coverage should not be significant. --- llvm/utils/lit/tests/shtest-ulimit-nondarwin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py index 893270ec68f68..286fd3d7e173e 100644 --- a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py +++ b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py @@ -2,7 +2,7 @@ # ulimit does not work on non-POSIX platforms. # These tests are specific to options that Darwin does not support. -# UNSUPPORTED: system-windows, system-darwin, system-aix +# UNSUPPORTED: system-windows, system-darwin, system-aix, system-solaris # RUN: not %{lit} -a -v %{inputs}/shtest-ulimit-nondarwin | FileCheck %s From db0b2ef704088a613711f8f7be78ca918b184c79 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Thu, 30 Oct 2025 17:18:18 -0700 Subject: [PATCH 273/539] Update Qualcomm email addresses. (#165799) Updating email IDs to align with employer mandate. 
--- clang/AreaTeamMembers.txt | 2 +- clang/Maintainers.rst | 2 +- llvm/Maintainers.md | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/AreaTeamMembers.txt b/clang/AreaTeamMembers.txt index 964d11e79f694..2928943f47533 100644 --- a/clang/AreaTeamMembers.txt +++ b/clang/AreaTeamMembers.txt @@ -13,5 +13,5 @@ rnk@google.com (email), rnk (Discourse), rnk (GitHub), rnk (Discord) Other Members ------------- Eli Friedman -efriedma@quicinc.com> (email), efriedma-quic (Discourse), efriedma-quic (GitHub) +efriedma@qti.qualcomm.com> (email), efriedma-quic (Discourse), efriedma-quic (GitHub) diff --git a/clang/Maintainers.rst b/clang/Maintainers.rst index 8fb2201aae16c..1d16ea9fe5638 100644 --- a/clang/Maintainers.rst +++ b/clang/Maintainers.rst @@ -46,7 +46,7 @@ Clang LLVM IR generation | rjmccall\@apple.com (email), rjmccall (Phabricator), rjmccall (GitHub) | Eli Friedman -| efriedma\@quicinc.com (email), efriedma (Phabricator), efriedma-quic (GitHub) +| efriedma\@qti.qualcomm.com (email), efriedma (Phabricator), efriedma-quic (GitHub) | Anton Korobeynikov | anton\@korobeynikov.info (email), asl (Phabricator), asl (GitHub) diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index e52259236fc19..1eba955f9d6ed 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -197,7 +197,7 @@ david.green@arm.com (email), [davemgreen](https://github.com/davemgreen) (GitHub Amara Emerson (esp. AArch64 GlobalISel) \ amara@apple.com (email), [aemerson](https://github.com/aemerson) (GitHub) \ Eli Friedman (esp. 
ARM64EC) \ -efriedma@quicinc.com (email), [efriedma-quic](https://github.com/efriedma-quic) (GitHub) \ +efriedma@qti.qualcomm.com (email), [efriedma-quic](https://github.com/efriedma-quic) (GitHub) \ Sjoerd Meijer \ smeijer@nvidia.com (email), [sjoerdmeijer](https://github.com/sjoerdmeijer) (GitHub) \ Nashe Mncube \ @@ -246,7 +246,7 @@ mail@justinbogner.com (email), [bogner](https://github.com/bogner) (GitHub) #### Hexagon backend Sundeep Kushwaha \ -sundeepk@quicinc.com (email), [SundeepKushwaha](https://github.com/SundeepKushwaha) (GitHub) +sundeepk@qti.qualcomm.com (email), [SundeepKushwaha](https://github.com/SundeepKushwaha) (GitHub) #### Lanai backend From 69d71f36c123ef00f94dc2b635c49c9e206472b2 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 30 Oct 2025 17:30:22 -0700 Subject: [PATCH 274/539] Reapply "[TSan] Make Test work with Internal Shell" This reverts commit 39f08eb997424626bd396a0529daf4ab816d19e6. This was causing buildbot failures because we were using an explicit python call instead of the python substitution. This leads to failures on platforms that do not have a binary called python. 
--- compiler-rt/test/tsan/ignore_lib0.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/compiler-rt/test/tsan/ignore_lib0.cpp b/compiler-rt/test/tsan/ignore_lib0.cpp index cba58c6177038..9c4919022b512 100644 --- a/compiler-rt/test/tsan/ignore_lib0.cpp +++ b/compiler-rt/test/tsan/ignore_lib0.cpp @@ -4,11 +4,13 @@ // RUN: %clangxx_tsan -O1 -fno-builtin %s -DLIB -fPIC -fno-sanitize=thread -shared -o %t-dir/libignore_lib0.so // RUN: %clangxx_tsan -O1 %s -L%t-dir -lignore_lib0 %link_libcxx_tsan -o %t // RUN: echo running w/o suppressions: -// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %deflake %run %t | FileCheck %s --check-prefix=CHECK-NOSUPP +// RUN: echo -n %t-dir > %t.ld_library_path +// RUN: %python -c "if 'LD_LIBRARY_PATH' in __import__('os').environ: print(':' + __import__('os').environ['LD_LIBRARY_PATH'], end='')" >> %t.ld_library_path +// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %deflake %run %t | FileCheck %s --check-prefix=CHECK-NOSUPP // RUN: echo running with suppressions: -// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %env_tsan_opts=suppressions='%s.supp' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP +// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %env_tsan_opts=suppressions='%s.supp' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP // RUN: echo running with generic suppression of noninstrumented code: -// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %env_tsan_opts=ignore_noninstrumented_modules=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP +// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %env_tsan_opts=ignore_noninstrumented_modules=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP // Tests that interceptors coming from a library specified in called_from_lib // suppression are ignored. From eaf7c949d06373fae84071b25079ab92ba136b04 Mon Sep 17 00:00:00 2001 From: "A. 
Jiang" Date: Fri, 31 Oct 2025 10:07:38 +0800 Subject: [PATCH 275/539] [libc++][test] Make `deallocate_size.pass.cpp` MSVC-friendly (#165162) This patch contains several changes to `deallocate_size.pass.cpp`: 1. `static_cast`-ing some parameters to `size_t` to avoid narrowing. 2. Changing the type of loop variable `i` to `unsigned int` avoid signedness mismatch with the constructor parameter. 3. Separately counting allocations and deallocations in variables `allocated_` and `deallocated_`, and changing their type to `uint64_t`. 4. Avoiding `assert`-ing count of allocations when a `basic_string` is allocated, just `assert`-ing after destruction instead. --- .../string.capacity/deallocate_size.pass.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/libcxx/test/std/strings/basic.string/string.capacity/deallocate_size.pass.cpp b/libcxx/test/std/strings/basic.string/string.capacity/deallocate_size.pass.cpp index 00f9e2b846783..ecdc39701641d 100644 --- a/libcxx/test/std/strings/basic.string/string.capacity/deallocate_size.pass.cpp +++ b/libcxx/test/std/strings/basic.string/string.capacity/deallocate_size.pass.cpp @@ -12,12 +12,14 @@ #include #include +#include #include #include #include "test_macros.h" -static int allocated_; +static std::uint64_t allocated_; +static std::uint64_t deallocated_; template struct test_alloc { @@ -41,12 +43,12 @@ struct test_alloc { pointer allocate(size_type n, const void* = nullptr) { allocated_ += n; - return std::allocator().allocate(n); + return std::allocator().allocate(static_cast(n)); } void deallocate(pointer p, size_type s) { - allocated_ -= s; - std::allocator().deallocate(p, s); + deallocated_ += s; + std::allocator().deallocate(p, static_cast(s)); } template @@ -64,14 +66,13 @@ struct test_alloc { template void test() { - for (int i = 1; i < 1000; ++i) { - using Str = std::basic_string, test_alloc >; + for (unsigned int i = 1; i < 1000; ++i) { { - Str s(i, 't'); - assert(allocated_ == 0 || 
allocated_ >= i); + std::basic_string, test_alloc > s(i, 't'); + (void)s; } + assert(allocated_ == deallocated_); } - assert(allocated_ == 0); } int main(int, char**) { From f57dd864fe38815516a49850f2f6cd2a1beb04b9 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 31 Oct 2025 02:22:55 +0000 Subject: [PATCH 276/539] [VPlan] Limit VPScalarIVSteps to step == 1 in getSCEVExprForVPValue. For now, just support VPScalarIVSteps with step == 1 in getSCEVExprForVPValue. This fixes a crash when the step would be != 1. --- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 3 +- .../AArch64/replicating-load-store-costs.ll | 108 ++++++++++++++++++ 2 files changed, 110 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 54348c6e34488..8c23e78693db5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -108,7 +108,8 @@ const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V, .Case([&SE, L](const VPScalarIVStepsRecipe *R) { const SCEV *IV = getSCEVExprForVPValue(R->getOperand(0), SE, L); const SCEV *Step = getSCEVExprForVPValue(R->getOperand(1), SE, L); - if (isa(IV) || isa(Step)) + if (isa(IV) || isa(Step) || + !Step->isOne()) return SE.getCouldNotCompute(); return SE.getMulExpr(SE.getTruncateOrSignExtend(IV, Step->getType()), Step); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll index 7f345133f51dd..68cfc659e1e94 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll @@ -660,6 +660,114 @@ exit: ret i32 %red } + +define i32 @test_or_reduction_with_stride_2(i32 %scale, ptr %src) { +; CHECK-LABEL: define i32 @test_or_reduction_with_stride_2( +; CHECK-SAME: i32 [[SCALE:%.*]], ptr [[SRC:%.*]]) { +; 
CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[SCALE]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP66:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 16 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 18 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 20 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 22 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 24 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 26 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 28 +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 30 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 
[[TMP4]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = load i8, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i8, ptr [[TMP19]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i8, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP23]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = load i8, ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[TMP26]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP27]], align 1 +; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP28]], align 1 +; CHECK-NEXT: [[TMP45:%.*]] = load i8, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[TMP46:%.*]] = load i8, ptr [[TMP30]], align 1 +; CHECK-NEXT: [[TMP47:%.*]] = load i8, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = insertelement <16 x i8> poison, i8 
[[TMP32]], i32 0 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <16 x i8> [[TMP48]], i8 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP49]], i8 [[TMP34]], i32 2 +; CHECK-NEXT: [[TMP51:%.*]] = insertelement <16 x i8> [[TMP50]], i8 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP36]], i32 4 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <16 x i8> [[TMP52]], i8 [[TMP37]], i32 5 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <16 x i8> [[TMP53]], i8 [[TMP38]], i32 6 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP54]], i8 [[TMP39]], i32 7 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <16 x i8> [[TMP55]], i8 [[TMP40]], i32 8 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP41]], i32 9 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <16 x i8> [[TMP57]], i8 [[TMP42]], i32 10 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <16 x i8> [[TMP58]], i8 [[TMP43]], i32 11 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP59]], i8 [[TMP44]], i32 12 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <16 x i8> [[TMP60]], i8 [[TMP45]], i32 13 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP46]], i32 14 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <16 x i8> [[TMP62]], i8 [[TMP47]], i32 15 +; CHECK-NEXT: [[TMP64:%.*]] = sext <16 x i8> [[TMP63]] to <16 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = mul <16 x i32> [[BROADCAST_SPLAT]], [[TMP64]] +; CHECK-NEXT: [[TMP66]] = or <16 x i32> [[TMP65]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP67:%.*]] = icmp eq i64 [[INDEX_NEXT]], 48 +; CHECK-NEXT: br i1 [[TMP67]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP68:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP66]]) +; CHECK-NEXT: br label %[[SCALAR_PH:.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 
%iv.next, %loop ], [ 0, %entry ] + %reduction = phi i32 [ %reduction.next, %loop ], [ 0, %entry ] + %gep = getelementptr [32 x i8], ptr %src, i64 %iv + %load = load i8, ptr %gep, align 1 + %sext = sext i8 %load to i32 + %mul = mul i32 %scale, %sext + %reduction.next = or i32 %mul, %reduction + %iv.next = add i64 %iv, 2 + %cmp = icmp eq i64 %iv.next, 100 + br i1 %cmp, label %exit, label %loop + +exit: + ret i32 %reduction.next +} + attributes #0 = { "target-cpu"="neoverse-512tvb" } !0 = !{!1, !2, i64 0} From 78fb5f7e1f570a417988f98af581226d1f576978 Mon Sep 17 00:00:00 2001 From: Zhaoxin Yang Date: Fri, 31 Oct 2025 10:25:51 +0800 Subject: [PATCH 277/539] [LoongArch] Lowering flog2 to flogb (#162978) According to LoongArch ISA Volume 1 V1.11, FLOGB.S/D is unsupported in LA32. --- .../LoongArch/LoongArchFloat32InstrInfo.td | 1 + .../LoongArch/LoongArchFloat64InstrInfo.td | 1 + .../LoongArch/LoongArchISelLowering.cpp | 10 +- .../LoongArch/LoongArchLASXInstrInfo.td | 3 + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 3 + .../CodeGen/LoongArch/ir-instruction/flog2.ll | 8 +- .../LoongArch/lasx/ir-instruction/flog2.ll | 258 +----------------- .../LoongArch/lsx/ir-instruction/flog2.ll | 156 +---------- 8 files changed, 48 insertions(+), 392 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index 690dd73014e57..e86b21cf849cb 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -365,6 +365,7 @@ def : Pat<(f32 (uint_to_fp (i64 (sexti32 (i64 GPR:$src))))), // FP Rounding let Predicates = [HasBasicF, IsLA64] in { def : PatFpr; +def : PatFpr; } // Predicates = [HasBasicF, IsLA64] let Predicates = [HasBasicF, IsLA32] in { diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td index daefbaa52d42a..2e88254aab4d5 100644 --- 
a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td @@ -348,6 +348,7 @@ def : Pat<(bitconvert FPR64:$src), (MOVFR2GR_D FPR64:$src)>; // FP Rounding let Predicates = [HasBasicD, IsLA64] in { def : PatFpr; +def : PatFpr; } // Predicates = [HasBasicD, IsLA64] /// Pseudo-instructions needed for the soft-float ABI with LA32D diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 80c96c6dc8eb6..a6de839de7c28 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -244,8 +244,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_BF16, MVT::f32, Subtarget.isSoftFPABI() ? LibCall : Custom); - if (Subtarget.is64Bit()) + if (Subtarget.is64Bit()) { setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FLOG2, MVT::f32, Legal); + } if (!Subtarget.hasBasicD()) { setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); @@ -291,8 +293,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_BF16, MVT::f64, Subtarget.isSoftFPABI() ? LibCall : Custom); - if (Subtarget.is64Bit()) + if (Subtarget.is64Bit()) { setOperationAction(ISD::FRINT, MVT::f64, Legal); + setOperationAction(ISD::FLOG2, MVT::f64, Legal); + } } // Set operations for 'LSX' feature. 
@@ -362,6 +366,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::FSQRT, VT, Legal); setOperationAction(ISD::FNEG, VT, Legal); + setOperationAction(ISD::FLOG2, VT, Legal); setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); @@ -443,6 +448,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::FSQRT, VT, Legal); setOperationAction(ISD::FNEG, VT, Legal); + setOperationAction(ISD::FLOG2, VT, Legal); setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 613dea6093f5f..ddf91ca54e1e0 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1593,6 +1593,9 @@ def : Pat<(fma_nsz (fneg v4f64:$xj), v4f64:$xk, v4f64:$xa), // XVFSQRT_{S/D} defm : PatXrF; +// XVFLOGB_{S/D} +defm : PatXrF; + // XVRECIP_{S/D} def : Pat<(fdiv vsplatf32_fpimm_eq_1, v8f32:$xj), (XVFRECIP_S v8f32:$xj)>; diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 4619c6bd248a6..ba1204d620575 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1783,6 +1783,9 @@ def : Pat<(fma_nsz (fneg v2f64:$vj), v2f64:$vk, v2f64:$va), // VFSQRT_{S/D} defm : PatVrF; +// VFLOGB_{S/D} +defm : PatVrF; + // VFRECIP_{S/D} def : Pat<(fdiv vsplatf32_fpimm_eq_1, v4f32:$vj), (VFRECIP_S v4f32:$vj)>; diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/flog2.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/flog2.ll index 93fcd421e4bd7..e02a2e7cce9b2 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/flog2.ll 
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/flog2.ll @@ -12,8 +12,8 @@ define float @flog2_s(float %x) nounwind { ; ; LA64-LABEL: flog2_s: ; LA64: # %bb.0: -; LA64-NEXT: pcaddu18i $t8, %call36(log2f) -; LA64-NEXT: jr $t8 +; LA64-NEXT: flogb.s $fa0, $fa0 +; LA64-NEXT: ret %y = call float @llvm.log2.f32(float %x) ret float %y } @@ -25,8 +25,8 @@ define double @flog2_d(double %x) nounwind { ; ; LA64-LABEL: flog2_d: ; LA64: # %bb.0: -; LA64-NEXT: pcaddu18i $t8, %call36(log2) -; LA64-NEXT: jr $t8 +; LA64-NEXT: flogb.d $fa0, $fa0 +; LA64-NEXT: ret %y = call double @llvm.log2.f64(double %x) ret double %y } diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/flog2.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/flog2.ll index 68f2e3ab488e1..6b5f5751e5706 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/flog2.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/flog2.ll @@ -1,166 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefix=LA32 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefix=LA64 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s declare <8 x float> @llvm.log2.v8f32(<8 x float>) declare <4 x double> @llvm.log2.v4f64(<4 x double>) define void @flog2_v8f32(ptr %res, ptr %a) nounwind { -; LA32-LABEL: flog2_v8f32: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $sp, $sp, -128 -; LA32-NEXT: st.w $ra, $sp, 124 # 4-byte Folded Spill -; LA32-NEXT: st.w $fp, $sp, 120 # 4-byte Folded Spill -; LA32-NEXT: xvld $xr0, $a1, 0 -; LA32-NEXT: xvst $xr0, $sp, 80 # 32-byte Folded Spill -; LA32-NEXT: move $fp, $a0 -; LA32-NEXT: xvpickve.w $xr0, $xr0, 5 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; 
LA32-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 4 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $xr0 -; LA32-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload -; LA32-NEXT: vextrins.w $vr0, $vr1, 16 -; LA32-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 6 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 32 -; LA32-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 7 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 48 -; LA32-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 1 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 0 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $xr0 -; LA32-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vextrins.w $vr0, $vr1, 16 -; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 2 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f 
-; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 32 -; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 3 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 48 -; LA32-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload -; LA32-NEXT: xvpermi.q $xr1, $xr0, 2 -; LA32-NEXT: xvst $xr1, $fp, 0 -; LA32-NEXT: ld.w $fp, $sp, 120 # 4-byte Folded Reload -; LA32-NEXT: ld.w $ra, $sp, 124 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 128 -; LA32-NEXT: ret -; -; LA64-LABEL: flog2_v8f32: -; LA64: # %bb.0: # %entry -; LA64-NEXT: addi.d $sp, $sp, -128 -; LA64-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill -; LA64-NEXT: xvld $xr0, $a1, 0 -; LA64-NEXT: xvst $xr0, $sp, 80 # 32-byte Folded Spill -; LA64-NEXT: move $fp, $a0 -; LA64-NEXT: xvpickve.w $xr0, $xr0, 5 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 4 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $xr0 -; LA64-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload -; LA64-NEXT: vextrins.w $vr0, $vr1, 16 -; LA64-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 6 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: 
pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 32 -; LA64-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 7 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 48 -; LA64-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 1 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $xr0 -; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vextrins.w $vr0, $vr1, 16 -; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 2 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 32 -; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 3 -; LA64-NEXT: # kill: def $f0 
killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 48 -; LA64-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload -; LA64-NEXT: xvpermi.q $xr1, $xr0, 2 -; LA64-NEXT: xvst $xr1, $fp, 0 -; LA64-NEXT: ld.d $fp, $sp, 112 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 128 -; LA64-NEXT: ret +; CHECK-LABEL: flog2_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvflogb.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret entry: %v = load <8 x float>, ptr %a %r = call <8 x float> @llvm.log2.v8f32(<8 x float> %v) @@ -169,93 +20,12 @@ entry: } define void @flog2_v4f64(ptr %res, ptr %a) nounwind { -; LA32-LABEL: flog2_v4f64: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $sp, $sp, -112 -; LA32-NEXT: st.w $ra, $sp, 108 # 4-byte Folded Spill -; LA32-NEXT: st.w $fp, $sp, 104 # 4-byte Folded Spill -; LA32-NEXT: xvld $xr0, $a1, 0 -; LA32-NEXT: xvst $xr0, $sp, 64 # 32-byte Folded Spill -; LA32-NEXT: move $fp, $a0 -; LA32-NEXT: xvpickve.d $xr0, $xr0, 3 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.d $xr0, $xr0, 2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; LA32-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload -; LA32-NEXT: vextrins.d $vr0, $vr1, 16 -; LA32-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.d $xr0, $xr0, 1 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA32-NEXT: bl 
log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.d $xr0, $xr0, 0 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; LA32-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vextrins.d $vr0, $vr1, 16 -; LA32-NEXT: xvld $xr1, $sp, 32 # 32-byte Folded Reload -; LA32-NEXT: xvpermi.q $xr0, $xr1, 2 -; LA32-NEXT: xvst $xr0, $fp, 0 -; LA32-NEXT: ld.w $fp, $sp, 104 # 4-byte Folded Reload -; LA32-NEXT: ld.w $ra, $sp, 108 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 112 -; LA32-NEXT: ret -; -; LA64-LABEL: flog2_v4f64: -; LA64: # %bb.0: # %entry -; LA64-NEXT: addi.d $sp, $sp, -112 -; LA64-NEXT: st.d $ra, $sp, 104 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 96 # 8-byte Folded Spill -; LA64-NEXT: xvld $xr0, $a1, 0 -; LA64-NEXT: xvst $xr0, $sp, 64 # 32-byte Folded Spill -; LA64-NEXT: move $fp, $a0 -; LA64-NEXT: xvpickve.d $xr0, $xr0, 3 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.d $xr0, $xr0, 2 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload -; LA64-NEXT: vextrins.d $vr0, $vr1, 16 -; LA64-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.d $xr0, $xr0, 1 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2) -; LA64-NEXT: jirl $ra, 
$ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.d $xr0, $xr0, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vextrins.d $vr0, $vr1, 16 -; LA64-NEXT: xvld $xr1, $sp, 32 # 32-byte Folded Reload -; LA64-NEXT: xvpermi.q $xr0, $xr1, 2 -; LA64-NEXT: xvst $xr0, $fp, 0 -; LA64-NEXT: ld.d $fp, $sp, 96 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 104 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 112 -; LA64-NEXT: ret +; CHECK-LABEL: flog2_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvflogb.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret entry: %v = load <4 x double>, ptr %a %r = call <4 x double> @llvm.log2.v4f64(<4 x double> %v) diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/flog2.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/flog2.ll index e5e75ec617b51..87cc7c6dbc708 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/flog2.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/flog2.ll @@ -1,98 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefix=LA32 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefix=LA64 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s declare <4 x float> @llvm.log2.v4f32(<4 x float>) declare <2 x double> @llvm.log2.v2f64(<2 x double>) define void @flog2_v4f32(ptr %res, ptr %a) nounwind { -; LA32-LABEL: flog2_v4f32: -; LA32: # %bb.0: # 
%entry -; LA32-NEXT: addi.w $sp, $sp, -48 -; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill -; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill -; LA32-NEXT: vld $vr0, $a1, 0 -; LA32-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA32-NEXT: move $fp, $a0 -; LA32-NEXT: vreplvei.w $vr0, $vr0, 1 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA32-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vreplvei.w $vr0, $vr0, 0 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload -; LA32-NEXT: vextrins.w $vr0, $vr1, 16 -; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA32-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vreplvei.w $vr0, $vr0, 2 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 32 -; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill -; LA32-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vreplvei.w $vr0, $vr0, 3 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 48 -; LA32-NEXT: vst $vr1, $fp, 0 -; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload -; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 48 -; LA32-NEXT: ret -; -; LA64-LABEL: flog2_v4f32: -; LA64: # %bb.0: # %entry -; LA64-NEXT: addi.d $sp, $sp, -48 -; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64-NEXT: vld $vr0, $a1, 0 -; LA64-NEXT: vst $vr0, $sp, 16 # 
16-byte Folded Spill -; LA64-NEXT: move $fp, $a0 -; LA64-NEXT: vreplvei.w $vr0, $vr0, 1 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vreplvei.w $vr0, $vr0, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload -; LA64-NEXT: vextrins.w $vr0, $vr1, 16 -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vreplvei.w $vr0, $vr0, 2 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 32 -; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vreplvei.w $vr0, $vr0, 3 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 48 -; LA64-NEXT: vst $vr1, $fp, 0 -; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 48 -; LA64-NEXT: ret +; CHECK-LABEL: flog2_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vflogb.s $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret entry: %v = load <4 x float>, ptr %a %r = call <4 x float> @llvm.log2.v4f32(<4 x float> %v) @@ 
-101,59 +20,12 @@ entry: } define void @flog2_v2f64(ptr %res, ptr %a) nounwind { -; LA32-LABEL: flog2_v2f64: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $sp, $sp, -48 -; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill -; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill -; LA32-NEXT: vld $vr0, $a1, 0 -; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA32-NEXT: move $fp, $a0 -; LA32-NEXT: vreplvei.d $vr0, $vr0, 1 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload -; LA32-NEXT: vreplvei.d $vr0, $vr0, 0 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA32-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vextrins.d $vr0, $vr1, 16 -; LA32-NEXT: vst $vr0, $fp, 0 -; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload -; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 48 -; LA32-NEXT: ret -; -; LA64-LABEL: flog2_v2f64: -; LA64: # %bb.0: # %entry -; LA64-NEXT: addi.d $sp, $sp, -48 -; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64-NEXT: vld $vr0, $a1, 0 -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA64-NEXT: move $fp, $a0 -; LA64-NEXT: vreplvei.d $vr0, $vr0, 1 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload -; LA64-NEXT: vreplvei.d $vr0, $vr0, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 
killed $f0_64 def $vr0 -; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vextrins.d $vr0, $vr1, 16 -; LA64-NEXT: vst $vr0, $fp, 0 -; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 48 -; LA64-NEXT: ret +; CHECK-LABEL: flog2_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vflogb.d $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret entry: %v = load <2 x double>, ptr %a %r = call <2 x double> @llvm.log2.v2f64(<2 x double> %v) From 69fa380079df618e31c2158564775b23a1c6cce6 Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Fri, 31 Oct 2025 10:27:01 +0800 Subject: [PATCH 278/539] [LoongArch] Add patterns to support vector type average instructions generation (#161079) NOTE: For simplicity and convenience, `v2i64/v4i64` types on LA32 is not optimized. If hoping to implement this in the future, special handling for `bitcast` and `build_vector` is needed. 
--- .../LoongArch/LoongArchLASXInstrInfo.td | 18 +++ .../Target/LoongArch/LoongArchLSXInstrInfo.td | 30 ++++ .../LoongArch/lasx/ir-instruction/avg.ll | 146 ++++++++++-------- .../LoongArch/lsx/ir-instruction/avg.ll | 146 ++++++++++-------- 4 files changed, 208 insertions(+), 132 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index ddf91ca54e1e0..ca4ee5f89573a 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -2027,6 +2027,24 @@ def : Pat<(v4i32(fp_to_uint v4f64:$vj)), (XVFTINTRZ_LU_D v4f64:$vj)), sub_128)>; +// XVAVG_{B/H/W/D/BU/HU/WU/DU}, XVAVGR_{B/H/W/D/BU/HU/WU/DU} +defm : VAvgPat; +defm : VAvgPat; +defm : VAvgPat; +defm : VAvgPat; +defm : VAvgPat; +defm : VAvgPat; +defm : VAvgPat; +defm : VAvgPat; +defm : VAvgrPat; +defm : VAvgrPat; +defm : VAvgrPat; +defm : VAvgrPat; +defm : VAvgrPat; +defm : VAvgrPat; +defm : VAvgrPat; +defm : VAvgrPat; + // abs def : Pat<(abs v32i8:$xj), (XVSIGNCOV_B v32i8:$xj, v32i8:$xj)>; def : Pat<(abs v16i16:$xj), (XVSIGNCOV_H v16i16:$xj, v16i16:$xj)>; diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index ba1204d620575..92402baa0fa0f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1518,6 +1518,18 @@ multiclass InsertExtractPatV2 { } } +multiclass VAvgPat { + def : Pat<(OpNode (vt (add vt:$vj, vt:$vk)), (vt (vsplat_imm_eq_1))), + (!cast(Inst) vt:$vj, vt:$vk)>; +} + +multiclass VAvgrPat { + def : Pat<(OpNode (vt (add (vt (add vt:$vj, vt:$vk)), + (vt (vsplat_imm_eq_1)))), + (vt (vsplat_imm_eq_1))), + (!cast(Inst) vt:$vj, vt:$vk)>; +} + let Predicates = [HasExtLSX] in { // VADD_{B/H/W/D} @@ -2157,6 +2169,24 @@ def : Pat<(f32 f32imm_vldi:$in), def : Pat<(f64 f64imm_vldi:$in), (f64 (EXTRACT_SUBREG (VLDI (to_f64imm_vldi f64imm_vldi:$in)), 
sub_64))>; +// VAVG_{B/H/W/D/BU/HU/WU/DU}, VAVGR_{B/H/W/D/BU/HU/WU/DU} +defm : VAvgPat; +defm : VAvgPat; +defm : VAvgPat; +defm : VAvgPat; +defm : VAvgPat; +defm : VAvgPat; +defm : VAvgPat; +defm : VAvgPat; +defm : VAvgrPat; +defm : VAvgrPat; +defm : VAvgrPat; +defm : VAvgrPat; +defm : VAvgrPat; +defm : VAvgrPat; +defm : VAvgrPat; +defm : VAvgrPat; + // abs def : Pat<(abs v16i8:$vj), (VSIGNCOV_B v16i8:$vj, v16i8:$vj)>; def : Pat<(abs v8i16:$vj), (VSIGNCOV_H v8i16:$vj, v8i16:$vj)>; diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll index 2a5a8fa05d646..5c5c19935080b 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll @@ -1,14 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @xvavg_b(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK-LABEL: xvavg_b: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrai.b $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.b $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -25,8 +24,7 @@ define void @xvavg_h(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrai.h $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.h $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -43,8 +41,7 @@ define void @xvavg_w(ptr %res, ptr %a, ptr %b) nounwind { ; 
CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrai.w $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.w $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -57,14 +54,22 @@ entry: } define void @xvavg_d(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: xvavg_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrai.d $xr0, $xr0, 1 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: xvavg_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvadd.d $xr0, $xr0, $xr1 +; LA32-NEXT: xvsrai.d $xr0, $xr0, 1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvavg_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvavg.d $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <4 x i64>, ptr %a %vb = load <4 x i64>, ptr %b @@ -79,8 +84,7 @@ define void @xvavg_bu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.bu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -97,8 +101,7 @@ define void @xvavg_hu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrli.h $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.hu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -115,8 +118,7 @@ define void @xvavg_wu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.w 
$xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrli.w $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.wu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -129,14 +131,22 @@ entry: } define void @xvavg_du(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: xvavg_du: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrli.d $xr0, $xr0, 1 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: xvavg_du: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvadd.d $xr0, $xr0, $xr1 +; LA32-NEXT: xvsrli.d $xr0, $xr0, 1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvavg_du: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvavg.du $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <4 x i64>, ptr %a %vb = load <4 x i64>, ptr %b @@ -151,9 +161,7 @@ define void @xvavgr_b(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.bu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrai.b $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.b $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -171,9 +179,7 @@ define void @xvavgr_h(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.hu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrai.h $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.h $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -191,9 +197,7 @@ define void @xvavgr_w(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 -; 
CHECK-NEXT: xvaddi.wu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrai.w $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.w $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -207,15 +211,23 @@ entry: } define void @xvavgr_d(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: xvavgr_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.du $xr0, $xr0, 1 -; CHECK-NEXT: xvsrai.d $xr0, $xr0, 1 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: xvavgr_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvadd.d $xr0, $xr0, $xr1 +; LA32-NEXT: xvaddi.du $xr0, $xr0, 1 +; LA32-NEXT: xvsrai.d $xr0, $xr0, 1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvavgr_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvavgr.d $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <4 x i64>, ptr %a %vb = load <4 x i64>, ptr %b @@ -231,9 +243,7 @@ define void @xvavgr_bu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.bu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.bu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -251,9 +261,7 @@ define void @xvavgr_hu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.hu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrli.h $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.hu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -271,9 +279,7 @@ define void @xvavgr_wu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.wu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrli.w $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.wu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -287,15 +293,23 @@ entry: } define void @xvavgr_du(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: xvavgr_du: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.du $xr0, $xr0, 1 -; CHECK-NEXT: xvsrli.d $xr0, $xr0, 1 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: xvavgr_du: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvadd.d $xr0, $xr0, $xr1 +; LA32-NEXT: xvaddi.du $xr0, $xr0, 1 +; LA32-NEXT: xvsrli.d $xr0, $xr0, 1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvavgr_du: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvavgr.du $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <4 x i64>, ptr %a %vb = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll index 20b8898436cc4..334af22edee59 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll @@ -1,14 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s -; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @vavg_b(ptr %res, ptr %a, ptr %b) nounwind { ; 
CHECK-LABEL: vavg_b: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrai.b $vr0, $vr0, 1 +; CHECK-NEXT: vavg.b $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -25,8 +24,7 @@ define void @vavg_h(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrai.h $vr0, $vr0, 1 +; CHECK-NEXT: vavg.h $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -43,8 +41,7 @@ define void @vavg_w(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrai.w $vr0, $vr0, 1 +; CHECK-NEXT: vavg.w $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -57,14 +54,22 @@ entry: } define void @vavg_d(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: vavg_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrai.d $vr0, $vr0, 1 -; CHECK-NEXT: vst $vr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vavg_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vadd.d $vr0, $vr0, $vr1 +; LA32-NEXT: vsrai.d $vr0, $vr0, 1 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vavg_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: vld $vr1, $a2, 0 +; LA64-NEXT: vavg.d $vr0, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <2 x i64>, ptr %a %vb = load <2 x i64>, ptr %b @@ -79,8 +84,7 @@ define void @vavg_bu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 -; 
CHECK-NEXT: vsrli.b $vr0, $vr0, 1 +; CHECK-NEXT: vavg.bu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -97,8 +101,7 @@ define void @vavg_hu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrli.h $vr0, $vr0, 1 +; CHECK-NEXT: vavg.hu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -115,8 +118,7 @@ define void @vavg_wu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrli.w $vr0, $vr0, 1 +; CHECK-NEXT: vavg.wu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -129,14 +131,22 @@ entry: } define void @vavg_du(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: vavg_du: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrli.d $vr0, $vr0, 1 -; CHECK-NEXT: vst $vr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vavg_du: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vadd.d $vr0, $vr0, $vr1 +; LA32-NEXT: vsrli.d $vr0, $vr0, 1 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vavg_du: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: vld $vr1, $a2, 0 +; LA64-NEXT: vavg.du $vr0, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <2 x i64>, ptr %a %vb = load <2 x i64>, ptr %b @@ -151,9 +161,7 @@ define void @vavgr_b(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.bu $vr0, $vr0, 1 -; CHECK-NEXT: vsrai.b $vr0, $vr0, 1 +; CHECK-NEXT: vavgr.b $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; 
CHECK-NEXT: ret entry: @@ -171,9 +179,7 @@ define void @vavgr_h(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.hu $vr0, $vr0, 1 -; CHECK-NEXT: vsrai.h $vr0, $vr0, 1 +; CHECK-NEXT: vavgr.h $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -191,9 +197,7 @@ define void @vavgr_w(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.wu $vr0, $vr0, 1 -; CHECK-NEXT: vsrai.w $vr0, $vr0, 1 +; CHECK-NEXT: vavgr.w $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -207,15 +211,23 @@ entry: } define void @vavgr_d(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: vavgr_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.du $vr0, $vr0, 1 -; CHECK-NEXT: vsrai.d $vr0, $vr0, 1 -; CHECK-NEXT: vst $vr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vavgr_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vadd.d $vr0, $vr0, $vr1 +; LA32-NEXT: vaddi.du $vr0, $vr0, 1 +; LA32-NEXT: vsrai.d $vr0, $vr0, 1 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vavgr_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: vld $vr1, $a2, 0 +; LA64-NEXT: vavgr.d $vr0, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <2 x i64>, ptr %a %vb = load <2 x i64>, ptr %b @@ -231,9 +243,7 @@ define void @vavgr_bu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.bu $vr0, $vr0, 1 -; CHECK-NEXT: vsrli.b $vr0, $vr0, 1 +; CHECK-NEXT: vavgr.bu 
$vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -251,9 +261,7 @@ define void @vavgr_hu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.hu $vr0, $vr0, 1 -; CHECK-NEXT: vsrli.h $vr0, $vr0, 1 +; CHECK-NEXT: vavgr.hu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -271,9 +279,7 @@ define void @vavgr_wu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.wu $vr0, $vr0, 1 -; CHECK-NEXT: vsrli.w $vr0, $vr0, 1 +; CHECK-NEXT: vavgr.wu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -287,15 +293,23 @@ entry: } define void @vavgr_du(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: vavgr_du: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.du $vr0, $vr0, 1 -; CHECK-NEXT: vsrli.d $vr0, $vr0, 1 -; CHECK-NEXT: vst $vr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vavgr_du: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vadd.d $vr0, $vr0, $vr1 +; LA32-NEXT: vaddi.du $vr0, $vr0, 1 +; LA32-NEXT: vsrli.d $vr0, $vr0, 1 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vavgr_du: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: vld $vr1, $a2, 0 +; LA64-NEXT: vavgr.du $vr0, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <2 x i64>, ptr %a %vb = load <2 x i64>, ptr %b From 0aae572ba684cc6c79e0ab69c3a42ea1c602824d Mon Sep 17 00:00:00 2001 From: Jinjie Huang Date: Fri, 31 Oct 2025 10:29:00 +0800 Subject: [PATCH 279/539] [BOLT] Add constant island check in scanExternalRefs() (#165577) The [previous 
patch](https://github.com/llvm/llvm-project/pull/163418) has added a check to prevent adding an entry point into a constant island, but only for successfully disassembled functions. Because scanExternalRefs() is also called when a function fails to be disassembled or is skipped, it can still attempt to add an entry point at constant islands. The same issue may occur if without a check for it So, this patch complements the 'constant island' check in scanExternalRefs(). --- bolt/lib/Core/BinaryFunction.cpp | 16 +++++++++++++--- bolt/test/AArch64/constant-island-entry.s | 9 +++++++-- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 84023efe1084e..fbe186454351c 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -1699,9 +1699,19 @@ bool BinaryFunction::scanExternalRefs() { const uint64_t FunctionOffset = TargetAddress - TargetFunction->getAddress(); - BranchTargetSymbol = - FunctionOffset ? TargetFunction->addEntryPointAtOffset(FunctionOffset) - : TargetFunction->getSymbol(); + if (!TargetFunction->isInConstantIsland(TargetAddress)) { + BranchTargetSymbol = + FunctionOffset + ? TargetFunction->addEntryPointAtOffset(FunctionOffset) + : TargetFunction->getSymbol(); + } else { + TargetFunction->setIgnored(); + BC.outs() << "BOLT-WARNING: Ignoring entry point at address 0x" + << Twine::utohexstr(Address) + << " in constant island of function " << *TargetFunction + << '\n'; + continue; + } } // Can't find more references. Not creating relocations since we are not diff --git a/bolt/test/AArch64/constant-island-entry.s b/bolt/test/AArch64/constant-island-entry.s index 6567114eb980a..7f8449deea130 100644 --- a/bolt/test/AArch64/constant-island-entry.s +++ b/bolt/test/AArch64/constant-island-entry.s @@ -1,10 +1,15 @@ -// This test checks that we ignore functions which add an entry point that -// is in a constant island. 
+## This test checks that we ignore functions which add an entry point that +## is in a constant island. # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o # RUN: %clang %cflags %t.o -pie -Wl,-q -o %t.exe + +## Check when the caller is successfully disassembled. # RUN: llvm-bolt %t.exe -o %t.bolt 2>&1 | FileCheck %s +## Skip caller to check the identical warning is triggered from ScanExternalRefs(). +# RUN: llvm-bolt %t.exe -o %t.bolt -skip-funcs=caller 2>&1 | FileCheck %s + # CHECK: BOLT-WARNING: Ignoring entry point at address 0x{{[0-9a-f]+}} in constant island of function func .globl func From 5015c42a7081f7f899caa40b71901219186b7aaf Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 31 Oct 2025 10:30:29 +0800 Subject: [PATCH 280/539] [SimplifyCFG] Avoid use-after-free when removing incoming values from PHI nodes (#165744) `PHINode::removeIncomingValue` removes itself when there are no incoming edges. Then we cannot use it to retrieve the next instruction. Closes https://github.com/llvm/llvm-project/issues/165301. --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 8 +++--- llvm/test/Transforms/SimplifyCFG/pr165301.ll | 26 ++++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 llvm/test/Transforms/SimplifyCFG/pr165301.ll diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index b03fb6213d61c..7f6d779687e94 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -5977,14 +5977,14 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, } // Prune obsolete incoming values off the successors' PHI nodes. 
- for (auto BBI = Dest->begin(); isa(BBI); ++BBI) { + for (auto &PHI : make_early_inc_range(Dest->phis())) { unsigned PreviousEdges = Cases->size(); if (Dest == SI->getDefaultDest()) ++PreviousEdges; for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I) - cast(BBI)->removeIncomingValue(SI->getParent()); + PHI.removeIncomingValue(SI->getParent()); } - for (auto BBI = OtherDest->begin(); isa(BBI); ++BBI) { + for (auto &PHI : make_early_inc_range(OtherDest->phis())) { unsigned PreviousEdges = OtherCases->size(); if (OtherDest == SI->getDefaultDest()) ++PreviousEdges; @@ -5993,7 +5993,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, if (NewBI->isUnconditional()) ++E; for (unsigned I = 0; I != E; ++I) - cast(BBI)->removeIncomingValue(SI->getParent()); + PHI.removeIncomingValue(SI->getParent()); } // Clean up the default block - it may have phis or other instructions before diff --git a/llvm/test/Transforms/SimplifyCFG/pr165301.ll b/llvm/test/Transforms/SimplifyCFG/pr165301.ll new file mode 100644 index 0000000000000..4a539d77af3cb --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/pr165301.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes="simplifycfg" < %s | FileCheck %s + +; Make sure there's no use after free when removing incoming values from PHI nodes + +define i32 @pr165301(i1 %cond) { +; CHECK-LABEL: define i32 @pr165301( +; CHECK-SAME: i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[SWITCHBB:.*]] +; CHECK: [[SWITCHBB]]: +; CHECK-NEXT: br label %[[SWITCHBB]] +; +entry: + br label %switchbb + +switchbb: + switch i1 %cond, label %default [ + i1 false, label %switchbb + i1 true, label %switchbb + ] + +default: + %phi.lcssa = phi i32 [ 0, %switchbb ] + ret i32 %phi.lcssa +} From 00b8e50e198760126ebe1e35104a16f68e30d30e Mon Sep 17 00:00:00 2001 From: quic-likaid Date: Fri, 31 Oct 2025 10:43:07 +0800 Subject: [PATCH 281/539] lsan: 
fix allocator on arm64 Android (#165656) The default config is too large for arm64 Android devices, which are typically configured with 39-bit address space. This change brings it inline with sanitizer_allocator_test.cpp. --- compiler-rt/lib/lsan/lsan_allocator.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/compiler-rt/lib/lsan/lsan_allocator.h b/compiler-rt/lib/lsan/lsan_allocator.h index 556b9f56a4a4a..2d0ea0b46fe0e 100644 --- a/compiler-rt/lib/lsan/lsan_allocator.h +++ b/compiler-rt/lib/lsan/lsan_allocator.h @@ -93,6 +93,10 @@ using LSanSizeClassMap = DefaultSizeClassMap; const uptr kAllocatorSpace = 0x600000000000ULL; const uptr kAllocatorSize = 0x40000000000ULL; // 4T. using LSanSizeClassMap = DefaultSizeClassMap; +# elif SANITIZER_ANDROID && defined(__aarch64__) +const uptr kAllocatorSpace = 0x3000000000ULL; +const uptr kAllocatorSize = 0x2000000000ULL; +using LSanSizeClassMap = VeryCompactSizeClassMap; # else const uptr kAllocatorSpace = 0x500000000000ULL; const uptr kAllocatorSize = 0x40000000000ULL; // 4T. From b55a28fd92c4d4015b0edb967bf0a1cab971a344 Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Fri, 31 Oct 2025 10:51:58 +0800 Subject: [PATCH 282/539] [clang-tidy][readability-redundant-parentheses] add option to prevent widely used work around (#164827) Part of #164125 Add a new option to ignore some decls. 
--------- Co-authored-by: EugeneZelenko --- .../readability/RedundantParenthesesCheck.cpp | 25 ++++++++++++++++--- .../readability/RedundantParenthesesCheck.h | 7 ++++-- .../readability/redundant-parentheses.rst | 13 ++++++++++ .../readability/redundant-parentheses.cpp | 9 +++++++ 4 files changed, 48 insertions(+), 6 deletions(-) diff --git a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp index 0ab59fff39d88..874b9618bd882 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "RedundantParenthesesCheck.h" +#include "../utils/Matchers.h" +#include "../utils/OptionsUtils.h" #include "clang/AST/Expr.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" @@ -32,15 +34,30 @@ AST_MATCHER(ParenExpr, isInMacro) { } // namespace +RedundantParenthesesCheck::RedundantParenthesesCheck(StringRef Name, + ClangTidyContext *Context) + : ClangTidyCheck(Name, Context), + AllowedDecls(utils::options::parseStringList( + Options.get("AllowedDecls", "std::max;std::min"))) {} + +void RedundantParenthesesCheck::storeOptions( + ClangTidyOptions::OptionMap &Opts) { + Options.store(Opts, "AllowedDecls", + utils::options::serializeStringList(AllowedDecls)); +} + void RedundantParenthesesCheck::registerMatchers(MatchFinder *Finder) { const auto ConstantExpr = expr(anyOf(integerLiteral(), floatLiteral(), characterLiteral(), cxxBoolLiteral(), stringLiteral(), cxxNullPtrLiteralExpr())); Finder->addMatcher( - parenExpr(subExpr(anyOf(parenExpr(), ConstantExpr, declRefExpr())), - unless(anyOf(isInMacro(), - // sizeof(...) is common used. 
- hasParent(unaryExprOrTypeTraitExpr())))) + parenExpr( + subExpr(anyOf(parenExpr(), ConstantExpr, + declRefExpr(to(namedDecl(unless( + matchers::matchesAnyListedName(AllowedDecls))))))), + unless(anyOf(isInMacro(), + // sizeof(...) is common used. + hasParent(unaryExprOrTypeTraitExpr())))) .bind("dup"), this); } diff --git a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h index 9a0409b83fff3..2638a09730f7e 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h @@ -20,13 +20,16 @@ namespace clang::tidy::readability { /// https://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-parentheses.html class RedundantParenthesesCheck : public ClangTidyCheck { public: - RedundantParenthesesCheck(StringRef Name, ClangTidyContext *Context) - : ClangTidyCheck(Name, Context) {} + RedundantParenthesesCheck(StringRef Name, ClangTidyContext *Context); + void storeOptions(ClangTidyOptions::OptionMap &Opts) override; void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus | LangOpts.C99; } + +private: + const std::vector AllowedDecls; }; } // namespace clang::tidy::readability diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst index 23d975e646490..20e3891c72d7f 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst @@ -27,3 +27,16 @@ affect the semantics. .. code-block:: c++ int a = (1 * 2) + 3; // no warning + +Options +------- + +.. 
option:: AllowedDecls + + Semicolon-separated list of regular expressions matching names of declarations + to ignore when the parentheses are around. Declarations can include variables + or functions. The default is an `std::max;std::min`. + + Some STL library functions may have the same name as widely used function-like + macro. For example, ``std::max`` and ``max`` macro. A workaround to distinguish + them is adding parentheses around functions to prevent function-like macro. diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp index 926cb118c77cf..c77608c66469c 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp @@ -62,3 +62,12 @@ void exceptions() { // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: redundant parentheses around expression [readability-redundant-parentheses] // CHECK-FIXES: alignof(3); } + +namespace std { + template T max(T, T); + template T min(T, T); +} // namespace std +void ignoreStdMaxMin() { + (std::max)(1,2); + (std::min)(1,2); +} From 82b3e3bc41fb00e00828bb64ed24f105cbc40ca2 Mon Sep 17 00:00:00 2001 From: Yu Hao Date: Thu, 30 Oct 2025 20:18:08 -0700 Subject: [PATCH 283/539] [clang][transformer] Change `name` range-selector to return `Error` instead of an invalid range. (#164715) Previously, when the text in selected range was different from the decl's name, `name` returned an invalid range, which could cause crashes if `name` was nested in other range selectors that assumed always valid ranges. With this change, `name` returns an `Error` if it can't get the range. 
--- .../lib/Tooling/Transformer/RangeSelector.cpp | 8 ++++-- clang/unittests/Tooling/RangeSelectorTest.cpp | 25 +++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/clang/lib/Tooling/Transformer/RangeSelector.cpp b/clang/lib/Tooling/Transformer/RangeSelector.cpp index 171c786bc366f..b4bdec1fcdd69 100644 --- a/clang/lib/Tooling/Transformer/RangeSelector.cpp +++ b/clang/lib/Tooling/Transformer/RangeSelector.cpp @@ -205,8 +205,12 @@ RangeSelector transformer::name(std::string ID) { // `foo` for which this range will be too short. Doing so will // require subcasing `NamedDecl`, because it doesn't provide virtual // access to the \c DeclarationNameInfo. - if (tooling::getText(R, *Result.Context) != D->getName()) - return CharSourceRange(); + StringRef Text = tooling::getText(R, *Result.Context); + if (Text != D->getName()) + return llvm::make_error( + llvm::errc::not_supported, + "range selected by name(node id=" + ID + "): '" + Text + + "' is different from decl name '" + D->getName() + "'"); return R; } if (const auto *E = Node.get()) { diff --git a/clang/unittests/Tooling/RangeSelectorTest.cpp b/clang/unittests/Tooling/RangeSelectorTest.cpp index adf5e74ea3192..a1fcbb023832f 100644 --- a/clang/unittests/Tooling/RangeSelectorTest.cpp +++ b/clang/unittests/Tooling/RangeSelectorTest.cpp @@ -527,6 +527,31 @@ TEST(RangeSelectorTest, NameOpDeclRefError) { AllOf(HasSubstr(Ref), HasSubstr("requires property 'identifier'"))))); } +TEST(RangeSelectorTest, NameOpDeclInMacroArg) { + StringRef Code = R"cc( + #define MACRO(name) int name; + MACRO(x) + )cc"; + const char *ID = "id"; + TestMatch Match = matchCode(Code, varDecl().bind(ID)); + EXPECT_THAT_EXPECTED(select(name(ID), Match), HasValue("x")); +} + +TEST(RangeSelectorTest, NameOpDeclInMacroBodyError) { + StringRef Code = R"cc( + #define MACRO int x; + MACRO + )cc"; + const char *ID = "id"; + TestMatch Match = matchCode(Code, varDecl().bind(ID)); + EXPECT_THAT_EXPECTED( + name(ID)(Match.Result), 
+ Failed(testing::Property( + &StringError::getMessage, + AllOf(HasSubstr("range selected by name(node id="), + HasSubstr("' is different from decl name 'x'"))))); +} + TEST(RangeSelectorTest, CallArgsOp) { const StringRef Code = R"cc( struct C { From 20fc07b968271e286b11cae1942876e98a052ad7 Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Thu, 30 Oct 2025 20:30:49 -0700 Subject: [PATCH 284/539] [libc] Add "struct tm" declaration to (#165795) `` should at least include the forward declaration of `struct tm`, since it's needed for the `wcsftime` declaration (also, see https://man7.org/linux/man-pages/man0/wchar.h.0p.html). Even though we don't yet have `wcsftime`, some downstream users (notably - libcxx) expects to see `struct tm` declaration there, to re-declare it under `std` namespace: https://github.com/llvm/llvm-project/blob/c46bfed1a484d30cd251a9a225649d74e3bf0af5/libcxx/include/cwchar#L135 So, add this type declaration to llvm-libc version of `wchar.h` now. --- libc/include/wchar.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index c8b9e21b56b28..fb5b19b523b31 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -6,6 +6,10 @@ macros: types: - type_name: FILE - type_name: size_t + # TODO: Remove this once we have a function declaration using "struct tm" + # (wcsftime). We're declaring it here now, since libc++ expects + # forward-declaration of "struct tm" in the header. 
+ - type_name: struct_tm - type_name: wint_t - type_name: wchar_t - type_name: mbstate_t From 2adba39e01f2aef093d116bec340dcb3d1c4230f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 23 Oct 2025 15:00:19 +0900 Subject: [PATCH 285/539] RuntimeLibcalls: Whitespace fix --- llvm/include/llvm/IR/RuntimeLibcalls.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td index 7be1b654ca727..24c1b035d0dda 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.td +++ b/llvm/include/llvm/IR/RuntimeLibcalls.td @@ -1585,7 +1585,7 @@ def __aeabi_f2ulz : RuntimeLibcallImpl; // CallingConv::ARM_AA // RTABI chapter 4.1.2, Table 7 def __aeabi_d2f : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS def __aeabi_d2h : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS -def __aeabi_f2d : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS +def __aeabi_f2d : RuntimeLibcallImpl; // CallingConv::ARM_AAPCS // Integer to floating-point conversions. // RTABI chapter 4.1.2, Table 8 From d6694424c0092b64c05f4bb238c4a70109843737 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 30 Oct 2025 21:35:04 -0700 Subject: [PATCH 286/539] [compiler-rt] Default to Lit's Internal Shell All of the tests should work with the internal shell now, at least on x86 Linux. Enable it by default for the performance/debuggability advantages. 
Reviewers: vitalybuka, fmayer Reviewed By: fmayer, vitalybuka Pull Request: https://github.com/llvm/llvm-project/pull/165148 --- compiler-rt/test/lit.common.cfg.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index 9d2f02189b8bd..1468c0742a766 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -113,12 +113,17 @@ def push_dynamic_library_lookup_path(config, new_path): config.environment[dynamic_library_lookup_var] = new_ld_library_path_64 +# TODO: Consolidate the logic for turning on the internal shell by default for all LLVM test suites. +# See https://github.com/llvm/llvm-project/issues/106636 for more details. +# # Choose between lit's internal shell pipeline runner and a real shell. If # LIT_USE_INTERNAL_SHELL is in the environment, we use that as an override. -use_lit_shell = os.environ.get("LIT_USE_INTERNAL_SHELL") +use_lit_shell = True +lit_shell_env = os.environ.get("LIT_USE_INTERNAL_SHELL") +if lit_shell_env: + use_lit_shell = lit.util.pythonize_bool(lit_shell_env) if use_lit_shell: - # 0 is external, "" is default, and everything else is internal. - execute_external = use_lit_shell == "0" + execute_external = True else: # Otherwise we default to internal on Windows and external elsewhere, as # bash on Windows is usually very slow. From 60b9886750b850fbc9b98c8e9f372512d264c1e4 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 31 Oct 2025 04:38:15 +0000 Subject: [PATCH 287/539] [VPlan] Remove original recipe after narrowing to single-scalar. Directly remove RepOrWidenR after replacing all uses. Removing the dead user early unlocks additional opportunities for further narrowing. 
--- .../Transforms/Vectorize/VPlanTransforms.cpp | 2 + .../LoopVectorize/X86/uniform_load.ll | 39 ++++++------------- .../LoopVectorize/single-scalar-cast-minbw.ll | 8 +--- 3 files changed, 16 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 4d98014622224..986c801abf684 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1419,6 +1419,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { true /*IsSingleScalar*/); Clone->insertBefore(RepOrWidenR); RepOrWidenR->replaceAllUsesWith(Clone); + if (isDeadRecipe(*RepOrWidenR)) + RepOrWidenR->eraseFromParent(); } } } diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll index d4004daf8833c..8081c0e17f865 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll @@ -64,39 +64,24 @@ exit: define void @uniform_load_can_fold_users(ptr noalias %src, ptr %dst, i64 %start, double %d) { ; CHECK-LABEL: define void @uniform_load_can_fold_users( ; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]], i64 [[START:%.*]], double [[D:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[START]], 1 -; CHECK-NEXT: [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 [[START]], i64 0) -; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SMIN]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[START]], [[N_VEC]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, 
%[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[START]], %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[SRC]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[BROADCAST_SPLAT]], splat (double 9.000000e+00) -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fmul double [[TMP5]], 9.000000e+00 ; CHECK-NEXT: [[TMP8:%.*]] = fdiv double [[TMP7]], [[D]] -; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP3]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP4]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP11]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP12]], i64 [[TMP10]] -; CHECK-NEXT: store double [[TMP8]], ptr [[TMP13]], align 8 ; CHECK-NEXT: store double [[TMP8]], ptr [[TMP14]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[IV_1_NEXT]] = add i64 [[TMP4]], 1 +; CHECK-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], -1 +; CHECK-NEXT: [[EC:%.*]] = icmp sgt i64 [[IV_2]], 
0 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll index 9a699826696ec..70adac2103feb 100644 --- a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll +++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll @@ -84,12 +84,8 @@ define void @single_scalar_cast_stored(ptr %src, ptr %dst, i32 %n) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC]], align 2, !alias.scope [[META4:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i16> [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[BROADCAST_SPLAT]], splat (i16 15) -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i16 [[TMP0]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = and i16 [[TMP0]], 15 ; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP4]] ; CHECK-NEXT: store i16 [[TMP5]], ptr [[DST]], align 2, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 From 04446dda4c2607763c0ee36e74a26ba94b5e729b Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 11:37:33 -0700 Subject: [PATCH 288/539] [MLIR] Apply clang-tidy fixes for performance-unnecessary-value-param in ViewOpGraph.cpp (NFC) --- mlir/lib/Transforms/ViewOpGraph.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/mlir/lib/Transforms/ViewOpGraph.cpp b/mlir/lib/Transforms/ViewOpGraph.cpp index 08cac1fe3695c..5790a77cc4e2b 100644 --- a/mlir/lib/Transforms/ViewOpGraph.cpp +++ b/mlir/lib/Transforms/ViewOpGraph.cpp @@ -158,7 +158,8 @@ class PrintOpPass : public impl::ViewOpGraphBase { /// Emit a cluster (subgraph). The specified builder generates the body of the /// cluster. Return the anchor node of the cluster. - Node emitClusterStmt(function_ref builder, std::string label = "") { + Node emitClusterStmt(function_ref builder, + const std::string &label = "") { int clusterId = ++counter; os << "subgraph cluster_" << clusterId << " {\n"; os.indent(); @@ -269,7 +270,7 @@ class PrintOpPass : public impl::ViewOpGraphBase { } /// Emit a node statement. - Node emitNodeStmt(std::string label, StringRef shape = kShapeNode, + Node emitNodeStmt(const std::string &label, StringRef shape = kShapeNode, StringRef background = "") { int nodeId = ++counter; AttributeMap attrs; From 97b0f76ec02c95fbc96b03ed2c0de4800c6c9fc6 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 07:06:04 -0700 Subject: [PATCH 289/539] [MLIR] Apply clang-tidy fixes for misc-use-internal-linkage in LinalgTransformOps.cpp (NFC) --- .../Dialect/Linalg/TransformOps/LinalgTransformOps.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index b09112bcf0bb7..3a433825fd31a 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -1958,7 +1958,7 @@ enum class OuterOrInnerPerm { Outer = 0, Inner = 1 }; /// Return true if either `op` or `permutation` are empty to allow a simpler /// polymorphic implementation. 
template -bool isValidPackingPermutation( +static bool isValidPackingPermutation( RelayoutOpTy op, ArrayRef permutation, OuterOrInnerPerm outerOrInnerPerm = OuterOrInnerPerm::Outer) { static_assert( @@ -4322,9 +4322,10 @@ DiagnosedSilenceableFailure transform::TransposeMatmulOp::applyToOne( // InsertSliceToCopyOp //===----------------------------------------------------------------------===// template -DiagnosedSilenceableFailure doit(RewriterBase &rewriter, OpTy target, - transform::ApplyToEachResultList &results, - transform::TransformState &state) { +static DiagnosedSilenceableFailure +doit(RewriterBase &rewriter, OpTy target, + transform::ApplyToEachResultList &results, + transform::TransformState &state) { static_assert(llvm::is_one_of() && "wrong op type"); From cb510e1d8f71d8b81e9722fb5649ffe1a8c74bf3 Mon Sep 17 00:00:00 2001 From: quic_hchandel Date: Fri, 31 Oct 2025 10:19:25 +0530 Subject: [PATCH 290/539] [RISCV] Add short forward branch support for `min`, `max`, `maxu` and `minu` (#164394) --- .../Target/RISCV/RISCVExpandPseudoInsts.cpp | 10 + llvm/lib/Target/RISCV/RISCVFeatures.td | 5 + llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 17 +- llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td | 4 + llvm/test/CodeGen/RISCV/features-info.ll | 1 + .../RISCV/short-forward-branch-opt-min-max.ll | 703 ++++++++++++++++++ 6 files changed, 737 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/short-forward-branch-opt-min-max.ll diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 410561855e181..526675a682d86 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -127,6 +127,10 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, case RISCV::PseudoCCAND: case RISCV::PseudoCCOR: case RISCV::PseudoCCXOR: + case RISCV::PseudoCCMAX: + case RISCV::PseudoCCMAXU: + case RISCV::PseudoCCMIN: + case RISCV::PseudoCCMINU: case 
RISCV::PseudoCCADDW: case RISCV::PseudoCCSUBW: case RISCV::PseudoCCSLL: @@ -217,6 +221,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, .addImm(0); } else { unsigned NewOpc; + // clang-format off switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); @@ -228,6 +233,10 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCAND: NewOpc = RISCV::AND; break; case RISCV::PseudoCCOR: NewOpc = RISCV::OR; break; case RISCV::PseudoCCXOR: NewOpc = RISCV::XOR; break; + case RISCV::PseudoCCMAX: NewOpc = RISCV::MAX; break; + case RISCV::PseudoCCMIN: NewOpc = RISCV::MIN; break; + case RISCV::PseudoCCMAXU: NewOpc = RISCV::MAXU; break; + case RISCV::PseudoCCMINU: NewOpc = RISCV::MINU; break; case RISCV::PseudoCCADDI: NewOpc = RISCV::ADDI; break; case RISCV::PseudoCCSLLI: NewOpc = RISCV::SLLI; break; case RISCV::PseudoCCSRLI: NewOpc = RISCV::SRLI; break; @@ -250,6 +259,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCNDS_BFOS: NewOpc = RISCV::NDS_BFOS; break; case RISCV::PseudoCCNDS_BFOZ: NewOpc = RISCV::NDS_BFOZ; break; } + // clang-format on if (NewOpc == RISCV::NDS_BFOZ || NewOpc == RISCV::NDS_BFOS) { BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index b4556f66473d6..cfee6ab22d4ff 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1851,6 +1851,11 @@ def TuneShortForwardBranchOpt def HasShortForwardBranchOpt : Predicate<"Subtarget->hasShortForwardBranchOpt()">; def NoShortForwardBranchOpt : Predicate<"!Subtarget->hasShortForwardBranchOpt()">; +def TuneShortForwardBranchIMinMax + : SubtargetFeature<"short-forward-branch-i-minmax", "HasShortForwardBranchIMinMax", + "true", "Enable short forward branch optimization for min,max instructions in Zbb", + [TuneShortForwardBranchOpt]>; + // Some subtargets require a S2V transfer buffer to move scalars 
into vectors. // FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure. def TuneNoSinkSplatOperands diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 912b82d294f44..3a7013d9efae6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1699,6 +1699,10 @@ unsigned getPredicatedOpcode(unsigned Opcode) { case RISCV::AND: return RISCV::PseudoCCAND; case RISCV::OR: return RISCV::PseudoCCOR; case RISCV::XOR: return RISCV::PseudoCCXOR; + case RISCV::MAX: return RISCV::PseudoCCMAX; + case RISCV::MAXU: return RISCV::PseudoCCMAXU; + case RISCV::MIN: return RISCV::PseudoCCMIN; + case RISCV::MINU: return RISCV::PseudoCCMINU; case RISCV::ADDI: return RISCV::PseudoCCADDI; case RISCV::SLLI: return RISCV::PseudoCCSLLI; @@ -1735,7 +1739,8 @@ unsigned getPredicatedOpcode(unsigned Opcode) { /// return the defining instruction. static MachineInstr *canFoldAsPredicatedOp(Register Reg, const MachineRegisterInfo &MRI, - const TargetInstrInfo *TII) { + const TargetInstrInfo *TII, + const RISCVSubtarget &STI) { if (!Reg.isVirtual()) return nullptr; if (!MRI.hasOneNonDBGUse(Reg)) @@ -1743,6 +1748,12 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg, MachineInstr *MI = MRI.getVRegDef(Reg); if (!MI) return nullptr; + + if (!STI.hasShortForwardBranchIMinMax() && + (MI->getOpcode() == RISCV::MAX || MI->getOpcode() == RISCV::MIN || + MI->getOpcode() == RISCV::MINU || MI->getOpcode() == RISCV::MAXU)) + return nullptr; + // Check if MI can be predicated and folded into the CCMOV. 
if (getPredicatedOpcode(MI->getOpcode()) == RISCV::INSTRUCTION_LIST_END) return nullptr; @@ -1806,10 +1817,10 @@ RISCVInstrInfo::optimizeSelect(MachineInstr &MI, MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); MachineInstr *DefMI = - canFoldAsPredicatedOp(MI.getOperand(5).getReg(), MRI, this); + canFoldAsPredicatedOp(MI.getOperand(5).getReg(), MRI, this, STI); bool Invert = !DefMI; if (!DefMI) - DefMI = canFoldAsPredicatedOp(MI.getOperand(4).getReg(), MRI, this); + DefMI = canFoldAsPredicatedOp(MI.getOperand(4).getReg(), MRI, this, STI); if (!DefMI) return nullptr; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td index 0114fbdc56302..5a67a5aaba293 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td @@ -106,6 +106,10 @@ def PseudoCCSRA : SFBALU_rr; def PseudoCCAND : SFBALU_rr; def PseudoCCOR : SFBALU_rr; def PseudoCCXOR : SFBALU_rr; +def PseudoCCMAX : SFBALU_rr; +def PseudoCCMIN : SFBALU_rr; +def PseudoCCMAXU : SFBALU_rr; +def PseudoCCMINU : SFBALU_rr; def PseudoCCADDI : SFBALU_ri; def PseudoCCANDI : SFBALU_ri; diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 37e11dbb12731..988d0490afeb6 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -136,6 +136,7 @@ ; CHECK-NEXT: shgatpa - 'Shgatpa' (SvNNx4 mode supported for all modes supported by satp, as well as Bare). ; CHECK-NEXT: shifted-zextw-fusion - Enable SLLI+SRLI to be fused when computing (shifted) word zero extension. ; CHECK-NEXT: shlcofideleg - 'Shlcofideleg' (Delegating LCOFI Interrupts to VS-mode). +; CHECK-NEXT: short-forward-branch-i-minmax - Enable short forward branch optimization for min,max instructions in Zbb. ; CHECK-NEXT: short-forward-branch-opt - Enable short forward branch optimization. ; CHECK-NEXT: shtvala - 'Shtvala' (htval provides all needed values). 
; CHECK-NEXT: shvsatpa - 'Shvsatpa' (vsatp supports all modes supported by satp). diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-min-max.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-min-max.ll new file mode 100644 index 0000000000000..05e06cea9967a --- /dev/null +++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-min-max.ll @@ -0,0 +1,703 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=riscv32 -mattr=+zbb | FileCheck %s --check-prefixes=RV32I-ZBB +; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb | FileCheck %s --check-prefixes=RV64I-ZBB +; RUN: llc < %s -mtriple=riscv32 -mattr=+zbb,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFB-ZBB +; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFB-ZBB +; RUN: llc < %s -mtriple=riscv32 -mattr=+zbb,+short-forward-branch-i-minmax | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFBIMinMax-ZBB +; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb,+short-forward-branch-i-minmax | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFBIMinMax-ZBB + +define i32 @select_example_smax(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-ZBB-LABEL: select_example_smax: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beqz a2, .LBB0_2 +; RV32I-ZBB-NEXT: # %bb.1: +; RV32I-ZBB-NEXT: max a1, a0, a3 +; RV32I-ZBB-NEXT: .LBB0_2: # %entry +; RV32I-ZBB-NEXT: mv a0, a1 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_smax: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB0_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: sext.w a3, a3 +; RV64I-ZBB-NEXT: sext.w a0, a0 +; RV64I-ZBB-NEXT: max a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB0_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_smax: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: max a0, a0, a3 +; RV32I-SFB-ZBB-NEXT: bnez 
a2, .LBB0_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a1 +; RV32I-SFB-ZBB-NEXT: .LBB0_2: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_smax: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFB-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFB-ZBB-NEXT: max a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB0_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB0_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_smax: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB0_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: max a1, a0, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB0_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_smax: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB0_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: max a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB0_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i32 @llvm.smax.i32(i32 %a, i32 %y) + %sel = select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i32 @select_example_smin(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-ZBB-LABEL: select_example_smin: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beqz a2, .LBB1_2 +; RV32I-ZBB-NEXT: # %bb.1: +; RV32I-ZBB-NEXT: min a1, a0, a3 +; RV32I-ZBB-NEXT: .LBB1_2: # %entry +; RV32I-ZBB-NEXT: mv a0, a1 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_smin: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB1_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: sext.w a3, a3 +; RV64I-ZBB-NEXT: sext.w a0, a0 +; 
RV64I-ZBB-NEXT: min a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB1_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_smin: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: min a0, a0, a3 +; RV32I-SFB-ZBB-NEXT: bnez a2, .LBB1_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a1 +; RV32I-SFB-ZBB-NEXT: .LBB1_2: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_smin: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFB-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFB-ZBB-NEXT: min a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB1_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB1_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_smin: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB1_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: min a1, a0, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB1_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_smin: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB1_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: min a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB1_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i32 @llvm.smin.i32(i32 %a, i32 %y) + %sel = select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i32 @select_example_umax(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-ZBB-LABEL: select_example_umax: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beqz a2, .LBB2_2 +; RV32I-ZBB-NEXT: # %bb.1: +; RV32I-ZBB-NEXT: maxu a1, a0, a3 +; RV32I-ZBB-NEXT: 
.LBB2_2: # %entry +; RV32I-ZBB-NEXT: mv a0, a1 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_umax: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB2_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: sext.w a3, a3 +; RV64I-ZBB-NEXT: sext.w a0, a0 +; RV64I-ZBB-NEXT: maxu a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB2_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_umax: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: maxu a0, a0, a3 +; RV32I-SFB-ZBB-NEXT: bnez a2, .LBB2_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a1 +; RV32I-SFB-ZBB-NEXT: .LBB2_2: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_umax: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFB-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFB-ZBB-NEXT: maxu a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB2_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB2_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_umax: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB2_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: maxu a1, a0, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB2_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_umax: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB2_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: maxu a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB2_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i32 @llvm.umax.i32(i32 %a, i32 %y) + %sel = select i1 %x, i32 %res, i32 %b + ret 
i32 %sel +} + +define i32 @select_example_umin(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-ZBB-LABEL: select_example_umin: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beqz a2, .LBB3_2 +; RV32I-ZBB-NEXT: # %bb.1: +; RV32I-ZBB-NEXT: minu a1, a0, a3 +; RV32I-ZBB-NEXT: .LBB3_2: # %entry +; RV32I-ZBB-NEXT: mv a0, a1 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_umin: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB3_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: sext.w a3, a3 +; RV64I-ZBB-NEXT: sext.w a0, a0 +; RV64I-ZBB-NEXT: minu a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB3_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_umin: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: minu a0, a0, a3 +; RV32I-SFB-ZBB-NEXT: bnez a2, .LBB3_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a1 +; RV32I-SFB-ZBB-NEXT: .LBB3_2: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_umin: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFB-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFB-ZBB-NEXT: minu a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB3_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB3_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_umin: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB3_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: minu a1, a0, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB3_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_umin: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB3_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # 
%entry +; RV64I-SFBIMinMax-ZBB-NEXT: minu a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB3_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i32 @llvm.umin.i32(i32 %a, i32 %y) + %sel = select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i64 @select_example_smax_1(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-ZBB-LABEL: select_example_smax_1: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beq a1, a6, .LBB4_2 +; RV32I-ZBB-NEXT: # %bb.1: # %entry +; RV32I-ZBB-NEXT: slt a7, a6, a1 +; RV32I-ZBB-NEXT: beqz a7, .LBB4_3 +; RV32I-ZBB-NEXT: j .LBB4_4 +; RV32I-ZBB-NEXT: .LBB4_2: +; RV32I-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-ZBB-NEXT: bnez a7, .LBB4_4 +; RV32I-ZBB-NEXT: .LBB4_3: # %entry +; RV32I-ZBB-NEXT: mv a1, a6 +; RV32I-ZBB-NEXT: mv a0, a5 +; RV32I-ZBB-NEXT: .LBB4_4: # %entry +; RV32I-ZBB-NEXT: beqz a4, .LBB4_6 +; RV32I-ZBB-NEXT: # %bb.5: # %entry +; RV32I-ZBB-NEXT: ret +; RV32I-ZBB-NEXT: .LBB4_6: # %entry +; RV32I-ZBB-NEXT: mv a0, a2 +; RV32I-ZBB-NEXT: mv a1, a3 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_smax_1: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB4_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: max a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB4_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_smax_1: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-SFB-ZBB-NEXT: slt t0, a6, a1 +; RV32I-SFB-ZBB-NEXT: bne a1, a6, .LBB4_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv t0, a7 +; RV32I-SFB-ZBB-NEXT: .LBB4_2: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB4_4 +; RV32I-SFB-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a6 +; RV32I-SFB-ZBB-NEXT: .LBB4_4: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB4_6 +; RV32I-SFB-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a5 +; RV32I-SFB-ZBB-NEXT: .LBB4_6: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB4_8 
+; RV32I-SFB-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a2 +; RV32I-SFB-ZBB-NEXT: .LBB4_8: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB4_10 +; RV32I-SFB-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a3 +; RV32I-SFB-ZBB-NEXT: .LBB4_10: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_smax_1: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: max a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB4_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB4_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_smax_1: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-SFBIMinMax-ZBB-NEXT: slt t0, a6, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: bne a1, a6, .LBB4_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv t0, a7 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB4_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB4_4 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB4_4: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB4_6 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB4_6: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB4_8 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a2 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB4_8: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB4_10 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB4_10: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_smax_1: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB4_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: max a1, a0, a3 +; 
RV64I-SFBIMinMax-ZBB-NEXT: .LBB4_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i64 @llvm.smax.i64(i64 %a, i64 %y) + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} + +define i64 @select_example_smin_1(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-ZBB-LABEL: select_example_smin_1: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beq a1, a6, .LBB5_2 +; RV32I-ZBB-NEXT: # %bb.1: # %entry +; RV32I-ZBB-NEXT: slt a7, a1, a6 +; RV32I-ZBB-NEXT: beqz a7, .LBB5_3 +; RV32I-ZBB-NEXT: j .LBB5_4 +; RV32I-ZBB-NEXT: .LBB5_2: +; RV32I-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-ZBB-NEXT: bnez a7, .LBB5_4 +; RV32I-ZBB-NEXT: .LBB5_3: # %entry +; RV32I-ZBB-NEXT: mv a1, a6 +; RV32I-ZBB-NEXT: mv a0, a5 +; RV32I-ZBB-NEXT: .LBB5_4: # %entry +; RV32I-ZBB-NEXT: beqz a4, .LBB5_6 +; RV32I-ZBB-NEXT: # %bb.5: # %entry +; RV32I-ZBB-NEXT: ret +; RV32I-ZBB-NEXT: .LBB5_6: # %entry +; RV32I-ZBB-NEXT: mv a0, a2 +; RV32I-ZBB-NEXT: mv a1, a3 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_smin_1: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB5_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: min a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB5_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_smin_1: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-SFB-ZBB-NEXT: slt t0, a1, a6 +; RV32I-SFB-ZBB-NEXT: bne a1, a6, .LBB5_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv t0, a7 +; RV32I-SFB-ZBB-NEXT: .LBB5_2: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB5_4 +; RV32I-SFB-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a6 +; RV32I-SFB-ZBB-NEXT: .LBB5_4: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB5_6 +; RV32I-SFB-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a5 +; RV32I-SFB-ZBB-NEXT: .LBB5_6: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB5_8 +; RV32I-SFB-ZBB-NEXT: # %bb.7: # %entry +; 
RV32I-SFB-ZBB-NEXT: mv a0, a2 +; RV32I-SFB-ZBB-NEXT: .LBB5_8: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB5_10 +; RV32I-SFB-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a3 +; RV32I-SFB-ZBB-NEXT: .LBB5_10: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_smin_1: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: min a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB5_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB5_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_smin_1: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: slt t0, a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: bne a1, a6, .LBB5_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv t0, a7 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB5_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB5_4 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB5_4: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB5_6 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB5_6: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB5_8 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a2 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB5_8: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB5_10 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB5_10: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_smin_1: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB5_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: min a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB5_2: # %entry +; 
RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i64 @llvm.smin.i64(i64 %a, i64 %y) + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} + +define i64 @select_example_umax_1(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-ZBB-LABEL: select_example_umax_1: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beq a1, a6, .LBB6_2 +; RV32I-ZBB-NEXT: # %bb.1: # %entry +; RV32I-ZBB-NEXT: sltu a7, a6, a1 +; RV32I-ZBB-NEXT: beqz a7, .LBB6_3 +; RV32I-ZBB-NEXT: j .LBB6_4 +; RV32I-ZBB-NEXT: .LBB6_2: +; RV32I-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-ZBB-NEXT: bnez a7, .LBB6_4 +; RV32I-ZBB-NEXT: .LBB6_3: # %entry +; RV32I-ZBB-NEXT: mv a1, a6 +; RV32I-ZBB-NEXT: mv a0, a5 +; RV32I-ZBB-NEXT: .LBB6_4: # %entry +; RV32I-ZBB-NEXT: beqz a4, .LBB6_6 +; RV32I-ZBB-NEXT: # %bb.5: # %entry +; RV32I-ZBB-NEXT: ret +; RV32I-ZBB-NEXT: .LBB6_6: # %entry +; RV32I-ZBB-NEXT: mv a0, a2 +; RV32I-ZBB-NEXT: mv a1, a3 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_umax_1: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB6_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: maxu a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB6_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_umax_1: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-SFB-ZBB-NEXT: sltu t0, a6, a1 +; RV32I-SFB-ZBB-NEXT: bne a1, a6, .LBB6_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv t0, a7 +; RV32I-SFB-ZBB-NEXT: .LBB6_2: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB6_4 +; RV32I-SFB-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a6 +; RV32I-SFB-ZBB-NEXT: .LBB6_4: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB6_6 +; RV32I-SFB-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a5 +; RV32I-SFB-ZBB-NEXT: .LBB6_6: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB6_8 +; RV32I-SFB-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a2 +; RV32I-SFB-ZBB-NEXT: 
.LBB6_8: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB6_10 +; RV32I-SFB-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a3 +; RV32I-SFB-ZBB-NEXT: .LBB6_10: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_umax_1: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: maxu a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB6_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB6_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_umax_1: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-SFBIMinMax-ZBB-NEXT: sltu t0, a6, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: bne a1, a6, .LBB6_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv t0, a7 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB6_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB6_4 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB6_4: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB6_6 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB6_6: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB6_8 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a2 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB6_8: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB6_10 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB6_10: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_umax_1: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB6_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: maxu a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB6_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; 
RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i64 @llvm.umax.i64(i64 %a, i64 %y) + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} + +define i64 @select_example_umin_1(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-ZBB-LABEL: select_example_umin_1: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beq a1, a6, .LBB7_2 +; RV32I-ZBB-NEXT: # %bb.1: # %entry +; RV32I-ZBB-NEXT: sltu a7, a1, a6 +; RV32I-ZBB-NEXT: beqz a7, .LBB7_3 +; RV32I-ZBB-NEXT: j .LBB7_4 +; RV32I-ZBB-NEXT: .LBB7_2: +; RV32I-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-ZBB-NEXT: bnez a7, .LBB7_4 +; RV32I-ZBB-NEXT: .LBB7_3: # %entry +; RV32I-ZBB-NEXT: mv a1, a6 +; RV32I-ZBB-NEXT: mv a0, a5 +; RV32I-ZBB-NEXT: .LBB7_4: # %entry +; RV32I-ZBB-NEXT: beqz a4, .LBB7_6 +; RV32I-ZBB-NEXT: # %bb.5: # %entry +; RV32I-ZBB-NEXT: ret +; RV32I-ZBB-NEXT: .LBB7_6: # %entry +; RV32I-ZBB-NEXT: mv a0, a2 +; RV32I-ZBB-NEXT: mv a1, a3 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_umin_1: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB7_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: minu a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB7_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_umin_1: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-SFB-ZBB-NEXT: sltu t0, a1, a6 +; RV32I-SFB-ZBB-NEXT: bne a1, a6, .LBB7_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv t0, a7 +; RV32I-SFB-ZBB-NEXT: .LBB7_2: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB7_4 +; RV32I-SFB-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a6 +; RV32I-SFB-ZBB-NEXT: .LBB7_4: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB7_6 +; RV32I-SFB-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a5 +; RV32I-SFB-ZBB-NEXT: .LBB7_6: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB7_8 +; RV32I-SFB-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a2 +; RV32I-SFB-ZBB-NEXT: .LBB7_8: # %entry +; RV32I-SFB-ZBB-NEXT: 
bnez a4, .LBB7_10 +; RV32I-SFB-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a3 +; RV32I-SFB-ZBB-NEXT: .LBB7_10: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_umin_1: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: minu a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB7_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB7_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_umin_1: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: sltu t0, a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: bne a1, a6, .LBB7_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv t0, a7 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB7_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB7_4 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB7_4: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB7_6 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB7_6: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB7_8 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a2 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB7_8: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB7_10 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB7_10: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_umin_1: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB7_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: minu a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB7_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call 
i64 @llvm.umin.i64(i64 %a, i64 %y) + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} From 0c57911a47c5c9e79948644ecd07e1ff154dc4f0 Mon Sep 17 00:00:00 2001 From: pkarveti Date: Fri, 31 Oct 2025 10:41:06 +0530 Subject: [PATCH 291/539] [Hexagon] Handle truncate of v64i32 -> v64i1 when Hvx is enabled (#164931) Fixes #160806 --- llvm/lib/Target/Hexagon/HexagonPatternsHVX.td | 3 +++ .../CodeGen/Hexagon/isel/trunc-vNi1-HVX.ll | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 llvm/test/CodeGen/Hexagon/isel/trunc-vNi1-HVX.ll diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index 1637b91f1fa12..d19920cfc9ea0 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -612,6 +612,9 @@ let Predicates = [UseHVX] in { (V6_vandvrt HvxVR:$Vs, (ToI32 0x01010101))>; def: Pat<(VecQ32 (trunc HVI32:$Vs)), (V6_vandvrt HvxVR:$Vs, (ToI32 0x01010101))>; + def: Pat<(VecQ16 (trunc HWI32:$Vss)), + (Combineq(VecQ32(V6_vandvrt (HiVec $Vss), (ToI32 0x01010101))), + (VecQ32 (V6_vandvrt (LoVec $Vss), (ToI32 0x01010101))))>; } let Predicates = [UseHVX] in { diff --git a/llvm/test/CodeGen/Hexagon/isel/trunc-vNi1-HVX.ll b/llvm/test/CodeGen/Hexagon/isel/trunc-vNi1-HVX.ll new file mode 100644 index 0000000000000..1491729a17f30 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/isel/trunc-vNi1-HVX.ll @@ -0,0 +1,18 @@ +; RUN: llc --mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s | FileCheck %s + +define void @f5(<64 x i32> %a0, ptr %a1) { +; CHECK-LABEL: f5: +; CHECK: [[REG0:(r[0-9]+)]] = ##16843009 +; CHECK-DAG: q[[Q0:[0-9]+]] = vand(v{{[0-9]+}},[[REG0]]) +; CHECK-DAG: q[[Q1:[0-9]+]] = vand(v{{[0-9]+}},[[REG0]]) +; CHECK: v{{[0-9]+}}.b = vpacke(v{{[0-9]+}}.h,v{{[0-9]+}}.h) +; CHECK: v{{[0-9]+}}.b = vpacke(v{{[0-9]+}}.h,v{{[0-9]+}}.h) +; CHECK: v[[VROR:[0-9]+]] = vror(v{{[0-9]+}},r{{[0-9]+}}) +; CHECK: v[[VOR:[0-9]+]] = vor(v[[VROR]],v{{[0-9]+}}) +; CHECK: 
q{{[0-9]+}} = vand(v[[VOR]],r{{[0-9]+}}) +b0: + %v0 = trunc <64 x i32> %a0 to <64 x i1> + store <64 x i1> %v0, ptr %a1, align 1 + ret void +} + From b7dc0fd080ccbb35ede4d6687425c7d2a3308bc4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 30 Oct 2025 22:49:44 -0700 Subject: [PATCH 292/539] AArch64: Add more 3 element vector sincos tests (#165816) --- llvm/test/CodeGen/AArch64/llvm.sincos.ll | 195 +++++++++++++++++++++++ 1 file changed, 195 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/llvm.sincos.ll b/llvm/test/CodeGen/AArch64/llvm.sincos.ll index f1dcb2a478a0d..21da8645b9b16 100644 --- a/llvm/test/CodeGen/AArch64/llvm.sincos.ll +++ b/llvm/test/CodeGen/AArch64/llvm.sincos.ll @@ -215,6 +215,133 @@ define { <2 x half>, <2 x half> } @test_sincos_v2f16(<2 x half> %a) nounwind { ret { <2 x half>, <2 x half> } %result } +define { <3 x half>, <3 x half> } @test_sincos_v3f16(<3 x half> %a) nounwind { +; CHECK-LABEL: test_sincos_v3f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #64 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov h1, v0.h[1] +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: add x0, sp, #36 +; CHECK-NEXT: add x1, sp, #32 +; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-NEXT: fcvt s0, h1 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #28 +; CHECK-NEXT: add x1, sp, #24 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #44 +; CHECK-NEXT: add x1, sp, #40 +; CHECK-NEXT: mov h0, v0.h[2] +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #60 +; CHECK-NEXT: add x1, sp, #56 +; CHECK-NEXT: mov h0, v0.h[3] +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldp s2, s0, [sp, #32] +; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-NEXT: ldp s3, 
s1, [sp, #24] +; CHECK-NEXT: fcvt h4, s0 +; CHECK-NEXT: fcvt h2, s2 +; CHECK-NEXT: fcvt h0, s1 +; CHECK-NEXT: fcvt h1, s3 +; CHECK-NEXT: ldp s5, s3, [sp, #40] +; CHECK-NEXT: fcvt h3, s3 +; CHECK-NEXT: mov v0.h[1], v4.h[0] +; CHECK-NEXT: fcvt h4, s5 +; CHECK-NEXT: mov v1.h[1], v2.h[0] +; CHECK-NEXT: ldp s5, s2, [sp, #56] +; CHECK-NEXT: mov v0.h[2], v3.h[0] +; CHECK-NEXT: fcvt h2, s2 +; CHECK-NEXT: fcvt h3, s5 +; CHECK-NEXT: mov v1.h[2], v4.h[0] +; CHECK-NEXT: mov v0.h[3], v2.h[0] +; CHECK-NEXT: mov v1.h[3], v3.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: ret +; +; NO-LIBCALL-LABEL: test_sincos_v3f16: +; NO-LIBCALL: // %bb.0: +; NO-LIBCALL-NEXT: sub sp, sp, #80 +; NO-LIBCALL-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO-LIBCALL-NEXT: mov h1, v0.h[1] +; NO-LIBCALL-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: str q0, [sp] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; NO-LIBCALL-NEXT: fcvt s8, h1 +; NO-LIBCALL-NEXT: fmov s0, s8 +; NO-LIBCALL-NEXT: bl sinf +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: fcvt s9, h1 +; NO-LIBCALL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov s0, s9 +; NO-LIBCALL-NEXT: bl sinf +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: mov h1, v1.h[2] +; NO-LIBCALL-NEXT: fcvt s10, h1 +; NO-LIBCALL-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v0.h[1], v1.h[0] +; NO-LIBCALL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov s0, s10 +; NO-LIBCALL-NEXT: bl sinf +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: mov h1, v1.h[3] +; NO-LIBCALL-NEXT: fcvt 
s11, h1 +; NO-LIBCALL-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v1.h[2], v0.h[0] +; NO-LIBCALL-NEXT: fmov s0, s11 +; NO-LIBCALL-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: bl sinf +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v1.h[3], v0.h[0] +; NO-LIBCALL-NEXT: fmov s0, s8 +; NO-LIBCALL-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: bl cosf +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: str q0, [sp] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov s0, s9 +; NO-LIBCALL-NEXT: bl cosf +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v0.h[1], v1.h[0] +; NO-LIBCALL-NEXT: str q0, [sp] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov s0, s10 +; NO-LIBCALL-NEXT: bl cosf +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v1.h[2], v0.h[0] +; NO-LIBCALL-NEXT: fmov s0, s11 +; NO-LIBCALL-NEXT: str q1, [sp] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: bl cosf +; NO-LIBCALL-NEXT: fmov s1, s0 +; NO-LIBCALL-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; NO-LIBCALL-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO-LIBCALL-NEXT: fcvt h2, s1 +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v1.h[3], v2.h[0] +; NO-LIBCALL-NEXT: // kill: def $d1 killed $d1 killed $q1 +; NO-LIBCALL-NEXT: add sp, sp, #80 +; NO-LIBCALL-NEXT: ret + %result = call { <3 x half>, <3 x half> } @llvm.sincos.v3f16(<3 x half> %a) + ret { <3 x half>, <3 x half> } %result +} + define { float, float } @test_sincos_f32(float %a) nounwind { ; CHECK-LABEL: test_sincos_f32: ; CHECK: // %bb.0: @@ -493,3 +620,71 
@@ define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) nounwi %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a) ret { <2 x double>, <2 x double> } %result } + +define { <3 x double>, <3 x double> } @test_sincos_v3f64(<3 x double> %a) nounwind { +; CHECK-LABEL: test_sincos_v3f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: add x0, sp, #16 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: fmov d8, d2 +; CHECK-NEXT: fmov d9, d1 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: fmov d0, d9 +; CHECK-NEXT: add x0, sp, #32 +; CHECK-NEXT: add x1, sp, #24 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: fmov d0, d8 +; CHECK-NEXT: add x0, sp, #72 +; CHECK-NEXT: add x1, sp, #40 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: ldp d3, d0, [sp, #8] +; CHECK-NEXT: ldr d2, [sp, #72] +; CHECK-NEXT: ldp d4, d1, [sp, #24] +; CHECK-NEXT: ldr d5, [sp, #40] +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ret +; +; NO-LIBCALL-LABEL: test_sincos_v3f64: +; NO-LIBCALL: // %bb.0: +; NO-LIBCALL-NEXT: stp d13, d12, [sp, #-64]! 
// 16-byte Folded Spill +; NO-LIBCALL-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov d10, d0 +; NO-LIBCALL-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov d8, d2 +; NO-LIBCALL-NEXT: fmov d9, d1 +; NO-LIBCALL-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; NO-LIBCALL-NEXT: bl sin +; NO-LIBCALL-NEXT: fmov d11, d0 +; NO-LIBCALL-NEXT: fmov d0, d9 +; NO-LIBCALL-NEXT: bl sin +; NO-LIBCALL-NEXT: fmov d12, d0 +; NO-LIBCALL-NEXT: fmov d0, d8 +; NO-LIBCALL-NEXT: bl sin +; NO-LIBCALL-NEXT: fmov d13, d0 +; NO-LIBCALL-NEXT: fmov d0, d10 +; NO-LIBCALL-NEXT: bl cos +; NO-LIBCALL-NEXT: fmov d10, d0 +; NO-LIBCALL-NEXT: fmov d0, d9 +; NO-LIBCALL-NEXT: bl cos +; NO-LIBCALL-NEXT: fmov d9, d0 +; NO-LIBCALL-NEXT: fmov d0, d8 +; NO-LIBCALL-NEXT: bl cos +; NO-LIBCALL-NEXT: fmov d5, d0 +; NO-LIBCALL-NEXT: fmov d0, d11 +; NO-LIBCALL-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; NO-LIBCALL-NEXT: fmov d3, d10 +; NO-LIBCALL-NEXT: fmov d4, d9 +; NO-LIBCALL-NEXT: fmov d1, d12 +; NO-LIBCALL-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: fmov d2, d13 +; NO-LIBCALL-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: ldp d13, d12, [sp], #64 // 16-byte Folded Reload +; NO-LIBCALL-NEXT: ret + %result = call { <3 x double>, <3 x double> } @llvm.sincos.v3f64(<3 x double> %a) + ret { <3 x double>, <3 x double> } %result +} From c0afc361d68247ed96af258cca95edb3aad8a2b3 Mon Sep 17 00:00:00 2001 From: Zhaoxin Yang Date: Fri, 31 Oct 2025 13:57:34 +0800 Subject: [PATCH 293/539] [LoongArch][NFC] Pre-commit tests for vector ceil,floor,trunc,roundeven (#165213) --- .../CodeGen/LoongArch/lasx/fp-rounding.ll | 308 ++++++++++++++++++ .../test/CodeGen/LoongArch/lsx/fp-rounding.ll | 212 ++++++++++++ 2 files changed, 520 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll create mode 100644 llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll diff --git 
a/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll new file mode 100644 index 0000000000000..79407c3fd4c8b --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll @@ -0,0 +1,308 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +;; ceilf +define void @ceil_v8f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: ceil_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrp.s $vr1, $vr1 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4 +; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 +; CHECK-NEXT: vfrintrp.s $vr2, $vr2 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrp.s $vr1, $vr1 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrp.s $vr1, $vr1 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 48 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrp.s $vr1, $vr1 +; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0 +; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0 +; CHECK-NEXT: vfrintrp.s $vr3, $vr3 +; CHECK-NEXT: vextrins.w $vr3, $vr1, 16 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrp.s $vr1, $vr1 +; CHECK-NEXT: vextrins.w $vr3, $vr1, 32 +; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 +; CHECK-NEXT: vfrintrp.s $vr0, $vr0 +; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 +; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2 +; CHECK-NEXT: xvst $xr3, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %r = call <8 x float> @llvm.ceil.v8f32(<8 
x float> %v0) + store <8 x float> %r, ptr %res + ret void +} + +;; ceil +define void @ceil_v4f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: ceil_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrp.d $vr1, $vr1 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2 +; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0 +; CHECK-NEXT: vfrintrp.d $vr2, $vr2 +; CHECK-NEXT: vextrins.d $vr2, $vr1, 16 +; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrp.d $vr1, $vr1 +; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vfrintrp.d $vr0, $vr0 +; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 +; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %r = call <4 x double> @llvm.ceil.v4f64(<4 x double> %v0) + store <4 x double> %r, ptr %res + ret void +} + +;; floorf +define void @floor_v8f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: floor_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrm.s $vr1, $vr1 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4 +; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 +; CHECK-NEXT: vfrintrm.s $vr2, $vr2 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrm.s $vr1, $vr1 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrm.s $vr1, $vr1 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 48 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrm.s $vr1, $vr1 +; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0 +; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0 +; CHECK-NEXT: vfrintrm.s $vr3, $vr3 +; 
CHECK-NEXT: vextrins.w $vr3, $vr1, 16 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrm.s $vr1, $vr1 +; CHECK-NEXT: vextrins.w $vr3, $vr1, 32 +; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 +; CHECK-NEXT: vfrintrm.s $vr0, $vr0 +; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 +; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2 +; CHECK-NEXT: xvst $xr3, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %r = call <8 x float> @llvm.floor.v8f32(<8 x float> %v0) + store <8 x float> %r, ptr %res + ret void +} + +;; floor +define void @floor_v4f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: floor_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrm.d $vr1, $vr1 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2 +; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0 +; CHECK-NEXT: vfrintrm.d $vr2, $vr2 +; CHECK-NEXT: vextrins.d $vr2, $vr1, 16 +; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrm.d $vr1, $vr1 +; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vfrintrm.d $vr0, $vr0 +; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 +; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %r = call <4 x double> @llvm.floor.v4f64(<4 x double> %v0) + store <4 x double> %r, ptr %res + ret void +} + +;; truncf +define void @trunc_v8f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: trunc_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrz.s $vr1, $vr1 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4 +; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 +; CHECK-NEXT: vfrintrz.s $vr2, $vr2 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 +; CHECK-NEXT: 
xvpickve.w $xr1, $xr0, 6 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrz.s $vr1, $vr1 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrz.s $vr1, $vr1 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 48 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrz.s $vr1, $vr1 +; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0 +; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0 +; CHECK-NEXT: vfrintrz.s $vr3, $vr3 +; CHECK-NEXT: vextrins.w $vr3, $vr1, 16 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrz.s $vr1, $vr1 +; CHECK-NEXT: vextrins.w $vr3, $vr1, 32 +; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 +; CHECK-NEXT: vfrintrz.s $vr0, $vr0 +; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 +; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2 +; CHECK-NEXT: xvst $xr3, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %r = call <8 x float> @llvm.trunc.v8f32(<8 x float> %v0) + store <8 x float> %r, ptr %res + ret void +} + +;; trunc +define void @trunc_v4f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: trunc_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrz.d $vr1, $vr1 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2 +; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0 +; CHECK-NEXT: vfrintrz.d $vr2, $vr2 +; CHECK-NEXT: vextrins.d $vr2, $vr1, 16 +; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrz.d $vr1, $vr1 +; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vfrintrz.d $vr0, $vr0 +; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 +; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %r = call <4 x double> 
@llvm.trunc.v4f64(<4 x double> %v0) + store <4 x double> %r, ptr %res + ret void +} + +;; roundevenf +define void @roundeven_v8f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: roundeven_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrne.s $vr1, $vr1 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4 +; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 +; CHECK-NEXT: vfrintrne.s $vr2, $vr2 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrne.s $vr1, $vr1 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrne.s $vr1, $vr1 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 48 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrne.s $vr1, $vr1 +; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0 +; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0 +; CHECK-NEXT: vfrintrne.s $vr3, $vr3 +; CHECK-NEXT: vextrins.w $vr3, $vr1, 16 +; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrne.s $vr1, $vr1 +; CHECK-NEXT: vextrins.w $vr3, $vr1, 32 +; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 +; CHECK-NEXT: vfrintrne.s $vr0, $vr0 +; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 +; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2 +; CHECK-NEXT: xvst $xr3, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %r = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %v0) + store <8 x float> %r, ptr %res + ret void +} + +;; roundeven +define void @roundeven_v4f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: roundeven_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrne.d $vr1, $vr1 +; CHECK-NEXT: 
xvpickve.d $xr2, $xr0, 2 +; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0 +; CHECK-NEXT: vfrintrne.d $vr2, $vr2 +; CHECK-NEXT: vextrins.d $vr2, $vr1, 16 +; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrne.d $vr1, $vr1 +; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vfrintrne.d $vr0, $vr0 +; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 +; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %r = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %v0) + store <4 x double> %r, ptr %res + ret void +} + +declare <8 x float> @llvm.ceil.v8f32(<8 x float>) +declare <4 x double> @llvm.ceil.v4f64(<4 x double>) +declare <8 x float> @llvm.floor.v8f32(<8 x float>) +declare <4 x double> @llvm.floor.v4f64(<4 x double>) +declare <8 x float> @llvm.trunc.v8f32(<8 x float>) +declare <4 x double> @llvm.trunc.v4f64(<4 x double>) +declare <8 x float> @llvm.roundeven.v8f32(<8 x float>) +declare <4 x double> @llvm.roundeven.v4f64(<4 x double>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll new file mode 100644 index 0000000000000..1ca6290a2239b --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll @@ -0,0 +1,212 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +;; ceilf +define void @ceil_v4f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: ceil_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrp.s $vr1, $vr1 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0 +; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 +; CHECK-NEXT: vfrintrp.s $vr2, $vr2 +; CHECK-NEXT: 
vextrins.w $vr2, $vr1, 16 +; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrp.s $vr1, $vr1 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 +; CHECK-NEXT: vfrintrp.s $vr0, $vr0 +; CHECK-NEXT: vextrins.w $vr2, $vr0, 48 +; CHECK-NEXT: vst $vr2, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %r = call <4 x float> @llvm.ceil.v4f32(<4 x float> %v0) + store <4 x float> %r, ptr %res + ret void +} + +;; ceil +define void @ceil_v2f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: ceil_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrp.d $vr1, $vr1 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vfrintrp.d $vr0, $vr0 +; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %r = call <2 x double> @llvm.ceil.v2f64(<2 x double> %v0) + store <2 x double> %r, ptr %res + ret void +} + +;; floorf +define void @floor_v4f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: floor_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrm.s $vr1, $vr1 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0 +; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 +; CHECK-NEXT: vfrintrm.s $vr2, $vr2 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 +; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrm.s $vr1, $vr1 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 +; CHECK-NEXT: vfrintrm.s $vr0, $vr0 +; CHECK-NEXT: vextrins.w $vr2, $vr0, 48 +; CHECK-NEXT: vst $vr2, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x 
float>, ptr %a0 + %r = call <4 x float> @llvm.floor.v4f32(<4 x float> %v0) + store <4 x float> %r, ptr %res + ret void +} + +;; floor +define void @floor_v2f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: floor_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrm.d $vr1, $vr1 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vfrintrm.d $vr0, $vr0 +; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %r = call <2 x double> @llvm.floor.v2f64(<2 x double> %v0) + store <2 x double> %r, ptr %res + ret void +} + +;; truncf +define void @trunc_v4f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: trunc_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrz.s $vr1, $vr1 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0 +; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 +; CHECK-NEXT: vfrintrz.s $vr2, $vr2 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 +; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrz.s $vr1, $vr1 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 +; CHECK-NEXT: vfrintrz.s $vr0, $vr0 +; CHECK-NEXT: vextrins.w $vr2, $vr0, 48 +; CHECK-NEXT: vst $vr2, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %r = call <4 x float> @llvm.trunc.v4f32(<4 x float> %v0) + store <4 x float> %r, ptr %res + ret void +} + +;; trunc +define void @trunc_v2f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: trunc_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrz.d $vr1, $vr1 +; CHECK-NEXT: 
vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vfrintrz.d $vr0, $vr0 +; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %r = call <2 x double> @llvm.trunc.v2f64(<2 x double> %v0) + store <2 x double> %r, ptr %res + ret void +} + +;; roundevenf +define void @roundeven_v4f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: roundeven_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrne.s $vr1, $vr1 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0 +; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 +; CHECK-NEXT: vfrintrne.s $vr2, $vr2 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 +; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrne.s $vr1, $vr1 +; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 +; CHECK-NEXT: vfrintrne.s $vr0, $vr0 +; CHECK-NEXT: vextrins.w $vr2, $vr0, 48 +; CHECK-NEXT: vst $vr2, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %r = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %v0) + store <4 x float> %r, ptr %res + ret void +} + +;; roundeven +define void @roundeven_v2f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: roundeven_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: vfrintrne.d $vr1, $vr1 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vfrintrne.d $vr0, $vr0 +; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %r = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %v0) + store <2 x double> %r, ptr %res + ret void +} + +declare <4 x float> 
@llvm.ceil.v4f32(<4 x float>) +declare <2 x double> @llvm.ceil.v2f64(<2 x double>) +declare <4 x float> @llvm.floor.v4f32(<4 x float>) +declare <2 x double> @llvm.floor.v2f64(<2 x double>) +declare <4 x float> @llvm.trunc.v4f32(<4 x float>) +declare <2 x double> @llvm.trunc.v2f64(<2 x double>) +declare <4 x float> @llvm.roundeven.v4f32(<4 x float>) +declare <2 x double> @llvm.roundeven.v2f64(<2 x double>) From 3bc13344846e47c7c75c0612583e3fcee27a0ca5 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 31 Oct 2025 05:59:24 +0000 Subject: [PATCH 294/539] Revert "[compiler-rt] Default to Lit's Internal Shell" This reverts commit 16ab8c0026ab80493089663a315d28c4cf9d1794. It appears this broke a couple of buildbots: 1. https://lab.llvm.org/buildbot/#/builders/193/builds/11847 2. https://lab.llvm.org/buildbot/#/builders/161/builds/8736 Reverting for now so I have a chance to investigate. --- compiler-rt/test/lit.common.cfg.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index 1468c0742a766..9d2f02189b8bd 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -113,17 +113,12 @@ def push_dynamic_library_lookup_path(config, new_path): config.environment[dynamic_library_lookup_var] = new_ld_library_path_64 -# TODO: Consolidate the logic for turning on the internal shell by default for all LLVM test suites. -# See https://github.com/llvm/llvm-project/issues/106636 for more details. -# # Choose between lit's internal shell pipeline runner and a real shell. If # LIT_USE_INTERNAL_SHELL is in the environment, we use that as an override. 
-use_lit_shell = True -lit_shell_env = os.environ.get("LIT_USE_INTERNAL_SHELL") -if lit_shell_env: - use_lit_shell = lit.util.pythonize_bool(lit_shell_env) +use_lit_shell = os.environ.get("LIT_USE_INTERNAL_SHELL") if use_lit_shell: - execute_external = True + # 0 is external, "" is default, and everything else is internal. + execute_external = use_lit_shell == "0" else: # Otherwise we default to internal on Windows and external elsewhere, as # bash on Windows is usually very slow. From 3e2f714d688b2906c2b290f4218178174aa2f77c Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Fri, 31 Oct 2025 14:13:30 +0800 Subject: [PATCH 295/539] [LoongArch][NFC] Pre-commit tests for vector type avg{floor/ceil}{s/u} (#165821) --- .../lasx/ir-instruction/avgfloor-ceil.ll | 379 ++++++++++++++++++ .../lsx/ir-instruction/avgfloor-ceil.ll | 379 ++++++++++++++++++ 2 files changed, 758 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avgfloor-ceil.ll create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avgfloor-ceil.ll diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avgfloor-ceil.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avgfloor-ceil.ll new file mode 100644 index 0000000000000..c82adcb250c64 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avgfloor-ceil.ll @@ -0,0 +1,379 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @xvavg_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.b $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.b $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + 
%va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %ea = sext <32 x i8> %va to <32 x i16> + %eb = sext <32 x i8> %vb to <32 x i16> + %add = add <32 x i16> %ea, %eb + %shr = lshr <32 x i16> %add, + %r = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %r, ptr %res + ret void +} + +define void @xvavg_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.h $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.h $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %ea = sext <16 x i16> %va to <16 x i32> + %eb = sext <16 x i16> %vb to <16 x i32> + %add = add <16 x i32> %ea, %eb + %shr = lshr <16 x i32> %add, + %r = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %r, ptr %res + ret void +} + +define void @xvavg_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.w $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.w $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %ea = sext <8 x i32> %va to <8 x i64> + %eb = sext <8 x i32> %vb to <8 x i64> + %add = add <8 x i64> %ea, %eb + %shr = lshr <8 x i64> %add, + %r = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %r, ptr %res + ret void +} + +define void @xvavg_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.d $xr0, $xr0, 1 +; CHECK-NEXT: 
xvadd.d $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %ea = sext <4 x i64> %va to <4 x i128> + %eb = sext <4 x i64> %vb to <4 x i128> + %add = add <4 x i128> %ea, %eb + %shr = lshr <4 x i128> %add, + %r = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %r, ptr %res + ret void +} + +define void @xvavg_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.b $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %ea = zext <32 x i8> %va to <32 x i16> + %eb = zext <32 x i8> %vb to <32 x i16> + %add = add <32 x i16> %ea, %eb + %shr = lshr <32 x i16> %add, + %r = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %r, ptr %res + ret void +} + +define void @xvavg_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.h $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.h $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %ea = zext <16 x i16> %va to <16 x i32> + %eb = zext <16 x i16> %vb to <16 x i32> + %add = add <16 x i32> %ea, %eb + %shr = lshr <16 x i32> %add, + %r = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %r, ptr %res + ret void +} + +define void @xvavg_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.w $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.w $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %ea = zext <8 x i32> %va to <8 x i64> + %eb = zext <8 x i32> %vb to <8 x i64> + %add = add <8 x i64> %ea, %eb + %shr = lshr <8 x i64> %add, + %r = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %r, ptr %res + ret void +} + +define void @xvavg_du(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.d $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.d $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %ea = zext <4 x i64> %va to <4 x i128> + %eb = zext <4 x i64> %vb to <4 x i128> + %add = add <4 x i128> %ea, %eb + %shr = lshr <4 x i128> %add, + %r = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %r, ptr %res + ret void +} + +define void @xvavgr_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.b $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.b $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %ea = sext <32 x i8> %va to <32 x i16> + %eb = sext <32 x i8> %vb to <32 x i16> + %add = add <32 x i16> %ea, %eb + %add1 = add <32 x i16> %add, + %shr = lshr <32 x i16> %add1, + %r = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %r, ptr %res + ret void +} + +define void @xvavgr_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_h: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.h $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.h $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %ea = sext <16 x i16> %va to <16 x i32> + %eb = sext <16 x i16> %vb to <16 x i32> + %add = add <16 x i32> %ea, %eb + %add1 = add <16 x i32> %add, + %shr = lshr <16 x i32> %add1, + %r = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %r, ptr %res + ret void +} + +define void @xvavgr_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.w $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.w $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %ea = sext <8 x i32> %va to <8 x i64> + %eb = sext <8 x i32> %vb to <8 x i64> + %add = add <8 x i64> %ea, %eb + %add1 = add <8 x i64> %add, + %shr = lshr <8 x i64> %add1, + %r = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %r, ptr %res + ret void +} + +define void @xvavgr_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.d $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.d $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %ea = sext <4 x i64> %va to <4 x i128> + %eb = sext <4 x i64> %vb to <4 x i128> + %add = add <4 x i128> %ea, %eb + %add1 = add <4 x i128> %add, + %shr = lshr <4 x i128> %add1, + %r = trunc <4 x 
i128> %shr to <4 x i64> + store <4 x i64> %r, ptr %res + ret void +} + +define void @xvavgr_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.b $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %ea = zext <32 x i8> %va to <32 x i16> + %eb = zext <32 x i8> %vb to <32 x i16> + %add = add <32 x i16> %ea, %eb + %add1 = add <32 x i16> %add, + %shr = lshr <32 x i16> %add1, + %r = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %r, ptr %res + ret void +} + +define void @xvavgr_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.h $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.h $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %ea = zext <16 x i16> %va to <16 x i32> + %eb = zext <16 x i16> %vb to <16 x i32> + %add = add <16 x i32> %ea, %eb + %add1 = add <16 x i32> %add, + %shr = lshr <16 x i32> %add1, + %r = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %r, ptr %res + ret void +} + +define void @xvavgr_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.w $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.w $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %ea 
= zext <8 x i32> %va to <8 x i64> + %eb = zext <8 x i32> %vb to <8 x i64> + %add = add <8 x i64> %ea, %eb + %add1 = add <8 x i64> %add, + %shr = lshr <8 x i64> %add1, + %r = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %r, ptr %res + ret void +} + +define void @xvavgr_du(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.d $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.d $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %ea = zext <4 x i64> %va to <4 x i128> + %eb = zext <4 x i64> %vb to <4 x i128> + %add = add <4 x i128> %ea, %eb + %add1 = add <4 x i128> %add, + %shr = lshr <4 x i128> %add1, + %r = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %r, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avgfloor-ceil.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avgfloor-ceil.ll new file mode 100644 index 0000000000000..bb4df64a48284 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avgfloor-ceil.ll @@ -0,0 +1,379 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @vavg_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.b $vr0, $vr0, 1 +; CHECK-NEXT: vadd.b $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %ea = sext <16 x i8> %va 
to <16 x i16> + %eb = sext <16 x i8> %vb to <16 x i16> + %add = add <16 x i16> %ea, %eb + %shr = lshr <16 x i16> %add, + %r = trunc <16 x i16> %shr to <16 x i8> + store <16 x i8> %r, ptr %res + ret void +} + +define void @vavg_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.h $vr0, $vr0, 1 +; CHECK-NEXT: vadd.h $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %ea = sext <8 x i16> %va to <8 x i32> + %eb = sext <8 x i16> %vb to <8 x i32> + %add = add <8 x i32> %ea, %eb + %shr = lshr <8 x i32> %add, + %r = trunc <8 x i32> %shr to <8 x i16> + store <8 x i16> %r, ptr %res + ret void +} + +define void @vavg_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.w $vr0, $vr0, 1 +; CHECK-NEXT: vadd.w $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %ea = sext <4 x i32> %va to <4 x i64> + %eb = sext <4 x i32> %vb to <4 x i64> + %add = add <4 x i64> %ea, %eb + %shr = lshr <4 x i64> %add, + %r = trunc <4 x i64> %shr to <4 x i32> + store <4 x i32> %r, ptr %res + ret void +} + +define void @vavg_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.d $vr0, $vr0, 1 +; CHECK-NEXT: vadd.d $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 
x i64>, ptr %b + %ea = sext <2 x i64> %va to <2 x i128> + %eb = sext <2 x i64> %vb to <2 x i128> + %add = add <2 x i128> %ea, %eb + %shr = lshr <2 x i128> %add, + %r = trunc <2 x i128> %shr to <2 x i64> + store <2 x i64> %r, ptr %res + ret void +} + +define void @vavg_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.b $vr0, $vr0, 1 +; CHECK-NEXT: vadd.b $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %ea = zext <16 x i8> %va to <16 x i16> + %eb = zext <16 x i8> %vb to <16 x i16> + %add = add <16 x i16> %ea, %eb + %shr = lshr <16 x i16> %add, + %r = trunc <16 x i16> %shr to <16 x i8> + store <16 x i8> %r, ptr %res + ret void +} + +define void @vavg_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.h $vr0, $vr0, 1 +; CHECK-NEXT: vadd.h $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %ea = zext <8 x i16> %va to <8 x i32> + %eb = zext <8 x i16> %vb to <8 x i32> + %add = add <8 x i32> %ea, %eb + %shr = lshr <8 x i32> %add, + %r = trunc <8 x i32> %shr to <8 x i16> + store <8 x i16> %r, ptr %res + ret void +} + +define void @vavg_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.w $vr0, $vr0, 1 +; CHECK-NEXT: vadd.w $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret 
+entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %ea = zext <4 x i32> %va to <4 x i64> + %eb = zext <4 x i32> %vb to <4 x i64> + %add = add <4 x i64> %ea, %eb + %shr = lshr <4 x i64> %add, + %r = trunc <4 x i64> %shr to <4 x i32> + store <4 x i32> %r, ptr %res + ret void +} + +define void @vavg_du(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.d $vr0, $vr0, 1 +; CHECK-NEXT: vadd.d $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ea = zext <2 x i64> %va to <2 x i128> + %eb = zext <2 x i64> %vb to <2 x i128> + %add = add <2 x i128> %ea, %eb + %shr = lshr <2 x i128> %add, + %r = trunc <2 x i128> %shr to <2 x i64> + store <2 x i64> %r, ptr %res + ret void +} + +define void @vavgr_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.b $vr0, $vr0, 1 +; CHECK-NEXT: vsub.b $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %ea = sext <16 x i8> %va to <16 x i16> + %eb = sext <16 x i8> %vb to <16 x i16> + %add = add <16 x i16> %ea, %eb + %add1 = add <16 x i16> %add, + %shr = lshr <16 x i16> %add1, + %r = trunc <16 x i16> %shr to <16 x i8> + store <16 x i8> %r, ptr %res + ret void +} + +define void @vavgr_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.h $vr0, $vr0, 1 +; 
CHECK-NEXT: vsub.h $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %ea = sext <8 x i16> %va to <8 x i32> + %eb = sext <8 x i16> %vb to <8 x i32> + %add = add <8 x i32> %ea, %eb + %add1 = add <8 x i32> %add, + %shr = lshr <8 x i32> %add1, + %r = trunc <8 x i32> %shr to <8 x i16> + store <8 x i16> %r, ptr %res + ret void +} + +define void @vavgr_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.w $vr0, $vr0, 1 +; CHECK-NEXT: vsub.w $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %ea = sext <4 x i32> %va to <4 x i64> + %eb = sext <4 x i32> %vb to <4 x i64> + %add = add <4 x i64> %ea, %eb + %add1 = add <4 x i64> %add, + %shr = lshr <4 x i64> %add1, + %r = trunc <4 x i64> %shr to <4 x i32> + store <4 x i32> %r, ptr %res + ret void +} + +define void @vavgr_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.d $vr0, $vr0, 1 +; CHECK-NEXT: vsub.d $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ea = sext <2 x i64> %va to <2 x i128> + %eb = sext <2 x i64> %vb to <2 x i128> + %add = add <2 x i128> %ea, %eb + %add1 = add <2 x i128> %add, + %shr = lshr <2 x i128> %add1, + %r = trunc <2 x i128> %shr to <2 x i64> + store <2 x i64> %r, ptr %res + ret void +} + +define void @vavgr_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; 
CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.b $vr0, $vr0, 1 +; CHECK-NEXT: vsub.b $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %ea = zext <16 x i8> %va to <16 x i16> + %eb = zext <16 x i8> %vb to <16 x i16> + %add = add <16 x i16> %ea, %eb + %add1 = add <16 x i16> %add, + %shr = lshr <16 x i16> %add1, + %r = trunc <16 x i16> %shr to <16 x i8> + store <16 x i8> %r, ptr %res + ret void +} + +define void @vavgr_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.h $vr0, $vr0, 1 +; CHECK-NEXT: vsub.h $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %ea = zext <8 x i16> %va to <8 x i32> + %eb = zext <8 x i16> %vb to <8 x i32> + %add = add <8 x i32> %ea, %eb + %add1 = add <8 x i32> %add, + %shr = lshr <8 x i32> %add1, + %r = trunc <8 x i32> %shr to <8 x i16> + store <8 x i16> %r, ptr %res + ret void +} + +define void @vavgr_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.w $vr0, $vr0, 1 +; CHECK-NEXT: vsub.w $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %ea = zext <4 x i32> %va to <4 x i64> + %eb = zext <4 x i32> %vb to <4 x i64> + %add = add <4 x i64> %ea, %eb + %add1 = add <4 x i64> %add, + %shr = lshr <4 x i64> %add1, + %r = trunc <4 x i64> %shr to <4 x i32> + store <4 x i32> %r, ptr %res + ret void +} + 
+define void @vavgr_du(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.d $vr0, $vr0, 1 +; CHECK-NEXT: vsub.d $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ea = zext <2 x i64> %va to <2 x i128> + %eb = zext <2 x i64> %vb to <2 x i128> + %add = add <2 x i128> %ea, %eb + %add1 = add <2 x i128> %add, + %shr = lshr <2 x i128> %add1, + %r = trunc <2 x i128> %shr to <2 x i64> + store <2 x i64> %r, ptr %res + ret void +} From 9ad90bdcdc925c765dd6b92ec9fc21bee5d1990c Mon Sep 17 00:00:00 2001 From: Jianjian Guan Date: Fri, 31 Oct 2025 14:42:56 +0800 Subject: [PATCH 296/539] [RISCV][GISel] Support select vector store instrinsics (#165500) Include Unit-stride, Strided, Mask store. --- .../RISCV/GISel/RISCVInstructionSelector.cpp | 82 +- llvm/test/CodeGen/RISCV/GlobalISel/rvv/vse.ll | 1575 +++++++++++++++ llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsm.ll | 139 ++ .../test/CodeGen/RISCV/GlobalISel/rvv/vsse.ll | 1724 +++++++++++++++++ 4 files changed, 3505 insertions(+), 15 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vse.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsm.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsse.ll diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 81981732ee080..282cf5d681685 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -92,6 +92,10 @@ class RISCVInstructionSelector : public InstructionSelector { void emitFence(AtomicOrdering FenceOrdering, SyncScope::ID FenceSSID, MachineIRBuilder &MIB) const; bool selectUnmergeValues(MachineInstr &MI, 
MachineIRBuilder &MIB) const; + void addVectorLoadStoreOperands(MachineInstr &I, + SmallVectorImpl &SrcOps, + unsigned &CurOp, bool IsMasked, + bool IsStrided) const; bool selectIntrinsicWithSideEffects(MachineInstr &I, MachineIRBuilder &MIB) const; @@ -716,6 +720,26 @@ static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { return GenericOpc; } +void RISCVInstructionSelector::addVectorLoadStoreOperands( + MachineInstr &I, SmallVectorImpl &SrcOps, unsigned &CurOp, + bool IsMasked, bool IsStrided) const { + // Base Pointer + auto PtrReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PtrReg); + + // Stride + if (IsStrided) { + auto StrideReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(StrideReg); + } + + // Mask + if (IsMasked) { + auto MaskReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(MaskReg); + } +} + bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( MachineInstr &I, MachineIRBuilder &MIB) const { // Find the intrinsic ID. @@ -752,21 +776,7 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( SrcOps.push_back(Register(RISCV::NoRegister)); } - // Base Pointer - auto PtrReg = I.getOperand(CurOp++).getReg(); - SrcOps.push_back(PtrReg); - - // Stride - if (IsStrided) { - auto StrideReg = I.getOperand(CurOp++).getReg(); - SrcOps.push_back(StrideReg); - } - - // Mask - if (IsMasked) { - auto MaskReg = I.getOperand(CurOp++).getReg(); - SrcOps.push_back(MaskReg); - } + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, IsStrided); RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); const RISCV::VLEPseudo *P = @@ -795,6 +805,48 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( I.eraseFromParent(); return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); } + case Intrinsic::riscv_vsm: + case Intrinsic::riscv_vse: + case Intrinsic::riscv_vse_mask: + case Intrinsic::riscv_vsse: + case Intrinsic::riscv_vsse_mask: { + bool IsMasked = IntrinID == 
Intrinsic::riscv_vse_mask || + IntrinID == Intrinsic::riscv_vsse_mask; + bool IsStrided = IntrinID == Intrinsic::riscv_vsse || + IntrinID == Intrinsic::riscv_vsse_mask; + LLT VT = MRI->getType(I.getOperand(1).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Sources + unsigned CurOp = 1; + SmallVector SrcOps; // Source registers. + + // Store value + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, IsStrided); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + const RISCV::VSEPseudo *P = RISCV::getVSEPseudo( + IsMasked, IsStrided, Log2SEW, static_cast(LMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } } } diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vse.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vse.ll new file mode 100644 index 0000000000000..785d9fc6a7970 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vse.ll @@ -0,0 +1,1575 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare void @llvm.riscv.vse.nxv1i64( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1i64_nxv1i64( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; 
CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1i64( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1i64( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv1i64_nxv1i64( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1i64( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +define void @intrinsic_vse_allonesmask_v_nxv1i64_nxv1i64( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_allonesmask_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1i64( + %0, + ptr %1, + splat (i1 true), + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2i64( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2i64_nxv2i64( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2i64( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2i64( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv2i64_nxv2i64( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2i64( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4i64( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4i64_nxv4i64( %0, ptr %1, iXLen %2) nounwind { +; 
CHECK-LABEL: intrinsic_vse_v_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4i64( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4i64( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv4i64_nxv4i64( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4i64( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8i64( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8i64_nxv8i64( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8i64( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8i64( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv8i64_nxv8i64( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8i64( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1f64( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1f64_nxv1f64( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1f64_nxv1f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1f64( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void 
@llvm.riscv.vse.mask.nxv1f64( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv1f64_nxv1f64( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1f64_nxv1f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1f64( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2f64( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2f64_nxv2f64( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2f64_nxv2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2f64( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2f64( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv2f64_nxv2f64( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2f64_nxv2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2f64( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4f64( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4f64_nxv4f64( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4f64_nxv4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4f64( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4f64( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv4f64_nxv4f64( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4f64_nxv4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; 
CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4f64( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8f64( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8f64_nxv8f64( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8f64_nxv8f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8f64( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8f64( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv8f64_nxv8f64( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8f64_nxv8f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8f64( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1i32( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1i32_nxv1i32( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1i32( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1i32( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv1i32_nxv1i32( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1i32( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2i32( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2i32_nxv2i32( %0, ptr %1, 
iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2i32( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2i32( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv2i32_nxv2i32( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2i32( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4i32( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4i32_nxv4i32( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4i32( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4i32( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv4i32_nxv4i32( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4i32( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8i32( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8i32_nxv8i32( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8i32( + %0, + ptr %1, + iXLen %2) + + ret 
void +} + +declare void @llvm.riscv.vse.mask.nxv8i32( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv8i32_nxv8i32( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8i32( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv16i32( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv16i32_nxv16i32( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv16i32( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv16i32( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv16i32_nxv16i32( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv16i32( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1f32( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1f32_nxv1f32( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1f32_nxv1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1f32( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1f32( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv1f32_nxv1f32( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1f32_nxv1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: 
vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1f32( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2f32( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2f32_nxv2f32( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2f32_nxv2f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2f32( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2f32( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv2f32_nxv2f32( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2f32_nxv2f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2f32( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4f32( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4f32_nxv4f32( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4f32_nxv4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4f32( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4f32( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv4f32_nxv4f32( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4f32_nxv4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4f32( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8f32( + , + ptr, + iXLen); + +define void 
@intrinsic_vse_v_nxv8f32_nxv8f32( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8f32_nxv8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8f32( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8f32( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv8f32_nxv8f32( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8f32_nxv8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8f32( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv16f32( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv16f32_nxv16f32( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv16f32_nxv16f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv16f32( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv16f32( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv16f32_nxv16f32( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv16f32_nxv16f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv16f32( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1i16( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1i16_nxv1i16( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vse.nxv1i16( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1i16( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv1i16_nxv1i16( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1i16( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2i16( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2i16_nxv2i16( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2i16( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2i16( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv2i16_nxv2i16( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2i16( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4i16( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4i16_nxv4i16( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4i16( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4i16( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv4i16_nxv4i16( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: 
intrinsic_vse_mask_v_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4i16( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8i16( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8i16_nxv8i16( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8i16( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8i16( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv8i16_nxv8i16( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8i16( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv16i16( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv16i16_nxv16i16( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv16i16( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv16i16( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv16i16_nxv16i16( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv16i16( + %0, + ptr %1, + %2, + iXLen %3) + + ret 
void +} + +declare void @llvm.riscv.vse.nxv32i16( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv32i16_nxv32i16( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv32i16( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv32i16( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv32i16_nxv32i16( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv32i16( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1f16( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1f16_nxv1f16( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1f16_nxv1f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1f16( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1f16( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv1f16_nxv1f16( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1f16_nxv1f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1f16( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2f16( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2f16_nxv2f16( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2f16_nxv2f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, 
e16, mf2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2f16( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2f16( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv2f16_nxv2f16( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2f16_nxv2f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2f16( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4f16( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4f16_nxv4f16( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4f16_nxv4f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4f16( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4f16( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv4f16_nxv4f16( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4f16_nxv4f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4f16( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8f16( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8f16_nxv8f16( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8f16_nxv8f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8f16( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8f16( + , + ptr, + , + iXLen); + +define void 
@intrinsic_vse_mask_v_nxv8f16_nxv8f16( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8f16_nxv8f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8f16( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv16f16( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv16f16_nxv16f16( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv16f16_nxv16f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv16f16( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv16f16( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv16f16_nxv16f16( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv16f16_nxv16f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv16f16( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv32f16( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv32f16_nxv32f16( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv32f16_nxv32f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv32f16( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv32f16( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv32f16_nxv32f16( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv32f16_nxv32f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv32f16( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1i8( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1i8_nxv1i8( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1i8( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1i8( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv1i8_nxv1i8( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1i8( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2i8( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2i8_nxv2i8( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2i8( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2i8( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv2i8_nxv2i8( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2i8( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4i8( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4i8_nxv4i8( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4i8_nxv4i8: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4i8( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4i8( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv4i8_nxv4i8( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4i8( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8i8( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8i8_nxv8i8( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8i8( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8i8( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv8i8_nxv8i8( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8i8( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv16i8( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv16i8_nxv16i8( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv16i8( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv16i8( + , + ptr, + , + iXLen); + +define void 
@intrinsic_vse_mask_v_nxv16i8_nxv16i8( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv16i8( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv32i8( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv32i8_nxv32i8( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv32i8( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv32i8( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv32i8_nxv32i8( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv32i8( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv64i8( + , + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv64i8_nxv64i8( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv64i8( + %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv64i8( + , + ptr, + , + iXLen); + +define void @intrinsic_vse_mask_v_nxv64i8_nxv64i8( %0, ptr %1, %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vse.mask.nxv64i8( + %0, + ptr %1, + %2, + iXLen %3) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsm.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsm.ll new file mode 100644 index 0000000000000..5237536c07740 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsm.ll @@ -0,0 +1,139 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -global-isel -verify-machineinstrs | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -global-isel -verify-machineinstrs | FileCheck %s + +declare void @llvm.riscv.vsm.nxv1i1(, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv1i1( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv1i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv1i1( %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv2i1(, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv2i1( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv2i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv2i1( %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv4i1(, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv4i1( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv4i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv4i1( %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv8i1(, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv8i1( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv8i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, 
a1, e8, m1, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv8i1( %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv16i1(, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv16i1( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv16i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv16i1( %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv32i1(, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv32i1( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv32i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv32i1( %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv64i1(, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv64i1( %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv64i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv64i1( %0, ptr %1, iXLen %2) + ret void +} + +declare @llvm.riscv.vmseq.nxv1i16( + , + , + iXLen); + +; Make sure we can use the vsetvli from the producing instruction. 
+define void @test_vsetvli_i16( %0, %1, ptr %2, iXLen %3) nounwind { +; CHECK-LABEL: test_vsetvli_i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vmseq.vv v8, v8, v9 +; CHECK-NEXT: vsm.v v8, (a0) +; CHECK-NEXT: ret +entry: + %a = call @llvm.riscv.vmseq.nxv1i16( + %0, + %1, + iXLen %3) + call void @llvm.riscv.vsm.nxv1i1( %a, ptr %2, iXLen %3) + ret void +} + +declare @llvm.riscv.vmseq.nxv1i32( + , + , + iXLen); + +define void @test_vsetvli_i32( %0, %1, ptr %2, iXLen %3) nounwind { +; CHECK-LABEL: test_vsetvli_i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vmseq.vv v8, v8, v9 +; CHECK-NEXT: vsm.v v8, (a0) +; CHECK-NEXT: ret +entry: + %a = call @llvm.riscv.vmseq.nxv1i32( + %0, + %1, + iXLen %3) + call void @llvm.riscv.vsm.nxv1i1( %a, ptr %2, iXLen %3) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsse.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsse.ll new file mode 100644 index 0000000000000..b7609ff5fd1cd --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsse.ll @@ -0,0 +1,1724 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare void @llvm.riscv.vsse.nxv1i64( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1i64_nxv1i64( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1i64( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsse.mask.nxv1i64( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1i64_nxv1i64( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1i64( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +define void @intrinsic_vsse_allonesmask_v_nxv1i64_nxv1i64( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_allonesmask_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1i64( + %0, + ptr %1, + iXLen %2, + splat (i1 true), + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2i64( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2i64_nxv2i64( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2i64( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2i64( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2i64_nxv2i64( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2i64( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4i64( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4i64_nxv4i64( %0, ptr %1, iXLen %2, iXLen %3) 
nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4i64( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4i64( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4i64_nxv4i64( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4i64( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8i64( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8i64_nxv8i64( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8i64( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8i64( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8i64_nxv8i64( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8i64( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1f64( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1f64_nxv1f64( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1f64_nxv1f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, 
a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1f64( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1f64( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1f64_nxv1f64( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1f64_nxv1f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1f64( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2f64( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2f64_nxv2f64( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2f64_nxv2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2f64( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2f64( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2f64_nxv2f64( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2f64_nxv2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2f64( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4f64( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4f64_nxv4f64( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4f64_nxv4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsse.nxv4f64( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4f64( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4f64_nxv4f64( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4f64_nxv4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4f64( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8f64( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8f64_nxv8f64( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8f64_nxv8f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8f64( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8f64( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8f64_nxv8f64( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8f64_nxv8f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8f64( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1i32( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1i32_nxv1i32( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1i32( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsse.mask.nxv1i32( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1i32_nxv1i32( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1i32( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2i32( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2i32_nxv2i32( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2i32( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2i32( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2i32_nxv2i32( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2i32( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4i32( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4i32_nxv4i32( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4i32( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4i32( + , + ptr, + iXLen, + , + iXLen); + +define void 
@intrinsic_vsse_mask_v_nxv4i32_nxv4i32( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4i32( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8i32( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8i32_nxv8i32( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8i32( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8i32( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8i32_nxv8i32( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8i32( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv16i32( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv16i32_nxv16i32( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv16i32( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv16i32( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv16i32_nxv16i32( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsse_mask_v_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv16i32( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1f32( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1f32_nxv1f32( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1f32_nxv1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1f32( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1f32( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1f32_nxv1f32( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1f32_nxv1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1f32( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2f32( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2f32_nxv2f32( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2f32_nxv2f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2f32( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2f32( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2f32_nxv2f32( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2f32_nxv2f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, 
ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2f32( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4f32( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4f32_nxv4f32( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4f32_nxv4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4f32( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4f32( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4f32_nxv4f32( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4f32_nxv4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4f32( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8f32( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8f32_nxv8f32( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8f32_nxv8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8f32( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8f32( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8f32_nxv8f32( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8f32_nxv8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8f32( + 
%0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv16f32( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv16f32_nxv16f32( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv16f32_nxv16f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv16f32( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv16f32( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv16f32_nxv16f32( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16f32_nxv16f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv16f32( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1i16( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1i16_nxv1i16( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1i16( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1i16( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1i16_nxv1i16( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1i16( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2i16( + , 
+ ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2i16_nxv2i16( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2i16( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2i16( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2i16_nxv2i16( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2i16( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4i16( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4i16_nxv4i16( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4i16( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4i16( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4i16_nxv4i16( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4i16( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8i16( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8i16_nxv8i16( %0, ptr %1, iXLen %2, iXLen %3) nounwind 
{ +; CHECK-LABEL: intrinsic_vsse_v_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8i16( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8i16( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8i16_nxv8i16( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8i16( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv16i16( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv16i16_nxv16i16( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv16i16( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv16i16( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv16i16_nxv16i16( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv16i16( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv32i16( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv32i16_nxv32i16( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv32i16( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv32i16( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv32i16_nxv32i16( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv32i16( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1f16( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1f16_nxv1f16( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1f16_nxv1f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1f16( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1f16( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1f16_nxv1f16( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1f16_nxv1f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1f16( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2f16( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2f16_nxv2f16( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2f16_nxv2f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsse.nxv2f16( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2f16( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2f16_nxv2f16( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2f16_nxv2f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2f16( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4f16( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4f16_nxv4f16( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4f16_nxv4f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4f16( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4f16( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4f16_nxv4f16( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4f16_nxv4f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4f16( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8f16( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8f16_nxv8f16( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8f16_nxv8f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8f16( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsse.mask.nxv8f16( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8f16_nxv8f16( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8f16_nxv8f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8f16( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv16f16( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv16f16_nxv16f16( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv16f16_nxv16f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv16f16( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv16f16( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv16f16_nxv16f16( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16f16_nxv16f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv16f16( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv32f16( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv32f16_nxv32f16( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv32f16_nxv32f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv32f16( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv32f16( + , + ptr, + iXLen, + , + iXLen); + +define void 
@intrinsic_vsse_mask_v_nxv32f16_nxv32f16( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv32f16_nxv32f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv32f16( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1i8( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1i8_nxv1i8( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1i8( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1i8( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1i8_nxv1i8( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1i8( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2i8( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2i8_nxv2i8( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf4, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2i8( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2i8( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2i8_nxv2i8( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsse_mask_v_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf4, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2i8( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4i8( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4i8_nxv4i8( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4i8( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4i8( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4i8_nxv4i8( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4i8( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8i8( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8i8_nxv8i8( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8i8( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8i8( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8i8_nxv8i8( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), 
a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8i8( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv16i8( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv16i8_nxv16i8( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv16i8( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv16i8( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv16i8_nxv16i8( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv16i8( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv32i8( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv32i8_nxv32i8( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv32i8( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv32i8( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv32i8_nxv32i8( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv32i8( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + 
ret void +} + +declare void @llvm.riscv.vsse.nxv64i8( + , + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv64i8_nxv64i8( %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv64i8( + %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv64i8( + , + ptr, + iXLen, + , + iXLen); + +define void @intrinsic_vsse_mask_v_nxv64i8_nxv64i8( %0, ptr %1, iXLen %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv64i8( + %0, + ptr %1, + iXLen %2, + %3, + iXLen %4) + + ret void +} From d25796197ab736ecca39f018f4a300ead2a9a071 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 06:00:36 -0700 Subject: [PATCH 297/539] [MLIR] Apply clang-tidy fixes for bugprone-argument-comment in ConvertVectorToLLVM.cpp (NFC) --- mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index 41d8d532757ad..69a317ecd101f 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -716,7 +716,7 @@ lowerReductionWithStartValue(ConversionPatternRewriter &rewriter, Location loc, accumulator = getOrCreateAccumulator(rewriter, loc, llvmType, accumulator); return LLVMRedIntrinOp::create(rewriter, loc, llvmType, - /*startValue=*/accumulator, vectorOperand, + /*start_value=*/accumulator, vectorOperand, fmf); } @@ -743,7 +743,7 @@ static Value 
lowerPredicatedReductionWithStartValue( Value vectorLength = createVectorLengthValue(rewriter, loc, vectorOperand.getType()); return LLVMVPRedIntrinOp::create(rewriter, loc, llvmType, - /*startValue=*/accumulator, vectorOperand, + /*satrt_value=*/accumulator, vectorOperand, mask, vectorLength); } From c4b61457a7a946dc27a371b00a0594090f0ef9fb Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 06:15:39 -0700 Subject: [PATCH 298/539] [MLIR] Apply clang-tidy fixes for readability-simplify-boolean-expr in SuperVectorize.cpp (NFC) --- mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp index e08cc6f645d71..d428fbf2886ff 100644 --- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp @@ -1106,10 +1106,7 @@ static bool isUniformDefinition(Value value, return false; } - if (!value.getType().isIntOrIndexOrFloat()) - return false; - - return true; + return value.getType().isIntOrIndexOrFloat(); } /// Generates a broadcast op for the provided uniform value using the From b4abf4eff8bcdc368bd1a7973f4eb4b6cfa56ab5 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Fri, 31 Oct 2025 15:35:41 +0800 Subject: [PATCH 299/539] [clang][SPIR][SPIRV] Don't generate constant NULL from addrspacecast generic NULL (#165353) Fix a regression caused by 1ffff05a38c9. OpenCL/SPIRV generic address space doesn't cover constant address space. 
--------- Co-authored-by: Alexey Bader --- clang/lib/CodeGen/Targets/SPIR.cpp | 3 +- clang/test/CodeGenOpenCL/amdgpu-nullptr.cl | 633 ------------------ clang/test/CodeGenOpenCL/nullptr.cl | 735 +++++++++++++++++++++ 3 files changed, 737 insertions(+), 634 deletions(-) delete mode 100644 clang/test/CodeGenOpenCL/amdgpu-nullptr.cl create mode 100644 clang/test/CodeGenOpenCL/nullptr.cl diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp index 15d0b353d748c..abd049aca0ed7 100644 --- a/clang/lib/CodeGen/Targets/SPIR.cpp +++ b/clang/lib/CodeGen/Targets/SPIR.cpp @@ -260,7 +260,8 @@ CommonSPIRTargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM, LangAS AS = QT->getUnqualifiedDesugaredType()->isNullPtrType() ? LangAS::Default : QT->getPointeeType().getAddressSpace(); - if (AS == LangAS::Default || AS == LangAS::opencl_generic) + if (AS == LangAS::Default || AS == LangAS::opencl_generic || + AS == LangAS::opencl_constant) return llvm::ConstantPointerNull::get(PT); auto &Ctx = CGM.getContext(); diff --git a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl deleted file mode 100644 index d0bcd1fccb7ce..0000000000000 --- a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl +++ /dev/null @@ -1,633 +0,0 @@ -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -O0 -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck --check-prefix=NOOPT %s -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -fcommon -emit-llvm -o - | FileCheck %s --check-prefix=COMMON - -typedef struct { - private char *p1; - local char *p2; - constant char *p3; - global char *p4; - generic char *p5; -} 
StructTy1; - -typedef struct { - constant char *p3; - global char *p4; - generic char *p5; -} StructTy2; - -// Test 0 as initializer. - -// CHECK: @private_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4 -private char *private_p = 0; - -// CHECK: @local_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4 -local char *local_p = 0; - -// CHECK: @global_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8 -global char *global_p = 0; - -// CHECK: @constant_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8 -constant char *constant_p = 0; - -// CHECK: @generic_p ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8 -generic char *generic_p = 0; - -// Test NULL as initializer. - -// CHECK: @private_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4 -private char *private_p_NULL = NULL; - -// CHECK: @local_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4 -local char *local_p_NULL = NULL; - -// CHECK: @global_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8 -global char *global_p_NULL = NULL; - -// CHECK: @constant_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8 -constant char *constant_p_NULL = NULL; - -// CHECK: @generic_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8 -generic char *generic_p_NULL = NULL; - -// Test constant folding of null pointer. -// A null pointer should be folded to a null pointer in the target address space. 
- -// CHECK: @fold_generic ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8 -generic int *fold_generic = (global int*)(generic float*)(private char*)0; - -// CHECK: @fold_priv ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr addrspace(1) null to ptr addrspace(5)), align 4 -private short *fold_priv = (private short*)(generic int*)(global void*)0; - -// CHECK: @fold_priv_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) inttoptr (i32 9 to ptr addrspace(5)), align 4 -private char *fold_priv_arith = (private char*)0 + 10; - -// CHECK: @fold_local_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) inttoptr (i32 9 to ptr addrspace(3)), align 4 -local char *fold_local_arith = (local char*)0 + 10; - -// CHECK: @fold_int ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4 -int fold_int = (int)(private void*)(generic char*)(global int*)0 + 14; - -// CHECK: @fold_int2 ={{.*}} local_unnamed_addr addrspace(1) global i32 12, align 4 -int fold_int2 = (int) ((private void*)0 + 13); - -// CHECK: @fold_int3 ={{.*}} local_unnamed_addr addrspace(1) global i32 -1, align 4 -int fold_int3 = (int) ((private int*)0); - -// CHECK: @fold_int4 ={{.*}} local_unnamed_addr addrspace(1) global i32 7, align 4 -int fold_int4 = (int) &((private int*)0)[2]; - -// CHECK: @fold_int5 ={{.*}} local_unnamed_addr addrspace(1) global i32 3, align 4 -int fold_int5 = (int) &((private StructTy1*)0)->p2; - - -// CHECK: @fold_int_local ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4 -int fold_int_local = (int)(local void*)(generic char*)(global int*)0 + 14; - -// CHECK: @fold_int2_local ={{.*}} local_unnamed_addr addrspace(1) global i32 12, align 4 -int fold_int2_local = (int) ((local void*)0 + 13); - -// CHECK: @fold_int3_local ={{.*}} local_unnamed_addr addrspace(1) global i32 -1, align 4 -int fold_int3_local = (int) ((local int*)0); - -// CHECK: @fold_int4_local ={{.*}} local_unnamed_addr 
addrspace(1) global i32 7, align 4 -int fold_int4_local = (int) &((local int*)0)[2]; - -// CHECK: @fold_int5_local ={{.*}} local_unnamed_addr addrspace(1) global i32 3, align 4 -int fold_int5_local = (int) &((local StructTy1*)0)->p2; - - -// Test static variable initialization. - -// NOOPT: @test_static_var_private.sp1 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4 -// NOOPT: @test_static_var_private.sp2 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4 -// NOOPT: @test_static_var_private.sp3 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4 -// NOOPT: @test_static_var_private.sp4 = internal addrspace(1) global ptr addrspace(5) null, align 4 -// NOOPT: @test_static_var_private.sp5 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4 -// NOOPT: @test_static_var_private.SS1 = internal addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8 -// NOOPT: @test_static_var_private.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8 - -void test_static_var_private(void) { - static private char *sp1 = 0; - static private char *sp2 = NULL; - static private char *sp3; - static private char *sp4 = (private char*)((void)0, 0); - const int x = 0; - static private char *sp5 = (private char*)x; - static StructTy1 SS1; - static StructTy2 SS2; -} - -// NOOPT: @test_static_var_local.sp1 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4 -// NOOPT: @test_static_var_local.sp2 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4 -// NOOPT: @test_static_var_local.sp3 = internal 
addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4 -// NOOPT: @test_static_var_local.sp4 = internal addrspace(1) global ptr addrspace(3) null, align 4 -// NOOPT: @test_static_var_local.sp5 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4 -// NOOPT: @test_static_var_local.SS1 = internal addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8 -// NOOPT: @test_static_var_local.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8 -void test_static_var_local(void) { - static local char *sp1 = 0; - static local char *sp2 = NULL; - static local char *sp3; - static local char *sp4 = (local char*)((void)0, 0); - const int x = 0; - static local char *sp5 = (local char*)x; - static StructTy1 SS1; - static StructTy2 SS2; -} - -// Test function-scope variable initialization. 
-// NOOPT-LABEL: @test_func_scope_var_private( -// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp1{{.*}}, align 4 -// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp2{{.*}}, align 4 -// NOOPT: store ptr addrspace(5) null, ptr addrspace(5) %sp3{{.*}}, align 4 -// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp4{{.*}}, align 4 -// NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_private.SS1, i64 32, i1 false) -// NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2{{.*}}, i8 0, i64 24, i1 false) -void test_func_scope_var_private(void) { - private char *sp1 = 0; - private char *sp2 = NULL; - private char *sp3 = (private char*)((void)0, 0); - const int x = 0; - private char *sp4 = (private char*)x; - StructTy1 SS1 = {0, 0, 0, 0, 0}; - StructTy2 SS2 = {0, 0, 0}; -} - -// Test function-scope variable initialization. 
-// NOOPT-LABEL: @test_func_scope_var_local( -// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp1{{.*}}, align 4 -// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp2{{.*}}, align 4 -// NOOPT: store ptr addrspace(3) null, ptr addrspace(5) %sp3{{.*}}, align 4 -// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp4{{.*}}, align 4 -// NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_local.SS1, i64 32, i1 false) -// NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2{{.*}}, i8 0, i64 24, i1 false) -void test_func_scope_var_local(void) { - local char *sp1 = 0; - local char *sp2 = NULL; - local char *sp3 = (local char*)((void)0, 0); - const int x = 0; - local char *sp4 = (local char*)x; - StructTy1 SS1 = {0, 0, 0, 0, 0}; - StructTy2 SS2 = {0, 0, 0}; -} - - -// Test default initialization of pointers. - -// Tentative definition of global variables with non-zero initializer -// cannot have common linkage since common linkage requires zero initialization -// and does not have explicit section. 
- -// CHECK: @p1 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4 -// COMMON: @p1 = weak local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4 -private char *p1; - -// CHECK: @p2 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4 -// COMMON: @p2 = weak local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4 -local char *p2; - -// CHECK: @p3 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8 -// COMMON: @p3 = common local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8 -constant char *p3; - -// CHECK: @p4 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8 -// COMMON: @p4 = common local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8 -global char *p4; - -// CHECK: @p5 ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8 -// COMMON: @p5 = common local_unnamed_addr addrspace(1) global ptr null, align 8 -generic char *p5; - -// Test default initialization of structure. - -// CHECK: @S1 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8 -StructTy1 S1; - -// CHECK: @S2 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy2 zeroinitializer, align 8 -StructTy2 S2; - -// Test default initialization of array. 
-// CHECK: @A1 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy1] [%struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }], align 8 -StructTy1 A1[2]; - -// CHECK: @A2 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy2] zeroinitializer, align 8 -StructTy2 A2[2]; - -// Test comparison with 0. - -// CHECK-LABEL: cmp_private -// CHECK: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5)) -void cmp_private(private char* p) { - if (p != 0) - *p = 0; -} - -// CHECK-LABEL: cmp_local -// CHECK: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3)) -void cmp_local(local char* p) { - if (p != 0) - *p = 0; -} - -// CHECK-LABEL: cmp_global -// CHECK: icmp eq ptr addrspace(1) %p, null -void cmp_global(global char* p) { - if (p != 0) - *p = 0; -} - -// CHECK-LABEL: cmp_constant -// CHECK: icmp eq ptr addrspace(4) %p, null -char cmp_constant(constant char* p) { - if (p != 0) - return *p; - else - return 0; -} - -// CHECK-LABEL: cmp_generic -// CHECK: icmp eq ptr %p, null -void cmp_generic(generic char* p) { - if (p != 0) - *p = 0; -} - -// Test comparison with NULL. 
- -// CHECK-LABEL: cmp_NULL_private -// CHECK: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5)) -void cmp_NULL_private(private char* p) { - if (p != NULL) - *p = 0; -} - -// CHECK-LABEL: cmp_NULL_local -// CHECK: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3)) -void cmp_NULL_local(local char* p) { - if (p != NULL) - *p = 0; -} - -// CHECK-LABEL: cmp_NULL_global -// CHECK: icmp eq ptr addrspace(1) %p, null -void cmp_NULL_global(global char* p) { - if (p != NULL) - *p = 0; -} - -// CHECK-LABEL: cmp_NULL_constant -// CHECK: icmp eq ptr addrspace(4) %p, null -char cmp_NULL_constant(constant char* p) { - if (p != NULL) - return *p; - else - return 0; -} - -// CHECK-LABEL: cmp_NULL_generic -// CHECK: icmp eq ptr %p, null -void cmp_NULL_generic(generic char* p) { - if (p != NULL) - *p = 0; -} - -// Test storage 0 as null pointer. -// CHECK-LABEL: test_storage_null_pointer -// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %arg_private -// CHECK: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %arg_local -// CHECK: store ptr addrspace(1) null, ptr %arg_global -// CHECK: store ptr addrspace(4) null, ptr %arg_constant -// CHECK: store ptr null, ptr %arg_generic -void test_storage_null_pointer(private char** arg_private, - local char** arg_local, - global char** arg_global, - constant char** arg_constant, - generic char** arg_generic) { - *arg_private = 0; - *arg_local = 0; - *arg_global = 0; - *arg_constant = 0; - *arg_generic = 0; -} - -// Test storage NULL as null pointer. 
-// CHECK-LABEL: test_storage_null_pointer_NULL -// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %arg_private -// CHECK: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %arg_local -// CHECK: store ptr addrspace(1) null, ptr %arg_global -// CHECK: store ptr addrspace(4) null, ptr %arg_constant -// CHECK: store ptr null, ptr %arg_generic -void test_storage_null_pointer_NULL(private char** arg_private, - local char** arg_local, - global char** arg_global, - constant char** arg_constant, - generic char** arg_generic) { - *arg_private = NULL; - *arg_local = NULL; - *arg_global = NULL; - *arg_constant = NULL; - *arg_generic = NULL; -} - -// Test pass null pointer to function as argument. -void test_pass_null_pointer_arg_calee(private char* arg_private, - local char* arg_local, - global char* arg_global, - constant char* arg_constant, - generic char* arg_generic); - -// CHECK-LABEL: test_pass_null_pointer_arg -// CHECK: call void @test_pass_null_pointer_arg_calee(ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(1) null, ptr addrspace(4) null, ptr null) -// CHECK: call void @test_pass_null_pointer_arg_calee(ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(1) null, ptr addrspace(4) null, ptr null) -void test_pass_null_pointer_arg(void) { - test_pass_null_pointer_arg_calee(0, 0, 0, 0, 0); - test_pass_null_pointer_arg_calee(NULL, NULL, NULL, NULL, NULL); -} - -// Test cast null pointer to size_t. 
-void test_cast_null_pointer_to_sizet_calee(size_t arg_private, - size_t arg_local, - size_t arg_global, - size_t arg_constant, - size_t arg_generic); - -// CHECK-LABEL: test_cast_null_pointer_to_sizet -// CHECK: call void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i64), i64 0, i64 0, i64 0) -// CHECK: call void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i64), i64 0, i64 0, i64 0) -void test_cast_null_pointer_to_sizet(void) { - test_cast_null_pointer_to_sizet_calee((size_t)((private char*)0), - (size_t)((local char*)0), - (size_t)((global char*)0), - (size_t)((constant char*)0), - (size_t)((generic char*)0)); - test_cast_null_pointer_to_sizet_calee((size_t)((private char*)NULL), - (size_t)((local char*)NULL), - (size_t)((global char*)NULL), - (size_t)((constant char*)0), // NULL cannot be casted to constant pointer since it is defined as a generic pointer - (size_t)((generic char*)NULL)); -} - -// Test comparison between null pointers. 
-#define TEST_EQ00(addr1, addr2) int test_eq00_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)0; } -#define TEST_EQ0N(addr1, addr2) int test_eq0N_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)NULL; } -#define TEST_EQN0(addr1, addr2) int test_eqN0_##addr1##_##addr2(void) { return (addr1 char*)NULL == (addr2 char*)0; } -#define TEST_EQNN(addr1, addr2) int test_eqNN_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)NULL; } -#define TEST_NE00(addr1, addr2) int test_ne00_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)0; } -#define TEST_NE0N(addr1, addr2) int test_ne0N_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)NULL; } -#define TEST_NEN0(addr1, addr2) int test_neN0_##addr1##_##addr2(void) { return (addr1 char*)NULL != (addr2 char*)0; } -#define TEST_NENN(addr1, addr2) int test_neNN_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)NULL; } -#define TEST(addr1, addr2) \ - TEST_EQ00(addr1, addr2) \ - TEST_EQ0N(addr1, addr2) \ - TEST_EQN0(addr1, addr2) \ - TEST_EQNN(addr1, addr2) \ - TEST_NE00(addr1, addr2) \ - TEST_NE0N(addr1, addr2) \ - TEST_NEN0(addr1, addr2) \ - TEST_NENN(addr1, addr2) - -// CHECK-LABEL: test_eq00_generic_private -// CHECK: ret i32 1 -// CHECK-LABEL: test_eq0N_generic_private -// CHECK: ret i32 1 -// CHECK-LABEL: test_eqN0_generic_private -// CHECK: ret i32 1 -// CHECK-LABEL: test_eqNN_generic_private -// CHECK: ret i32 1 -// CHECK-LABEL: test_ne00_generic_private -// CHECK: ret i32 0 -// CHECK-LABEL: test_ne0N_generic_private -// CHECK: ret i32 0 -// CHECK-LABEL: test_neN0_generic_private -// CHECK: ret i32 0 -// CHECK-LABEL: test_neNN_generic_private -// CHECK: ret i32 0 -TEST(generic, private) - -// CHECK-LABEL: test_eq00_generic_local -// CHECK: ret i32 1 -// CHECK-LABEL: test_eq0N_generic_local -// CHECK: ret i32 1 -// CHECK-LABEL: test_eqN0_generic_local -// CHECK: ret i32 1 -// CHECK-LABEL: test_eqNN_generic_local -// CHECK: ret i32 1 -// 
CHECK-LABEL: test_ne00_generic_local -// CHECK: ret i32 0 -// CHECK-LABEL: test_ne0N_generic_local -// CHECK: ret i32 0 -// CHECK-LABEL: test_neN0_generic_local -// CHECK: ret i32 0 -// CHECK-LABEL: test_neNN_generic_local -// CHECK: ret i32 0 -TEST(generic, local) - -// CHECK-LABEL: test_eq00_generic_global -// CHECK: ret i32 1 -// CHECK-LABEL: test_eq0N_generic_global -// CHECK: ret i32 1 -// CHECK-LABEL: test_eqN0_generic_global -// CHECK: ret i32 1 -// CHECK-LABEL: test_eqNN_generic_global -// CHECK: ret i32 1 -// CHECK-LABEL: test_ne00_generic_global -// CHECK: ret i32 0 -// CHECK-LABEL: test_ne0N_generic_global -// CHECK: ret i32 0 -// CHECK-LABEL: test_neN0_generic_global -// CHECK: ret i32 0 -// CHECK-LABEL: test_neNN_generic_global -// CHECK: ret i32 0 -TEST(generic, global) - -// CHECK-LABEL: test_eq00_generic_generic -// CHECK: ret i32 1 -// CHECK-LABEL: test_eq0N_generic_generic -// CHECK: ret i32 1 -// CHECK-LABEL: test_eqN0_generic_generic -// CHECK: ret i32 1 -// CHECK-LABEL: test_eqNN_generic_generic -// CHECK: ret i32 1 -// CHECK-LABEL: test_ne00_generic_generic -// CHECK: ret i32 0 -// CHECK-LABEL: test_ne0N_generic_generic -// CHECK: ret i32 0 -// CHECK-LABEL: test_neN0_generic_generic -// CHECK: ret i32 0 -// CHECK-LABEL: test_neNN_generic_generic -// CHECK: ret i32 0 -TEST(generic, generic) - -// CHECK-LABEL: test_eq00_constant_constant -// CHECK: ret i32 1 -TEST_EQ00(constant, constant) - -// Test cast to bool. 
- -// CHECK-LABEL: cast_bool_private -// CHECK: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5)) -void cast_bool_private(private char* p) { - if (p) - *p = 0; -} - -// CHECK-LABEL: cast_bool_local -// CHECK: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3)) -void cast_bool_local(local char* p) { - if (p) - *p = 0; -} - -// CHECK-LABEL: cast_bool_global -// CHECK: icmp eq ptr addrspace(1) %p, null -void cast_bool_global(global char* p) { - if (p) - *p = 0; -} - -// CHECK-LABEL: cast_bool_constant -// CHECK: icmp eq ptr addrspace(4) %p, null -char cast_bool_constant(constant char* p) { - if (p) - return *p; - else - return 0; -} - -// CHECK-LABEL: cast_bool_generic -// CHECK: icmp eq ptr %p, null -void cast_bool_generic(generic char* p) { - if (p) - *p = 0; -} - -// Test initialize a struct using memset. -// For large structures which is mostly zero, clang generats llvm.memset for -// the zero part and store for non-zero members. -typedef struct { - long a, b, c, d; - private char *p; -} StructTy3; - -// CHECK-LABEL: test_memset_private -// CHECK: call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 8 {{.*}}, i8 0, i64 32, i1 false) -// CHECK: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) %ptr, i32 32 -// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) [[GEP]] -// CHECK: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) {{.*}}, i32 36 -// CHECK: store i32 0, ptr addrspace(5) [[GEP1]], align 4 -void test_memset_private(private StructTy3 *ptr) { - StructTy3 S3 = {0, 0, 0, 0, 0}; - *ptr = S3; -} - -// Test casting literal 0 to pointer. -// A 0 literal casted to pointer should become a null pointer. 
- -// CHECK-LABEL: test_cast_0_to_local_ptr -// CHECK: ret ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) -local int* test_cast_0_to_local_ptr(void) { - return (local int*)0; -} - -// CHECK-LABEL: test_cast_0_to_private_ptr -// CHECK: ret ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) -private int* test_cast_0_to_private_ptr(void) { - return (private int*)0; -} - -// Test casting non-literal integer with 0 value to pointer. -// A non-literal integer expression with 0 value is casted to a pointer with -// zero value. - -// CHECK-LABEL: test_cast_int_to_ptr1_private -// CHECK: ret ptr addrspace(5) null -private int* test_cast_int_to_ptr1_private(void) { - return (private int*)((void)0, 0); -} - -// CHECK-LABEL: test_cast_int_to_ptr1_local - // CHECK: ret ptr addrspace(3) null -local int* test_cast_int_to_ptr1_local(void) { - return (local int*)((void)0, 0); -} - -// CHECK-LABEL: test_cast_int_to_ptr2 -// CHECK: ret ptr addrspace(5) null -private int* test_cast_int_to_ptr2(void) { - int x = 0; - return (private int*)x; -} - -// Test logical operations. 
-// CHECK-LABEL: test_not_nullptr -// CHECK: ret i32 1 -int test_not_nullptr(void) { - return !(private char*)NULL; -} - -// CHECK-LABEL: test_and_nullptr -// CHECK: ret i32 0 -int test_and_nullptr(int a) { - return a && ((private char*)NULL); -} - -// CHECK-LABEL: test_not_private_ptr -// CHECK: %[[lnot:.*]] = icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5)) -// CHECK: %[[lnot_ext:.*]] = zext i1 %[[lnot]] to i32 -// CHECK: ret i32 %[[lnot_ext]] -int test_not_private_ptr(private char* p) { - return !p; -} - -// CHECK-LABEL: test_not_local_ptr -// CHECK: %[[lnot:.*]] = icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3)) -// CHECK: %[[lnot_ext:.*]] = zext i1 %[[lnot]] to i32 -// CHECK: ret i32 %[[lnot_ext]] -int test_not_local_ptr(local char* p) { - return !p; -} - - -// CHECK-LABEL: test_and_ptr -// CHECK: %[[tobool:.*]] = icmp ne ptr addrspace(5) %p1, addrspacecast (ptr null to ptr addrspace(5)) -// CHECK: %[[tobool1:.*]] = icmp ne ptr addrspace(3) %p2, addrspacecast (ptr null to ptr addrspace(3)) -// CHECK: %[[res:.*]] = select i1 %[[tobool]], i1 %[[tobool1]], i1 false -// CHECK: %[[land_ext:.*]] = zext i1 %[[res]] to i32 -// CHECK: ret i32 %[[land_ext]] -int test_and_ptr(private char* p1, local char* p2) { - return p1 && p2; -} - -// Test folding of null pointer in function scope. 
-// NOOPT-LABEL: test_fold_private -// NOOPT: call void @test_fold_callee -// NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob{{.*}}, align 8 -// NOOPT: %{{.*}} = sub i64 %{{.*}}, 0 -// NOOPT: call void @test_fold_callee -// NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i32) to i64 -// NOOPT: %{{.*}} = add nsw i64 %1, %[[SEXT]] -// NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1 -void test_fold_callee(void); -void test_fold_private(void) { - global int* glob = (test_fold_callee(), (global int*)(generic char*)0); - long x = glob - (global int*)(generic char*)0; - x = x + (int)(test_fold_callee(), (private int*)(generic char*)(global short*)0); - x = x - (int)((private int*)0 == (private int*)(generic char*)0); -} - -// NOOPT-LABEL: test_fold_local -// NOOPT: call void @test_fold_callee -// NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob{{.*}}, align 8 -// NOOPT: %{{.*}} = sub i64 %{{.*}}, 0 -// NOOPT: call void @test_fold_callee -// NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i32) to i64 -// NOOPT: %{{.*}} = add nsw i64 %{{.*}}, %[[SEXT]] -// NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1 -void test_fold_local(void) { - global int* glob = (test_fold_callee(), (global int*)(generic char*)0); - long x = glob - (global int*)(generic char*)0; - x = x + (int)(test_fold_callee(), (local int*)(generic char*)(global short*)0); - x = x - (int)((local int*)0 == (local int*)(generic char*)0); -} diff --git a/clang/test/CodeGenOpenCL/nullptr.cl b/clang/test/CodeGenOpenCL/nullptr.cl new file mode 100644 index 0000000000000..976e12c0bef47 --- /dev/null +++ b/clang/test/CodeGenOpenCL/nullptr.cl @@ -0,0 +1,735 @@ +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple spir64 -emit-llvm -o - -Wno-void-pointer-to-int-cast -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast | FileCheck %s --check-prefixes=CHECK,SPIR64 +// 
RUN: %clang_cc1 -no-enable-noundef-analysis %s -O0 -cl-std=CL2.0 -include opencl-c.h -triple spir64 -emit-llvm -o - -Wno-void-pointer-to-int-cast -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast | FileCheck --check-prefixes=CHECK-NOOPT,SPIR64-NOOPT %s +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,AMDGCN +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -O0 -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck --check-prefixes=CHECK-NOOPT,AMDGCN-NOOPT %s +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s --check-prefix=AMDGCN +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -fcommon -emit-llvm -o - | FileCheck %s --check-prefix=AMDGCN-COMMON + +typedef struct { + private char *p1; + local char *p2; + constant char *p3; + global char *p4; + generic char *p5; +} StructTy1; + +typedef struct { + constant char *p3; + global char *p4; + generic char *p5; +} StructTy2; + +// Test 0 as initializer. 
+ +// SPIR64: @private_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8 +// AMDGCN: @private_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4 +private char *private_p = 0; + +// SPIR64: @local_p = local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8 +// AMDGCN: @local_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4 +local char *local_p = 0; + +// SPIR64: @global_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), align 8 +// AMDGCN: @global_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8 +global char *global_p = 0; + +// SPIR64: @constant_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(2) null, align 8 +// AMDGCN: @constant_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8 +constant char *constant_p = 0; + +// SPIR64: @generic_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8 +// AMDGCN: @generic_p ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8 +generic char *generic_p = 0; + +// Test NULL as initializer. 
+ +// SPIR64: @private_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8 +// AMDGCN: @private_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4 +private char *private_p_NULL = NULL; + +// SPIR64: @local_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8 +// AMDGCN: @local_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4 +local char *local_p_NULL = NULL; + +// SPIR64: @global_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), align 8 +// AMDGCN: @global_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8 +global char *global_p_NULL = NULL; + +// SPIR64: @constant_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(2) null, align 8 +// AMDGCN: @constant_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8 +constant char *constant_p_NULL = NULL; + +// SPIR64: @generic_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8 +// AMDGCN: @generic_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8 +generic char *generic_p_NULL = NULL; + +// Test constant folding of null pointer. +// A null pointer should be folded to a null pointer in the target address space. 
+ +// SPIR64: @fold_generic ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8 +// AMDGCN: @fold_generic ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8 +generic int *fold_generic = (global int*)(generic float*)(private char*)0; + +// SPIR64: @fold_priv ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8 +// AMDGCN: @fold_priv ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr addrspace(1) null to ptr addrspace(5)), align 4 +private short *fold_priv = (private short*)(generic int*)(global void*)0; + +// SPIR64: @fold_priv_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr inttoptr (i64 10 to ptr), align 8 +// AMDGCN: @fold_priv_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) inttoptr (i32 9 to ptr addrspace(5)), align 4 +private char *fold_priv_arith = (private char*)0 + 10; + +// SPIR64: @fold_local_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) inttoptr (i64 10 to ptr addrspace(3)), align 8 +// AMDGCN: @fold_local_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) inttoptr (i32 9 to ptr addrspace(3)), align 4 +local char *fold_local_arith = (local char*)0 + 10; + +// SPIR64: @fold_int ={{.*}} local_unnamed_addr addrspace(1) global i32 14, align 4 +// AMDGCN: @fold_int ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4 +int fold_int = (int)(private void*)(generic char*)(global int*)0 + 14; + +// SPIR64: @fold_int2 ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4 +// AMDGCN: @fold_int2 ={{.*}} local_unnamed_addr addrspace(1) global i32 12, align 4 +int fold_int2 = (int) ((private void*)0 + 13); + +// SPIR64: @fold_int3 ={{.*}} local_unnamed_addr addrspace(1) global i32 0, align 4 +// AMDGCN: @fold_int3 ={{.*}} local_unnamed_addr addrspace(1) global i32 -1, align 4 +int fold_int3 = (int) ((private int*)0); + +// SPIR64: @fold_int4 ={{.*}} 
local_unnamed_addr addrspace(1) global i32 8, align 4 +// AMDGCN: @fold_int4 ={{.*}} local_unnamed_addr addrspace(1) global i32 7, align 4 +int fold_int4 = (int) &((private int*)0)[2]; + +// SPIR64: @fold_int5 ={{.*}} local_unnamed_addr addrspace(1) global i32 8, align 4 +// AMDGCN: @fold_int5 ={{.*}} local_unnamed_addr addrspace(1) global i32 3, align 4 +int fold_int5 = (int) &((private StructTy1*)0)->p2; + +// SPIR64: @fold_int_local ={{.*}} local_unnamed_addr addrspace(1) global i32 14, align 4 +// AMDGCN: @fold_int_local = local_unnamed_addr addrspace(1) global i32 13, align 4 +int fold_int_local = (int)(local void*)(generic char*)(global int*)0 + 14; + +// SPIR64: @fold_int2_local ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4 +// AMDGCN: @fold_int2_local ={{.*}} local_unnamed_addr addrspace(1) global i32 12, align 4 +int fold_int2_local = (int) ((local void*)0 + 13); + +// SPIR64: @fold_int3_local ={{.*}} local_unnamed_addr addrspace(1) global i32 0, align 4 +// AMDGCN: @fold_int3_local ={{.*}} local_unnamed_addr addrspace(1) global i32 -1, align 4 +int fold_int3_local = (int) ((local int*)0); + +// SPIR64: @fold_int4_local ={{.*}} local_unnamed_addr addrspace(1) global i32 8, align 4 +// AMDGCN: @fold_int4_local ={{.*}} local_unnamed_addr addrspace(1) global i32 7, align 4 +int fold_int4_local = (int) &((local int*)0)[2]; + +// SPIR64: @fold_int5_local ={{.*}} local_unnamed_addr addrspace(1) global i32 8, align 4 +// AMDGCN: @fold_int5_local ={{.*}} local_unnamed_addr addrspace(1) global i32 3, align 4 +int fold_int5_local = (int) &((local StructTy1*)0)->p2; + + +// Test static variable initialization. 
+ +// SPIR64-NOOPT: @test_static_var_private.sp1 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8 +// SPIR64-NOOPT: @test_static_var_private.sp2 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8 +// SPIR64-NOOPT: @test_static_var_private.sp3 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8 +// SPIR64-NOOPT: @test_static_var_private.sp4 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8 +// SPIR64-NOOPT: @test_static_var_private.sp5 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8 +// SPIR64-NOOPT: @test_static_var_private.SS1 = internal addrspace(1) global %struct.StructTy1 zeroinitializer, align 8 +// AMDGCN-NOOPT: @test_static_var_private.sp1 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4 +// AMDGCN-NOOPT: @test_static_var_private.sp2 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4 +// AMDGCN-NOOPT: @test_static_var_private.sp3 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4 +// AMDGCN-NOOPT: @test_static_var_private.sp4 = internal addrspace(1) global ptr addrspace(5) null, align 4 +// AMDGCN-NOOPT: @test_static_var_private.sp5 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4 +// AMDGCN-NOOPT: @test_static_var_private.SS1 = internal addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8 +// CHECK-NOOPT: @test_static_var_private.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8 + +void test_static_var_private(void) { + static private char *sp1 = 0; + static 
private char *sp2 = NULL; + static private char *sp3; + static private char *sp4 = (private char*)((void)0, 0); + const int x = 0; + static private char *sp5 = (private char*)x; + static StructTy1 SS1; + static StructTy2 SS2; +} + +// SPIR64-NOOPT: @test_static_var_local.sp1 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8 +// SPIR64-NOOPT: @test_static_var_local.sp2 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8 +// SPIR64-NOOPT: @test_static_var_local.sp3 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8 +// SPIR64-NOOPT: @test_static_var_local.sp4 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8 +// SPIR64-NOOPT: @test_static_var_local.sp5 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8 +// SPIR64-NOOPT: @test_static_var_local.SS1 = internal addrspace(1) global %struct.StructTy1 zeroinitializer, align 8 +// AMDGCN-NOOPT: @test_static_var_local.sp1 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4 +// AMDGCN-NOOPT: @test_static_var_local.sp2 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4 +// AMDGCN-NOOPT: @test_static_var_local.sp3 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4 +// AMDGCN-NOOPT: @test_static_var_local.sp4 = internal addrspace(1) global ptr addrspace(3) null, align 4 +// AMDGCN-NOOPT: @test_static_var_local.sp5 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4 +// AMDGCN-NOOPT: @test_static_var_local.SS1 = internal addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr 
addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8 +// CHECK-NOOPT: @test_static_var_local.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8 +void test_static_var_local(void) { + static local char *sp1 = 0; + static local char *sp2 = NULL; + static local char *sp3; + static local char *sp4 = (local char*)((void)0, 0); + const int x = 0; + static local char *sp5 = (local char*)x; + static StructTy1 SS1; + static StructTy2 SS2; +} + +// Test function-scope variable initialization. +// CHECK-NOOPT-LABEL: @test_func_scope_var_private( +// SPIR64-NOOPT: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr %sp1{{.*}}, align 8 +// SPIR64-NOOPT: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr %sp2{{.*}}, align 8 +// SPIR64-NOOPT: store ptr null, ptr %sp3{{.*}}, align 8 +// SPIR64-NOOPT: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr %sp4{{.*}}, align 8 +// SPIR64-NOOPT: call void @llvm.memset.p0.i64(ptr align 8 %SS1{{.*}}, i8 0, i64 40, i1 false) +// SPIR64-NOOPT: call void @llvm.memcpy.p0.p2.i64(ptr align 8 %SS2{{.*}}, ptr addrspace(2) align 8 @__const.test_func_scope_var_private.SS2, i64 24, i1 false) +// AMDGCN-NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp1{{.*}}, align 4 +// AMDGCN-NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp2{{.*}}, align 4 +// AMDGCN-NOOPT: store ptr addrspace(5) null, ptr addrspace(5) %sp3{{.*}}, align 4 +// AMDGCN-NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp4{{.*}}, align 4 +// AMDGCN-NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_private.SS1, i64 32, i1 false) +// AMDGCN-NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2{{.*}}, i8 0, i64 24, i1 
false) +void test_func_scope_var_private(void) { + private char *sp1 = 0; + private char *sp2 = NULL; + private char *sp3 = (private char*)((void)0, 0); + const int x = 0; + private char *sp4 = (private char*)x; + StructTy1 SS1 = {0, 0, 0, 0, 0}; + StructTy2 SS2 = {0, 0, 0}; +} + +// Test function-scope variable initialization. +// CHECK-NOOPT-LABEL: @test_func_scope_var_local( +// SPIR64-NOOPT: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr %sp1{{.*}}, align 8 +// SPIR64-NOOPT: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr %sp2{{.*}}, align 8 +// SPIR64-NOOPT: store ptr addrspace(3) null, ptr %sp3{{.*}}, align 8 +// SPIR64-NOOPT: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr %sp4{{.*}}, align 8 +// SPIR64-NOOPT: call void @llvm.memset.p0.i64(ptr align 8 %SS1{{.*}}, i8 0, i64 40, i1 false) +// SPIR64-NOOPT: call void @llvm.memcpy.p0.p2.i64(ptr align 8 %SS2{{.*}}, ptr addrspace(2) align 8 @__const.test_func_scope_var_local.SS2, i64 24, i1 false) +// AMDGCN-NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp1{{.*}}, align 4 +// AMDGCN-NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp2{{.*}}, align 4 +// AMDGCN-NOOPT: store ptr addrspace(3) null, ptr addrspace(5) %sp3{{.*}}, align 4 +// AMDGCN-NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp4{{.*}}, align 4 +// AMDGCN-NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_local.SS1, i64 32, i1 false) +// AMDGCN-NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2{{.*}}, i8 0, i64 24, i1 false) +void test_func_scope_var_local(void) { + local char *sp1 = 0; + local char *sp2 = NULL; + local char *sp3 = (local char*)((void)0, 0); + const int x = 0; + local char *sp4 = (local 
char*)x; + StructTy1 SS1 = {0, 0, 0, 0, 0}; + StructTy2 SS2 = {0, 0, 0}; +} + + +// Test default initialization of pointers. + +// Tentative definition of global variables with non-zero initializer +// cannot have common linkage since common linkage requires zero initialization +// and does not have explicit section. + +// SPIR64: @p1 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8 +// AMDGCN: @p1 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4 +// AMDGCN-COMMON: @p1 = weak local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4 +private char *p1; + +// SPIR64: @p2 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8 +// AMDGCN: @p2 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4 +// AMDGCN-COMMON: @p2 = weak local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4 +local char *p2; + +// SPIR64: @p3 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(2) null, align 8 +// AMDGCN: @p3 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8 +// AMDGCN-COMMON: @p3 = common local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8 +constant char *p3; + +// SPIR64: @p4 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), align 8 +// AMDGCN: @p4 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8 +// AMDGCN-COMMON: @p4 = common local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8 +global char *p4; + +// SPIR64: @p5 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8 +// AMDGCN: @p5 ={{.*}} local_unnamed_addr 
addrspace(1) global ptr null, align 8 +// AMDGCN-COMMON: @p5 = common local_unnamed_addr addrspace(1) global ptr null, align 8 +generic char *p5; + +// Test default initialization of structure. + +// SPIR64: @S1 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy1 zeroinitializer, align 8 +// AMDGCN: @S1 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8 +StructTy1 S1; + +// CHECK: @S2 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy2 zeroinitializer, align 8 +StructTy2 S2; + +// Test default initialization of array. +// SPIR64: @A1 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy1] zeroinitializer, align 8 +// AMDGCN: @A1 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy1] [%struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }], align 8 +StructTy1 A1[2]; + +// CHECK: @A2 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy2] zeroinitializer, align 8 +StructTy2 A2[2]; + +// Test comparison with 0. 
+ +// CHECK-LABEL: cmp_private +// SPIR64: icmp eq ptr %p, addrspacecast (ptr addrspace(4) null to ptr) +// AMDGCN: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5)) +void cmp_private(private char* p) { + if (p != 0) + *p = 0; +} + +// CHECK-LABEL: cmp_local +// SPIR64: icmp eq ptr addrspace(3) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) +// AMDGCN: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3)) +void cmp_local(local char* p) { + if (p != 0) + *p = 0; +} + +// CHECK-LABEL: cmp_global +// SPIR64: icmp eq ptr addrspace(1) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) +// AMDGCN: icmp eq ptr addrspace(1) %p, null +void cmp_global(global char* p) { + if (p != 0) + *p = 0; +} + +// CHECK-LABEL: cmp_constant +// SPIR64: icmp eq ptr addrspace(2) %p, null +// AMDGCN: icmp eq ptr addrspace(4) %p, null +char cmp_constant(constant char* p) { + if (p != 0) + return *p; + else + return 0; +} + +// CHECK-LABEL: cmp_generic +// SPIR64: icmp eq ptr addrspace(4) %p, null +// AMDGCN: icmp eq ptr %p, null +void cmp_generic(generic char* p) { + if (p != 0) + *p = 0; +} + +// Test comparison with NULL. 
+ +// CHECK-LABEL: cmp_NULL_private +// SPIR64: icmp eq ptr %p, addrspacecast (ptr addrspace(4) null to ptr) +// AMDGCN: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5)) +void cmp_NULL_private(private char* p) { + if (p != NULL) + *p = 0; +} + +// CHECK-LABEL: cmp_NULL_local +// SPIR64: icmp eq ptr addrspace(3) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) +// AMDGCN: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3)) +void cmp_NULL_local(local char* p) { + if (p != NULL) + *p = 0; +} + +// CHECK-LABEL: cmp_NULL_global +// SPIR64: icmp eq ptr addrspace(1) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) +// AMDGCN: icmp eq ptr addrspace(1) %p, null +void cmp_NULL_global(global char* p) { + if (p != NULL) + *p = 0; +} + +// CHECK-LABEL: cmp_NULL_constant +// SPIR64: icmp eq ptr addrspace(2) %p, null +// AMDGCN: icmp eq ptr addrspace(4) %p, null +char cmp_NULL_constant(constant char* p) { + if (p != NULL) + return *p; + else + return 0; +} + +// CHECK-LABEL: cmp_NULL_generic +// SPIR64: icmp eq ptr addrspace(4) %p, null +// AMDGCN: icmp eq ptr %p, null +void cmp_NULL_generic(generic char* p) { + if (p != NULL) + *p = 0; +} + +// Test storage 0 as null pointer. 
+// CHECK-LABEL: test_storage_null_pointer +// SPIR64: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr addrspace(4) %arg_private +// SPIR64: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr addrspace(4) %arg_local +// SPIR64: store ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr addrspace(4) %arg_global +// SPIR64: store ptr addrspace(2) null, ptr addrspace(4) %arg_constant +// SPIR64: store ptr addrspace(4) null, ptr addrspace(4) %arg_generic +// AMDGCN: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %arg_private +// AMDGCN: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %arg_local +// AMDGCN: store ptr addrspace(1) null, ptr %arg_global +// AMDGCN: store ptr addrspace(4) null, ptr %arg_constant +// AMDGCN: store ptr null, ptr %arg_generic +void test_storage_null_pointer(private char** arg_private, + local char** arg_local, + global char** arg_global, + constant char** arg_constant, + generic char** arg_generic) { + *arg_private = 0; + *arg_local = 0; + *arg_global = 0; + *arg_constant = 0; + *arg_generic = 0; +} + +// Test storage NULL as null pointer. 
+// CHECK-LABEL: test_storage_null_pointer_NULL +// SPIR64: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr addrspace(4) %arg_private +// SPIR64: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr addrspace(4) %arg_local +// SPIR64: store ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr addrspace(4) %arg_global +// SPIR64: store ptr addrspace(2) null, ptr addrspace(4) %arg_constant +// SPIR64: store ptr addrspace(4) null, ptr addrspace(4) %arg_generic +// AMDGCN: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %arg_private +// AMDGCN: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %arg_local +// AMDGCN: store ptr addrspace(1) null, ptr %arg_global +// AMDGCN: store ptr addrspace(4) null, ptr %arg_constant +// AMDGCN: store ptr null, ptr %arg_generic +void test_storage_null_pointer_NULL(private char** arg_private, + local char** arg_local, + global char** arg_global, + constant char** arg_constant, + generic char** arg_generic) { + *arg_private = NULL; + *arg_local = NULL; + *arg_global = NULL; + *arg_constant = NULL; + *arg_generic = NULL; +} + +// Test pass null pointer to function as argument. 
+void test_pass_null_pointer_arg_calee(private char* arg_private, + local char* arg_local, + global char* arg_global, + constant char* arg_constant, + generic char* arg_generic); + +// CHECK-LABEL: test_pass_null_pointer_arg +// SPIR64: call spir_func void @test_pass_null_pointer_arg_calee(ptr addrspacecast (ptr addrspace(4) null to ptr), ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr addrspace(2) null, ptr addrspace(4) null) +// SPIR64: call spir_func void @test_pass_null_pointer_arg_calee(ptr addrspacecast (ptr addrspace(4) null to ptr), ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr addrspace(2) null, ptr addrspace(4) null) +// AMDGCN: call void @test_pass_null_pointer_arg_calee(ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(1) null, ptr addrspace(4) null, ptr null) +// AMDGCN: call void @test_pass_null_pointer_arg_calee(ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(1) null, ptr addrspace(4) null, ptr null) +void test_pass_null_pointer_arg(void) { + test_pass_null_pointer_arg_calee(0, 0, 0, 0, 0); + test_pass_null_pointer_arg_calee(NULL, NULL, NULL, NULL, NULL); +} + +// Test cast null pointer to size_t. 
+void test_cast_null_pointer_to_sizet_calee(size_t arg_private, + size_t arg_local, + size_t arg_global, + size_t arg_constant, + size_t arg_generic); + +// CHECK-LABEL: test_cast_null_pointer_to_sizet +// SPIR64: call spir_func void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) to i64), i64 ptrtoint (ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) to i64), i64 0, i64 0) +// SPIR64: call spir_func void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) to i64), i64 ptrtoint (ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) to i64), i64 0, i64 0) +// AMDGCN: call void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i64), i64 0, i64 0, i64 0) +// AMDGCN: call void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i64), i64 0, i64 0, i64 0) +void test_cast_null_pointer_to_sizet(void) { + test_cast_null_pointer_to_sizet_calee((size_t)((private char*)0), + (size_t)((local char*)0), + (size_t)((global char*)0), + (size_t)((constant char*)0), + (size_t)((generic char*)0)); + test_cast_null_pointer_to_sizet_calee((size_t)((private char*)NULL), + (size_t)((local char*)NULL), + (size_t)((global char*)NULL), + (size_t)((constant char*)0), // NULL cannot be casted to constant pointer since it is defined as a generic pointer + (size_t)((generic char*)NULL)); +} + +// Test comparison between null pointers. 
+#define TEST_EQ00(addr1, addr2) int test_eq00_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)0; } +#define TEST_EQ0N(addr1, addr2) int test_eq0N_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)NULL; } +#define TEST_EQN0(addr1, addr2) int test_eqN0_##addr1##_##addr2(void) { return (addr1 char*)NULL == (addr2 char*)0; } +#define TEST_EQNN(addr1, addr2) int test_eqNN_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)NULL; } +#define TEST_NE00(addr1, addr2) int test_ne00_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)0; } +#define TEST_NE0N(addr1, addr2) int test_ne0N_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)NULL; } +#define TEST_NEN0(addr1, addr2) int test_neN0_##addr1##_##addr2(void) { return (addr1 char*)NULL != (addr2 char*)0; } +#define TEST_NENN(addr1, addr2) int test_neNN_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)NULL; } +#define TEST(addr1, addr2) \ + TEST_EQ00(addr1, addr2) \ + TEST_EQ0N(addr1, addr2) \ + TEST_EQN0(addr1, addr2) \ + TEST_EQNN(addr1, addr2) \ + TEST_NE00(addr1, addr2) \ + TEST_NE0N(addr1, addr2) \ + TEST_NEN0(addr1, addr2) \ + TEST_NENN(addr1, addr2) + +// CHECK-LABEL: test_eq00_generic_private +// CHECK: ret i32 1 +// CHECK-LABEL: test_eq0N_generic_private +// CHECK: ret i32 1 +// CHECK-LABEL: test_eqN0_generic_private +// CHECK: ret i32 1 +// CHECK-LABEL: test_eqNN_generic_private +// CHECK: ret i32 1 +// CHECK-LABEL: test_ne00_generic_private +// CHECK: ret i32 0 +// CHECK-LABEL: test_ne0N_generic_private +// CHECK: ret i32 0 +// CHECK-LABEL: test_neN0_generic_private +// CHECK: ret i32 0 +// CHECK-LABEL: test_neNN_generic_private +// CHECK: ret i32 0 +TEST(generic, private) + +// CHECK-LABEL: test_eq00_generic_local +// CHECK: ret i32 1 +// CHECK-LABEL: test_eq0N_generic_local +// CHECK: ret i32 1 +// CHECK-LABEL: test_eqN0_generic_local +// CHECK: ret i32 1 +// CHECK-LABEL: test_eqNN_generic_local +// CHECK: ret i32 1 +// 
CHECK-LABEL: test_ne00_generic_local +// CHECK: ret i32 0 +// CHECK-LABEL: test_ne0N_generic_local +// CHECK: ret i32 0 +// CHECK-LABEL: test_neN0_generic_local +// CHECK: ret i32 0 +// CHECK-LABEL: test_neNN_generic_local +// CHECK: ret i32 0 +TEST(generic, local) + +// CHECK-LABEL: test_eq00_generic_global +// CHECK: ret i32 1 +// CHECK-LABEL: test_eq0N_generic_global +// CHECK: ret i32 1 +// CHECK-LABEL: test_eqN0_generic_global +// CHECK: ret i32 1 +// CHECK-LABEL: test_eqNN_generic_global +// CHECK: ret i32 1 +// CHECK-LABEL: test_ne00_generic_global +// CHECK: ret i32 0 +// CHECK-LABEL: test_ne0N_generic_global +// CHECK: ret i32 0 +// CHECK-LABEL: test_neN0_generic_global +// CHECK: ret i32 0 +// CHECK-LABEL: test_neNN_generic_global +// CHECK: ret i32 0 +TEST(generic, global) + +// CHECK-LABEL: test_eq00_generic_generic +// CHECK: ret i32 1 +// CHECK-LABEL: test_eq0N_generic_generic +// CHECK: ret i32 1 +// CHECK-LABEL: test_eqN0_generic_generic +// CHECK: ret i32 1 +// CHECK-LABEL: test_eqNN_generic_generic +// CHECK: ret i32 1 +// CHECK-LABEL: test_ne00_generic_generic +// CHECK: ret i32 0 +// CHECK-LABEL: test_ne0N_generic_generic +// CHECK: ret i32 0 +// CHECK-LABEL: test_neN0_generic_generic +// CHECK: ret i32 0 +// CHECK-LABEL: test_neNN_generic_generic +// CHECK: ret i32 0 +TEST(generic, generic) + +// CHECK-LABEL: test_eq00_constant_constant +// CHECK: ret i32 1 +TEST_EQ00(constant, constant) + +// Test cast to bool. 
+ +// CHECK-LABEL: cast_bool_private +// SPIR64: icmp eq ptr %p, addrspacecast (ptr addrspace(4) null to ptr) +// AMDGCN: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5)) +void cast_bool_private(private char* p) { + if (p) + *p = 0; +} + +// CHECK-LABEL: cast_bool_local +// SPIR64: icmp eq ptr addrspace(3) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) +// AMDGCN: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3)) +void cast_bool_local(local char* p) { + if (p) + *p = 0; +} + +// CHECK-LABEL: cast_bool_global +// SPIR64: icmp eq ptr addrspace(1) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) +// AMDGCN: icmp eq ptr addrspace(1) %p, null +void cast_bool_global(global char* p) { + if (p) + *p = 0; +} + +// CHECK-LABEL: cast_bool_constant +// SPIR64: icmp eq ptr addrspace(2) %p, null +// AMDGCN: icmp eq ptr addrspace(4) %p, null +char cast_bool_constant(constant char* p) { + if (p) + return *p; + else + return 0; +} + +// CHECK-LABEL: cast_bool_generic +// SPIR64: icmp eq ptr addrspace(4) %p, null +// AMDGCN: icmp eq ptr %p, null +void cast_bool_generic(generic char* p) { + if (p) + *p = 0; +} + +// Test initialize a struct using memset. +// For large structures which is mostly zero, clang generats llvm.memset for +// the zero part and store for non-zero members. 
+typedef struct { + long a, b, c, d; + private char *p; +} StructTy3; + +// CHECK-LABEL: test_memset_private +// SPIR64: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %ptr, i8 0, i64 32, i1 false) +// SPIR64: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr %ptr, i64 32 +// SPIR64: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr [[GEP]], align 8 +// AMDGCN: call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 8 {{.*}}, i8 0, i64 32, i1 false) +// AMDGCN: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) %ptr, i32 32 +// AMDGCN: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) [[GEP]] +// AMDGCN: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) {{.*}}, i32 36 +// AMDGCN: store i32 0, ptr addrspace(5) [[GEP1]], align 4 +void test_memset_private(private StructTy3 *ptr) { + StructTy3 S3 = {0, 0, 0, 0, 0}; + *ptr = S3; +} + +// Test casting literal 0 to pointer. +// A 0 literal casted to pointer should become a null pointer. + +// CHECK-LABEL: test_cast_0_to_local_ptr +// SPIR64: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) +// AMDGCN: ret ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) +local int* test_cast_0_to_local_ptr(void) { + return (local int*)0; +} + +// CHECK-LABEL: test_cast_0_to_private_ptr +// SPIR64: ptr addrspacecast (ptr addrspace(4) null to ptr) +// AMDGCN: ret ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) +private int* test_cast_0_to_private_ptr(void) { + return (private int*)0; +} + +// Test casting non-literal integer with 0 value to pointer. +// A non-literal integer expression with 0 value is casted to a pointer with +// zero value. 
+ +// CHECK-LABEL: test_cast_int_to_ptr1_private +// SPIR64: ret ptr null +// AMDGCN: ret ptr addrspace(5) null +private int* test_cast_int_to_ptr1_private(void) { + return (private int*)((void)0, 0); +} + +// CHECK-LABEL: test_cast_int_to_ptr1_local +// CHECK: ret ptr addrspace(3) null +local int* test_cast_int_to_ptr1_local(void) { + return (local int*)((void)0, 0); +} + +// CHECK-LABEL: test_cast_int_to_ptr2 +// SPIR64: ret ptr null +// AMDGCN: ret ptr addrspace(5) null +private int* test_cast_int_to_ptr2(void) { + int x = 0; + return (private int*)x; +} + +// Test logical operations. +// CHECK-LABEL: test_not_nullptr +// CHECK: ret i32 1 +int test_not_nullptr(void) { + return !(private char*)NULL; +} + +// CHECK-LABEL: test_and_nullptr +// CHECK: ret i32 0 +int test_and_nullptr(int a) { + return a && ((private char*)NULL); +} + +// CHECK-LABEL: test_not_private_ptr +// SPIR64: %[[lnot:.*]] = icmp eq ptr %p, addrspacecast (ptr addrspace(4) null to ptr) +// AMDGCN: %[[lnot:.*]] = icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5)) +// CHECK: %[[lnot_ext:.*]] = zext i1 %[[lnot]] to i32 +// CHECK: ret i32 %[[lnot_ext]] +int test_not_private_ptr(private char* p) { + return !p; +} + +// CHECK-LABEL: test_not_local_ptr +// SPIR64: %[[lnot:.*]] = icmp eq ptr addrspace(3) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) +// AMDGCN: %[[lnot:.*]] = icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3)) +// CHECK: %[[lnot_ext:.*]] = zext i1 %[[lnot]] to i32 +// CHECK: ret i32 %[[lnot_ext]] +int test_not_local_ptr(local char* p) { + return !p; +} + + +// CHECK-LABEL: test_and_ptr +// SPIR64: %[[tobool:.*]] = icmp ne ptr %p1, addrspacecast (ptr addrspace(4) null to ptr) +// SPIR64: %[[tobool1:.*]] = icmp ne ptr addrspace(3) %p2, addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) +// AMDGCN: %[[tobool:.*]] = icmp ne ptr addrspace(5) %p1, addrspacecast (ptr null to ptr addrspace(5)) +// AMDGCN: %[[tobool1:.*]] = icmp 
ne ptr addrspace(3) %p2, addrspacecast (ptr null to ptr addrspace(3)) +// CHECK: %[[res:.*]] = select i1 %[[tobool]], i1 %[[tobool1]], i1 false +// CHECK: %[[land_ext:.*]] = zext i1 %[[res]] to i32 +// CHECK: ret i32 %[[land_ext]] +int test_and_ptr(private char* p1, local char* p2) { + return p1 && p2; +} + +// Test folding of null pointer in function scope. +// CHECK-NOOPT-LABEL: test_fold_private +// SPIR64-NOOPT: call{{.*}} void @test_fold_callee +// SPIR64-NOOPT: store ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr %glob{{.*}}, align 8 +// SPIR64-NOOPT: %{{.*}} = sub i64 %{{.*}}, ptrtoint (ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) to i64) +// AMDGCN-NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob{{.*}}, align 8 +// AMDGCN-NOOPT: %{{.*}} = sub i64 %{{.*}}, 0 +// SPIR64-NOOPT: call{{.*}} void @test_fold_callee +// SPIR64-NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i32) to i64 +// AMDGCN-NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i32) to i64 +// CHECK-NOOPT: %{{.*}} = add nsw i64 %{{.*}}, %[[SEXT]] +// CHECK-NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1 +void test_fold_callee(void); +void test_fold_private(void) { + global int* glob = (test_fold_callee(), (global int*)(generic char*)0); + long x = glob - (global int*)(generic char*)0; + x = x + (int)(test_fold_callee(), (private int*)(generic char*)(global short*)0); + x = x - (int)((private int*)0 == (private int*)(generic char*)0); +} + +// CHECK-NOOPT-LABEL: test_fold_local +// CHECK-NOOPT: call{{.*}} void @test_fold_callee +// SPIR64-NOOPT: store ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr %glob{{.*}}, align 8 +// SPIR64-NOOPT: %{{.*}} = sub i64 %{{.*}}, ptrtoint (ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) to i64) +// AMDGCN-NOOPT: store ptr addrspace(1) null, ptr 
addrspace(5) %glob{{.*}}, align 8 +// AMDGCN-NOOPT: %{{.*}} = sub i64 %{{.*}}, 0 +// CHECK-NOOPT: call{{.*}} void @test_fold_callee +// SPIR64-NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) to i32) to i64 +// AMDGCN-NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i32) to i64 +// CHECK-NOOPT: %{{.*}} = add nsw i64 %{{.*}}, %[[SEXT]] +// CHECK-NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1 +void test_fold_local(void) { + global int* glob = (test_fold_callee(), (global int*)(generic char*)0); + long x = glob - (global int*)(generic char*)0; + x = x + (int)(test_fold_callee(), (local int*)(generic char*)(global short*)0); + x = x - (int)((local int*)0 == (local int*)(generic char*)0); +} From ec662bc247a5707ed97b0a26085e876fccca0392 Mon Sep 17 00:00:00 2001 From: Dan Blackwell Date: Fri, 31 Oct 2025 08:15:23 +0000 Subject: [PATCH 300/539] [Fuzzer][Test-Only][Darwin] Mark coverage.test and exit_on_src_pos.test unsupported (#165408) These tests are currently failing on some CI macOS instances due to an issue with the system symbolizer. This patch marks the tests unsupported on Darwin while we wait for all CI machines to be updated to a newer OS. rdar://160410051 --- compiler-rt/test/fuzzer/coverage.test | 2 ++ compiler-rt/test/fuzzer/exit_on_src_pos.test | 1 + 2 files changed, 3 insertions(+) diff --git a/compiler-rt/test/fuzzer/coverage.test b/compiler-rt/test/fuzzer/coverage.test index cf36784ce21da..a4af2648d61e1 100644 --- a/compiler-rt/test/fuzzer/coverage.test +++ b/compiler-rt/test/fuzzer/coverage.test @@ -2,6 +2,8 @@ UNSUPPORTED: target={{.*windows.*}} # FIXME: CreatePCArray() emits PLT stub addresses for entry blocks, which are ignored by TracePC::PrintCoverage(). 
UNSUPPORTED: target=s390x{{.*}} +UNSUPPORTED: darwin + RUN: mkdir -p %t.dir && cd %t.dir RUN: %cpp_compiler -mllvm -use-unknown-locations=Disable %S/NullDerefTest.cpp -o %t.dir/NullDerefTest RUN: %cpp_compiler -mllvm -use-unknown-locations=Disable %S/DSO1.cpp -fPIC %ld_flags_rpath_so1 -O0 -shared -o %dynamiclib1 diff --git a/compiler-rt/test/fuzzer/exit_on_src_pos.test b/compiler-rt/test/fuzzer/exit_on_src_pos.test index 020424e2d9fdd..ba4fb01780ce2 100644 --- a/compiler-rt/test/fuzzer/exit_on_src_pos.test +++ b/compiler-rt/test/fuzzer/exit_on_src_pos.test @@ -8,6 +8,7 @@ UNSUPPORTED: target=thumb{{.*}} # Timeout on loongarch64 machine UNSUPPORTED: target=loongarch64{{.*}} +UNSUPPORTED: darwin RUN: %cpp_compiler -O0 %S/SimpleTest.cpp -o %t-SimpleTest.exe -mllvm -use-unknown-locations=Disable RUN: %cpp_compiler -O0 %S/ShrinkControlFlowTest.cpp -o %t-ShrinkControlFlowTest.exe From 1b399fdd131de80efecf09583f6661364d3be679 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Fri, 31 Oct 2025 04:24:37 -0400 Subject: [PATCH 301/539] [OpenMP] Remove OS checks for ARM and AArch64 (#165640) --- openmp/runtime/src/z_Linux_asm.S | 14 ++++++-------- openmp/runtime/src/z_Linux_util.cpp | 3 +-- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S index 89359759fcb42..684c7e2816442 100644 --- a/openmp/runtime/src/z_Linux_asm.S +++ b/openmp/runtime/src/z_Linux_asm.S @@ -121,8 +121,7 @@ KMP_PREFIX_UNDERSCORE(\proc): # endif // KMP_OS_DARWIN #endif // KMP_ARCH_X86 || KMP_ARCH_x86_64 -#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS || KMP_OS_OPENBSD) && \ - (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 || KMP_ARCH_ARM) +#if KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 || KMP_ARCH_ARM # if KMP_OS_DARWIN # define KMP_PREFIX_UNDERSCORE(x) _##x // extra underscore for OS X* symbols @@ -237,8 +236,7 @@ KMP_PREFIX_UNDERSCORE(\proc): # define PACBTI_RET # define GNU_PROPERTY_BTI_PAC # endif -#endif // (KMP_OS_LINUX || 
KMP_OS_DARWIN || KMP_OS_WINDOWS || KMP_OS_OPENBSD) && \ - (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 || KMP_ARCH_ARM) +#endif // KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 || KMP_ARCH_ARM .macro COMMON name, size, align_power #if KMP_OS_DARWIN @@ -1302,7 +1300,7 @@ KMP_LABEL(kmp_no_args): #endif /* KMP_ARCH_X86_64 */ // ' -#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32) +#if KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 //------------------------------------------------------------------------ // int @@ -1428,9 +1426,9 @@ KMP_LABEL(kmp_1): DEBUG_INFO __kmp_invoke_microtask // -- End __kmp_invoke_microtask -#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32) */ +#endif /* KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 */ -#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM +#if KMP_ARCH_ARM //------------------------------------------------------------------------ // int @@ -1573,7 +1571,7 @@ KMP_LABEL(kmp_1): DEBUG_INFO __kmp_invoke_microtask // -- End __kmp_invoke_microtask -#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM */ +#endif /* KMP_ARCH_ARM */ #if KMP_ARCH_PPC64 diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp index 368c0b6e872cc..c7fe0642cea63 100644 --- a/openmp/runtime/src/z_Linux_util.cpp +++ b/openmp/runtime/src/z_Linux_util.cpp @@ -2736,8 +2736,7 @@ int __kmp_get_load_balance(int max) { #endif // USE_LOAD_BALANCE -#if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || \ - ((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) || \ +#if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || KMP_ARCH_AARCH64 || \ KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \ KMP_ARCH_ARM || KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_PPC_XCOFF || \ KMP_ARCH_AARCH64_32) From caf1fea1f25f74c83ae2a66049483bda657cc244 Mon Sep 17 00:00:00 2001 From: Haocong Lu Date: Fri, 31 Oct 2025 16:43:25 +0800 
Subject: [PATCH 302/539] [mlir][vector] Fix missed `return` in ExtractStridedSliceOp::fold (#165669) Fix missed `return` when folding splat ConstantOp, it could work well probably because of good compatibility of `foldExtractStridedSliceNonSplatConstant`. --- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index ad8255a95cb4e..ae3423c40040d 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -4336,7 +4336,7 @@ OpFoldResult ExtractStridedSliceOp::fold(FoldAdaptor adaptor) { // ExtractStridedSliceOp(splat ConstantOp) -> ConstantOp. if (auto splat = llvm::dyn_cast_if_present(adaptor.getSource())) - DenseElementsAttr::get(getType(), splat.getSplatValue()); + return DenseElementsAttr::get(getType(), splat.getSplatValue()); // ExtractStridedSliceOp(non-splat ConstantOp) -> ConstantOp. return foldExtractStridedSliceNonSplatConstant(*this, adaptor.getSource()); From e95e64f160777bf21891dd1ce87e74ab60d2d9a8 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Fri, 31 Oct 2025 09:03:23 +0000 Subject: [PATCH 303/539] =?UTF-8?q?=F0=9F=8D=92=20[lldb]=20Fix=20TestRealD?= =?UTF-8?q?efinition=20on=20older=20DWARF=20versions=20(#165729)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cherry-picks this fix from the Apple LLDB fork. 
Ever since we upstreamed https://github.com/llvm/llvm-project/pull/164011, this test is failing on our pre-DWARFv5 bots: ``` 13:47:54 ====================================================================== 13:47:54 FAIL: test_frame_var_after_stop_at_implementation_dsym (TestRealDefinition.TestRealDefinition) 13:47:54 Test that we can find the implementation for an objective C type 13:47:54 ---------------------------------------------------------------------- 13:47:54 Traceback (most recent call last): 13:47:54 File "/Users/ec2-user/jenkins/workspace/llvm.org/lldb-cmake-matrix/llvm-project/lldb/packages/Python/lldbsuite/test/lldbtest.py", line 1804, in test_method 13:47:54 return attrvalue(self) 13:47:54 File "/Users/ec2-user/jenkins/workspace/llvm.org/lldb-cmake-matrix/llvm-project/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py", line 60, in test_frame_var_after_stop_at_implementation 13:47:54 self.expect( 13:47:54 File "/Users/ec2-user/jenkins/workspace/llvm.org/lldb-cmake-matrix/llvm-project/lldb/packages/Python/lldbsuite/test/lldbtest.py", line 2416, in expect 13:47:54 self.runCmd( 13:47:54 File "/Users/ec2-user/jenkins/workspace/llvm.org/lldb-cmake-matrix/llvm-project/lldb/packages/Python/lldbsuite/test/lldbtest.py", line 1006, in runCmd 13:47:54 self.assertTrue(self.res.Succeeded(), msg + output) 13:47:54 AssertionError: False is not true : Variable(s) displayed correctly 13:47:54 Error output: 13:47:54 error: :1:12: "_hidden_ivar" is not a member of "(id) _bar" 13:47:54 1 | foo->_bar->_hidden_ivar 13:47:54 | ^ ``` Original commit message: For a while, tests were run with `target.prefer-dynamic-value` overridden to `no-dynamic-values` – but the override was removed in [D132382](https://reviews.llvm.org/D132382). At that time, tests that failed were individually opted in to `no-dynamic-values`. I don't recall specifics about `TestRealDefinition`, but it currently fails with `no-dynamic-values`, and that is correct behavior. 
This change removes the `no-dynamic-values` override. --- .../API/lang/objc/real-definition/TestRealDefinition.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py b/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py index 6cbb9ddec264d..9fb2bea93e9c2 100644 --- a/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py +++ b/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py @@ -27,13 +27,11 @@ def test_frame_var_after_stop_at_interface(self): # Run at stop at main lldbutil.check_breakpoint(self, bpno=1, expected_hit_count=1) - self.runCmd("settings set target.prefer-dynamic-value no-dynamic-values") - # This should display correctly. self.expect( "frame variable foo->_bar->_hidden_ivar", VARIABLES_DISPLAYED_CORRECTLY, - substrs=["(NSString *)", "foo->_bar->_hidden_ivar = 0x"], + substrs=["foo->_bar->_hidden_ivar = 0x"], ) def test_frame_var_after_stop_at_implementation(self): @@ -54,11 +52,9 @@ def test_frame_var_after_stop_at_implementation(self): # Run at stop at main lldbutil.check_breakpoint(self, bpno=1, expected_hit_count=1) - self.runCmd("settings set target.prefer-dynamic-value no-dynamic-values") - # This should display correctly. self.expect( "frame variable foo->_bar->_hidden_ivar", VARIABLES_DISPLAYED_CORRECTLY, - substrs=["(NSString *)", "foo->_bar->_hidden_ivar = 0x"], + substrs=["foo->_bar->_hidden_ivar = 0x"], ) From 1bb089a4ed6a747386ab47f0ea71e2ea364b438b Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Fri, 31 Oct 2025 09:08:12 +0000 Subject: [PATCH 304/539] [lldb][TypeSystem] Remove count parameter from TypeSystem::GetEncoding (#165702) There were a couple of quirks with this parameter: 1. It wasn't being set consistently. E.g., vector types would be of count `1` but complex types would be `2`. Hence, it wasn't clear what count was referring to. 2. 
`count` was not being set if the input type was invalid, possibly leaving the input reference uninitialized. 3. Only one callsite actually made use of `count`, and that in itself seems like it could be improved (added a FIXME). If we ever need a "how many elements does this type represent", we can implement one with a new `TypeSystem` API that does exactly that. --- lldb/include/lldb/Symbol/CompilerType.h | 2 +- lldb/include/lldb/Symbol/Type.h | 2 +- lldb/include/lldb/Symbol/TypeSystem.h | 3 +-- .../Plugins/TypeSystem/Clang/TypeSystemClang.cpp | 10 +++------- lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h | 3 +-- lldb/source/Symbol/CompilerType.cpp | 10 +++++----- lldb/source/Symbol/Type.cpp | 4 ++-- lldb/source/ValueObject/ValueObject.cpp | 6 ++---- 8 files changed, 16 insertions(+), 24 deletions(-) diff --git a/lldb/include/lldb/Symbol/CompilerType.h b/lldb/include/lldb/Symbol/CompilerType.h index df8489a7fe582..1fcf255123d9f 100644 --- a/lldb/include/lldb/Symbol/CompilerType.h +++ b/lldb/include/lldb/Symbol/CompilerType.h @@ -400,7 +400,7 @@ class CompilerType { /// Return the size of the type in bits. 
llvm::Expected GetBitSize(ExecutionContextScope *exe_scope) const; - lldb::Encoding GetEncoding(uint64_t &count) const; + lldb::Encoding GetEncoding() const; lldb::Format GetFormat() const; diff --git a/lldb/include/lldb/Symbol/Type.h b/lldb/include/lldb/Symbol/Type.h index e657357b942f1..02b43e300a83e 100644 --- a/lldb/include/lldb/Symbol/Type.h +++ b/lldb/include/lldb/Symbol/Type.h @@ -507,7 +507,7 @@ class Type : public std::enable_shared_from_this, public UserID { lldb::Format GetFormat(); - lldb::Encoding GetEncoding(uint64_t &count); + lldb::Encoding GetEncoding(); SymbolContextScope *GetSymbolContextScope() { return m_context; } const SymbolContextScope *GetSymbolContextScope() const { return m_context; } diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h index 0ec3a28898329..40a80d8d09286 100644 --- a/lldb/include/lldb/Symbol/TypeSystem.h +++ b/lldb/include/lldb/Symbol/TypeSystem.h @@ -317,8 +317,7 @@ class TypeSystem : public PluginInterface, GetBitSize(lldb::opaque_compiler_type_t type, ExecutionContextScope *exe_scope) = 0; - virtual lldb::Encoding GetEncoding(lldb::opaque_compiler_type_t type, - uint64_t &count) = 0; + virtual lldb::Encoding GetEncoding(lldb::opaque_compiler_type_t type) = 0; virtual lldb::Format GetFormat(lldb::opaque_compiler_type_t type) = 0; diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 6ec054d5eac05..f5a8d84a3ce50 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -4864,12 +4864,10 @@ TypeSystemClang::GetTypeBitAlign(lldb::opaque_compiler_type_t type, return {}; } -lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, - uint64_t &count) { +lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type) { if (!type) return lldb::eEncodingInvalid; - count = 1; clang::QualType 
qual_type = RemoveWrappingTypes(GetCanonicalQualType(type)); switch (qual_type->getTypeClass()) { @@ -4903,7 +4901,6 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, case clang::Type::DependentVector: case clang::Type::ExtVector: case clang::Type::Vector: - // TODO: Set this to more than one??? break; case clang::Type::BitInt: @@ -5104,11 +5101,10 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, const clang::ComplexType *complex_type = qual_type->getAsComplexIntegerType(); if (complex_type) - encoding = GetType(complex_type->getElementType()).GetEncoding(count); + encoding = GetType(complex_type->getElementType()).GetEncoding(); else encoding = lldb::eEncodingSint; } - count = 2; return encoding; } @@ -5165,7 +5161,7 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, case clang::Type::SubstBuiltinTemplatePack: break; } - count = 0; + return lldb::eEncodingInvalid; } diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h index 9e0a54209345d..11107c0fea4f6 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h @@ -837,8 +837,7 @@ class TypeSystemClang : public TypeSystem { GetBitSize(lldb::opaque_compiler_type_t type, ExecutionContextScope *exe_scope) override; - lldb::Encoding GetEncoding(lldb::opaque_compiler_type_t type, - uint64_t &count) override; + lldb::Encoding GetEncoding(lldb::opaque_compiler_type_t type) override; lldb::Format GetFormat(lldb::opaque_compiler_type_t type) override; diff --git a/lldb/source/Symbol/CompilerType.cpp b/lldb/source/Symbol/CompilerType.cpp index 62c0ddf51c012..73da3127a98a3 100644 --- a/lldb/source/Symbol/CompilerType.cpp +++ b/lldb/source/Symbol/CompilerType.cpp @@ -793,10 +793,10 @@ CompilerType::GetTypeBitAlign(ExecutionContextScope *exe_scope) const { return {}; } -lldb::Encoding 
CompilerType::GetEncoding(uint64_t &count) const { +lldb::Encoding CompilerType::GetEncoding() const { if (IsValid()) if (auto type_system_sp = GetTypeSystem()) - return type_system_sp->GetEncoding(m_type, count); + return type_system_sp->GetEncoding(m_type); return lldb::eEncodingInvalid; } @@ -1093,10 +1093,10 @@ bool CompilerType::GetValueAsScalar(const lldb_private::DataExtractor &data, if (IsAggregateType()) { return false; // Aggregate types don't have scalar values } else { - uint64_t count = 0; - lldb::Encoding encoding = GetEncoding(count); + // FIXME: check that type is scalar instead of checking encoding? + lldb::Encoding encoding = GetEncoding(); - if (encoding == lldb::eEncodingInvalid || count != 1) + if (encoding == lldb::eEncodingInvalid || (GetTypeInfo() & eTypeIsComplex)) return false; auto byte_size_or_err = GetByteSize(exe_scope); diff --git a/lldb/source/Symbol/Type.cpp b/lldb/source/Symbol/Type.cpp index 952b2bdee1886..0c3246d238701 100644 --- a/lldb/source/Symbol/Type.cpp +++ b/lldb/source/Symbol/Type.cpp @@ -531,9 +531,9 @@ lldb::TypeSP Type::GetTypedefType() { lldb::Format Type::GetFormat() { return GetForwardCompilerType().GetFormat(); } -lldb::Encoding Type::GetEncoding(uint64_t &count) { +lldb::Encoding Type::GetEncoding() { // Make sure we resolve our type if it already hasn't been. 
- return GetForwardCompilerType().GetEncoding(count); + return GetForwardCompilerType().GetEncoding(); } bool Type::ReadFromMemory(ExecutionContext *exe_ctx, lldb::addr_t addr, diff --git a/lldb/source/ValueObject/ValueObject.cpp b/lldb/source/ValueObject/ValueObject.cpp index 38b9f77e6ddda..aeea32f19ee2c 100644 --- a/lldb/source/ValueObject/ValueObject.cpp +++ b/lldb/source/ValueObject/ValueObject.cpp @@ -790,8 +790,7 @@ bool ValueObject::SetData(DataExtractor &data, Status &error) { return false; } - uint64_t count = 0; - const Encoding encoding = GetCompilerType().GetEncoding(count); + const Encoding encoding = GetCompilerType().GetEncoding(); const size_t byte_size = llvm::expectedToOptional(GetByteSize()).value_or(0); @@ -1669,8 +1668,7 @@ bool ValueObject::SetValueFromCString(const char *value_str, Status &error) { return false; } - uint64_t count = 0; - const Encoding encoding = GetCompilerType().GetEncoding(count); + const Encoding encoding = GetCompilerType().GetEncoding(); const size_t byte_size = llvm::expectedToOptional(GetByteSize()).value_or(0); From ba98f96de284b103b0fa8f997e1affe4b47a95cf Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 31 Oct 2025 09:18:07 +0000 Subject: [PATCH 305/539] [GlobalISel] SBFX/UBFX does not create poison (#165675) This adds G_SBFX/G_UBFX to the list of instructions that do not generate poison, to allowing freeze to be hoisted above one. 
--- llvm/lib/CodeGen/GlobalISel/Utils.cpp | 2 + .../AArch64/GlobalISel/combine-freeze.mir | 47 +++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index ca82857319abc..5fab6ec506e94 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -1893,6 +1893,8 @@ static bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI, case TargetOpcode::G_UADDSAT: case TargetOpcode::G_SSUBSAT: case TargetOpcode::G_USUBSAT: + case TargetOpcode::G_SBFX: + case TargetOpcode::G_UBFX: return false; case TargetOpcode::G_SSHLSAT: case TargetOpcode::G_USHLSAT: diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir index 6b84a8488e478..1950e602ec83a 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir @@ -1440,3 +1440,50 @@ body: | %freeze:_(<4 x s32>) = G_FREEZE %extract $q0 = COPY %freeze(<4 x s32>) RET_ReallyLR implicit $x0 +... +--- +name: ubfx_does_not_generate_poison +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: ubfx_does_not_generate_poison + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: %c1:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]] + ; CHECK-NEXT: [[UBFX:%[0-9]+]]:_(s64) = G_UBFX [[FREEZE]], %c1(s64), %c1 + ; CHECK-NEXT: $x0 = COPY [[UBFX]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %c1:_(s64) = G_CONSTANT i64 1 + %1:_(s64) = G_UBFX %0, %c1, %c1 + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 + +... 
+---
+name: sbfx_does_not_generate_poison
+body: |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: sbfx_does_not_generate_poison
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c1:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]]
+    ; CHECK-NEXT: [[SBFX:%[0-9]+]]:_(s64) = G_SBFX [[FREEZE]], %c1(s64), %c1
+    ; CHECK-NEXT: $x0 = COPY [[SBFX]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %c1:_(s64) = G_CONSTANT i64 1
+    %1:_(s64) = G_SBFX %0, %c1, %c1
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+
+...

From 2eb08176b9ff64cdf240c985bbb070250d087766 Mon Sep 17 00:00:00 2001
From: aokblast
Date: Fri, 31 Oct 2025 17:18:28 +0800
Subject: [PATCH 306/539] [Object,ELF] Implement PN_XNUM extension for program
 headers (#162288)

In an ELF file, there is a possible extended header for phnum, shnum,
and shstrndx values larger than the maximum of 16 bits. This extended
header uses section 0 to record these fields in 32 bits. We implement
this feature so that programs relying on ELFFile::program_headers() can
get the correct number of segments. Also, consumers don't have to check
section 0 themselves; instead, they can use getPhNum() as an
alternative.
---
 llvm/include/llvm/BinaryFormat/ELF.h |   2 +
 llvm/include/llvm/Object/ELF.h       | 106 ++++++++++++++++++++++-----
 2 files changed, 89 insertions(+), 19 deletions(-)

diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 6ee6b666c1735..39e9611c7190e 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -1125,6 +1125,8 @@ struct Elf64_Shdr {
   Elf64_Xword sh_entsize;
 };
 
+enum { PN_XNUM = 0xffff };
+
 // Special section indices.
enum { SHN_UNDEF = 0, // Undefined, missing, irrelevant, or meaningless diff --git a/llvm/include/llvm/Object/ELF.h b/llvm/include/llvm/Object/ELF.h index 59f63eb6b5bb6..03d5ee21a71b4 100644 --- a/llvm/include/llvm/Object/ELF.h +++ b/llvm/include/llvm/Object/ELF.h @@ -278,9 +278,46 @@ class ELFFile { std::vector FakeSections; SmallString<0> FakeSectionStrings; + // When the number of program headers is >= PN_XNUM, the actual number is + // contained in the sh_info field of the section header at index 0. + std::optional RealPhNum; + // When the number of section headers is >= SHN_LORESERVE, the actual number + // is contained in the sh_size field of the section header at index 0. + std::optional RealShNum; + // When the section index of the section name table is >= SHN_LORESERVE, the + // actual number is contained in the sh_link field of the section header at + // index 0. + std::optional RealShStrNdx; + ELFFile(StringRef Object); + Error readShdrZero(); + public: + Expected getPhNum() const { + if (!RealPhNum) { + if (Error E = const_cast *>(this)->readShdrZero()) + return std::move(E); + } + return *RealPhNum; + } + + Expected getShNum() const { + if (!RealShNum) { + if (Error E = const_cast *>(this)->readShdrZero()) + return std::move(E); + } + return *RealShNum; + } + + Expected getShStrNdx() const { + if (!RealShStrNdx) { + if (Error E = const_cast *>(this)->readShdrZero()) + return std::move(E); + } + return *RealShStrNdx; + } + const Elf_Ehdr &getHeader() const { return *reinterpret_cast(base()); } @@ -379,22 +416,26 @@ class ELFFile { /// Iterate over program header table. 
Expected program_headers() const { - if (getHeader().e_phnum && getHeader().e_phentsize != sizeof(Elf_Phdr)) + uint32_t NumPh; + if (Expected PhNumOrErr = getPhNum()) + NumPh = *PhNumOrErr; + else + return PhNumOrErr.takeError(); + if (NumPh && getHeader().e_phentsize != sizeof(Elf_Phdr)) return createError("invalid e_phentsize: " + Twine(getHeader().e_phentsize)); - uint64_t HeadersSize = - (uint64_t)getHeader().e_phnum * getHeader().e_phentsize; + uint64_t HeadersSize = (uint64_t)NumPh * getHeader().e_phentsize; uint64_t PhOff = getHeader().e_phoff; if (PhOff + HeadersSize < PhOff || PhOff + HeadersSize > getBufSize()) return createError("program headers are longer than binary of size " + Twine(getBufSize()) + ": e_phoff = 0x" + Twine::utohexstr(getHeader().e_phoff) + - ", e_phnum = " + Twine(getHeader().e_phnum) + + ", e_phnum = " + Twine(NumPh) + ", e_phentsize = " + Twine(getHeader().e_phentsize)); auto *Begin = reinterpret_cast(base() + PhOff); - return ArrayRef(Begin, Begin + getHeader().e_phnum); + return ArrayRef(Begin, Begin + NumPh); } /// Get an iterator over notes in a program header. @@ -772,19 +813,15 @@ template Expected ELFFile::getSectionStringTable(Elf_Shdr_Range Sections, WarningHandler WarnHandler) const { - uint32_t Index = getHeader().e_shstrndx; - if (Index == ELF::SHN_XINDEX) { - // If the section name string table section index is greater than - // or equal to SHN_LORESERVE, then the actual index of the section name - // string table section is contained in the sh_link field of the section - // header at index 0. 
- if (Sections.empty()) - return createError( - "e_shstrndx == SHN_XINDEX, but the section header table is empty"); + Expected ShStrNdxOrErr = getShStrNdx(); + if (!ShStrNdxOrErr) + return ShStrNdxOrErr.takeError(); - Index = Sections[0].sh_link; - } + if (*ShStrNdxOrErr == ELF::SHN_XINDEX && Sections.empty()) + return createError( + "e_shstrndx == SHN_XINDEX, but the section header table is empty"); + uint32_t Index = *ShStrNdxOrErr; // There is no section name string table. Return FakeSectionStrings which // is non-empty if we have created fake sections. if (!Index) @@ -891,6 +928,35 @@ Expected ELFFile::getDynSymtabSize() const { template ELFFile::ELFFile(StringRef Object) : Buf(Object) {} +template Error ELFFile::readShdrZero() { + const Elf_Ehdr &Header = getHeader(); + + if ((Header.e_phnum == ELF::PN_XNUM || Header.e_shnum == 0 || + Header.e_shstrndx == ELF::SHN_XINDEX) && + Header.e_shoff != 0) { + // Pretend we have section 0 or sections() would call getShNum and thus + // become an infinite recursion. + RealShNum = 1; + auto SecOrErr = getSection(0); + if (!SecOrErr) { + RealShNum = std::nullopt; + return SecOrErr.takeError(); + } + + RealPhNum = + Header.e_phnum == ELF::PN_XNUM ? (*SecOrErr)->sh_info : Header.e_phnum; + RealShNum = Header.e_shnum == 0 ? (*SecOrErr)->sh_size : Header.e_shnum; + RealShStrNdx = Header.e_shstrndx == ELF::SHN_XINDEX ? 
(*SecOrErr)->sh_link + : Header.e_shstrndx; + } else { + RealPhNum = Header.e_phnum; + RealShNum = Header.e_shnum; + RealShStrNdx = Header.e_shstrndx; + } + + return Error::success(); +} + template Expected> ELFFile::create(StringRef Object) { if (sizeof(Elf_Ehdr) > Object.size()) @@ -956,9 +1022,11 @@ Expected ELFFile::sections() const { const Elf_Shdr *First = reinterpret_cast(base() + SectionTableOffset); - uintX_t NumSections = getHeader().e_shnum; - if (NumSections == 0) - NumSections = First->sh_size; + uintX_t NumSections = 0; + if (Expected ShNumOrErr = getShNum()) + NumSections = *ShNumOrErr; + else + return ShNumOrErr.takeError(); if (NumSections > UINT64_MAX / sizeof(Elf_Shdr)) return createError("invalid number of sections specified in the NULL " From c60eae8df62cfac44157eafb2b82d060a4bb4cb8 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Fri, 31 Oct 2025 10:25:39 +0100 Subject: [PATCH 307/539] [SDAG] Preserve InBounds in DAGCombines (#165424) This PR preserves the InBounds flag (#162477) where possible in PTRADD-related DAGCombines. We can't preserve them in all the cases that we could in the analogous GISel change (#152495) because SDAG usually represents pointers as integers, which means that pointer provenance is not preserved between PTRADD operations (see the discussion at PR #162477 for more details). This PR marks the places in the DAGCombiner where this is relevant explicitly. For SWDEV-516125. 
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 1ef5dc2863eb6..893556bd85240 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2715,6 +2715,12 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) { (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap; SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags); AddToWorklist(Add.getNode()); + // We can't set InBounds even if both original ptradds were InBounds and + // NUW: SDAG usually represents pointers as integers, therefore, the + // matched pattern behaves as if it had implicit casts: + // (ptradd inbounds (inttoptr (ptrtoint (ptradd inbounds x, y))), z) + // The outer inbounds ptradd might therefore rely on a provenance that x + // does not have. return DAG.getMemBasePlusOffset(X, Add, DL, Flags); } } @@ -2740,6 +2746,12 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) { // that. SDNodeFlags Flags = (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap; + // We can't set InBounds even if both original ptradds were InBounds and + // NUW: SDAG usually represents pointers as integers, therefore, the + // matched pattern behaves as if it had implicit casts: + // (ptradd inbounds (inttoptr (ptrtoint (ptradd inbounds GA, v))), c) + // The outer inbounds ptradd might therefore rely on a provenance that + // GA does not have. SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags); AddToWorklist(Inner.getNode()); return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags); @@ -2763,8 +2775,13 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) { bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); // If both additions in the original were NUW, reassociation preserves that. 
- SDNodeFlags ReassocFlags = - (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; + SDNodeFlags CommonFlags = N->getFlags() & N1->getFlags(); + SDNodeFlags ReassocFlags = CommonFlags & SDNodeFlags::NoUnsignedWrap; + if (CommonFlags.hasNoUnsignedWrap()) { + // If both operations are NUW and the PTRADD is inbounds, the offests are + // both non-negative, so the reassociated PTRADDs are also inbounds. + ReassocFlags |= N->getFlags() & SDNodeFlags::InBounds; + } if (ZIsConstant != YIsConstant) { if (YIsConstant) From 0c046816085ff9df5cd75d9f63483fb78ddb0b12 Mon Sep 17 00:00:00 2001 From: Jens Reidel Date: Fri, 31 Oct 2025 10:30:53 +0100 Subject: [PATCH 308/539] [PowerPC] Take ABI into account for data layout (#149725) Prior to this change, the data layout calculation would not account for explicitly set `-mabi=elfv2` on `powerpc64-unknown-linux-gnu`, a target that defaults to `elfv1`. This is loosely inspired by the equivalent ARM / RISC-V code. `make check-llvm` passes fine for me, though AFAICT all the tests specify the data layout manually so there isn't really a test for this and I am not really sure what the best way to go about adding one would be. 
Signed-off-by: Jens Reidel --- clang/lib/Basic/Targets/PPC.h | 45 ++++++++++++------- .../PowerPC/ppc64-abi-override-datalayout.c | 8 ++++ llvm/lib/Target/PowerPC/PPCTargetMachine.cpp | 5 ++- llvm/lib/TargetParser/TargetDataLayout.cpp | 7 +-- 4 files changed, 43 insertions(+), 22 deletions(-) create mode 100644 clang/test/CodeGen/PowerPC/ppc64-abi-override-datalayout.c diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index 846b240218172..d2eb9c5e12a90 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -445,27 +445,17 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo { LongWidth = LongAlign = PointerWidth = PointerAlign = 64; IntMaxType = SignedLong; Int64Type = SignedLong; - std::string DataLayout; if (Triple.isOSAIX()) { // TODO: Set appropriate ABI for AIX platform. - DataLayout = "E-m:a-Fi64-i64:64-i128:128-n32:64"; LongDoubleWidth = 64; LongDoubleAlign = DoubleAlign = 32; LongDoubleFormat = &llvm::APFloat::IEEEdouble(); - } else if ((Triple.getArch() == llvm::Triple::ppc64le)) { - DataLayout = "e-m:e-Fn32-i64:64-i128:128-n32:64"; + } else if ((Triple.getArch() == llvm::Triple::ppc64le) || + Triple.isPPC64ELFv2ABI()) { ABI = "elfv2"; } else { - DataLayout = "E-m:e"; - if (Triple.isPPC64ELFv2ABI()) { - ABI = "elfv2"; - DataLayout += "-Fn32"; - } else { - ABI = "elfv1"; - DataLayout += "-Fi64"; - } - DataLayout += "-i64:64-i128:128-n32:64"; + ABI = "elfv1"; } if (Triple.isOSFreeBSD() || Triple.isOSOpenBSD() || Triple.isMusl()) { @@ -473,14 +463,12 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo { LongDoubleFormat = &llvm::APFloat::IEEEdouble(); } - if (Triple.isOSAIX() || Triple.isOSLinux()) - DataLayout += "-S128-v256:256:256-v512:512:512"; - resetDataLayout(DataLayout); - // Newer PPC64 instruction sets support atomics up to 16 bytes. MaxAtomicPromoteWidth = 128; // Baseline PPC64 supports inlining atomics up to 8 bytes. 
MaxAtomicInlineWidth = 64; + + calculateDataLayout(); } void setMaxAtomicWidth() override { @@ -495,10 +483,33 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo { return TargetInfo::CharPtrBuiltinVaList; } + void calculateDataLayout() { + std::string DataLayout; + + if (getTriple().isOSAIX()) { + DataLayout = "E-m:a-Fi64-i64:64-i128:128-n32:64"; + } else if ((getTriple().getArch() == llvm::Triple::ppc64le)) { + DataLayout = "e-m:e-Fn32-i64:64-i128:128-n32:64"; + } else { + DataLayout = "E-m:e"; + if (ABI == "elfv2") { + DataLayout += "-Fn32"; + } else { + DataLayout += "-Fi64"; + } + DataLayout += "-i64:64-i128:128-n32:64"; + } + + if (getTriple().isOSAIX() || getTriple().isOSLinux()) + DataLayout += "-S128-v256:256:256-v512:512:512"; + resetDataLayout(DataLayout); + } + // PPC64 Linux-specific ABI options. bool setABI(const std::string &Name) override { if (Name == "elfv1" || Name == "elfv2") { ABI = Name; + calculateDataLayout(); return true; } return false; diff --git a/clang/test/CodeGen/PowerPC/ppc64-abi-override-datalayout.c b/clang/test/CodeGen/PowerPC/ppc64-abi-override-datalayout.c new file mode 100644 index 0000000000000..30b85d24a56fd --- /dev/null +++ b/clang/test/CodeGen/PowerPC/ppc64-abi-override-datalayout.c @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -triple powerpc64-unknown-linux-gnu -target-abi elfv2 %s -o - -emit-llvm | FileCheck %s + +// REQUIRES: powerpc-registered-target + +// Make sure that overriding the ABI to ELFv2 on a target that defaults to +// ELFv1 changes the data layout: + +// CHECK: target datalayout = "E-m:e-Fn32-i64:64-i128:128-n32:64-S128-v256:256:256-v512:512:512" diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 000d29610678f..4ff489d482fa5 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -296,8 +296,9 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, std::optional 
RM, std::optional CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, - computeFSAdditions(FS, OL, TT), Options, + : CodeGenTargetMachineImpl(T, + TT.computeDataLayout(Options.MCOptions.ABIName), + TT, CPU, computeFSAdditions(FS, OL, TT), Options, getEffectiveRelocModel(TT, RM), getEffectivePPCCodeModel(TT, CM, JIT), OL), TLOF(createTLOF(getTargetTriple())), diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp index d765d9ccb284d..d7359234b02f7 100644 --- a/llvm/lib/TargetParser/TargetDataLayout.cpp +++ b/llvm/lib/TargetParser/TargetDataLayout.cpp @@ -208,7 +208,7 @@ static std::string computeMipsDataLayout(const Triple &TT, StringRef ABIName) { return Ret; } -static std::string computePowerDataLayout(const Triple &T) { +static std::string computePowerDataLayout(const Triple &T, StringRef ABIName) { bool is64Bit = T.isPPC64(); std::string Ret; @@ -228,7 +228,8 @@ static std::string computePowerDataLayout(const Triple &T) { // If the target ABI uses function descriptors, then the alignment of function // pointers depends on the alignment used to emit the descriptor. Otherwise, // function pointers are aligned to 32 bits because the instructions must be. - if ((T.getArch() == Triple::ppc64 && !T.isPPC64ELFv2ABI())) { + if ((T.getArch() == Triple::ppc64 && + (!T.isPPC64ELFv2ABI() && ABIName != "elfv2"))) { Ret += "-Fi64"; } else if (T.isOSAIX()) { Ret += is64Bit ? 
"-Fi64" : "-Fi32"; @@ -573,7 +574,7 @@ std::string Triple::computeDataLayout(StringRef ABIName) const { case Triple::ppcle: case Triple::ppc64: case Triple::ppc64le: - return computePowerDataLayout(*this); + return computePowerDataLayout(*this, ABIName); case Triple::r600: case Triple::amdgcn: return computeAMDDataLayout(*this); From 6afb6272385f0585ab0baac724d1591ff56c8069 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Fri, 31 Oct 2025 09:31:18 +0000 Subject: [PATCH 309/539] [lld][ARM] Don't emit veneers for wraparound branches. (#165263) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If an instruction at the high end of the 32-bit address space branches to one at the low end, then the branch can be within range for a B or BL instruction, and doesn't need a veneer. `ARM::inBranchRange` was failing to detect this because it calculated the offset as an int64_t, so that the offset was a small value ± 2^32 instead of just the small value. Fixes #165211. --- lld/ELF/Arch/ARM.cpp | 2 +- lld/test/ELF/arm-wraparound-veneer.s | 102 +++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 lld/test/ELF/arm-wraparound-veneer.s diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 91a673f13d68e..6c4290ff1e448 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -472,7 +472,7 @@ bool ARM::inBranchRange(RelType type, uint64_t src, uint64_t dst) const { // Bit 0 == 1 denotes Thumb state, it is not part of the range. 
dst &= ~0x1; - int64_t offset = dst - src; + int64_t offset = llvm::SignExtend64<32>(dst - src); switch (type) { case R_ARM_PC24: case R_ARM_PLT32: diff --git a/lld/test/ELF/arm-wraparound-veneer.s b/lld/test/ELF/arm-wraparound-veneer.s new file mode 100644 index 0000000000000..74dd6f29d8170 --- /dev/null +++ b/lld/test/ELF/arm-wraparound-veneer.s @@ -0,0 +1,102 @@ +// REQUIRES: arm +// RUN: rm -rf %t && split-file %s %t && cd %t +// RUN: llvm-mc -filetype=obj -triple=armv7-none-eabi code.s -o code.o +// RUN: ld.lld -T unsigned1.ld code.o -o unsigned1.elf +// RUN: llvm-objdump --triple=armv7 --no-show-raw-insn -d unsigned1.elf | FileCheck %s --check-prefix=UNSIGNED1 +// RUN: ld.lld -T unsigned2.ld code.o -o unsigned2.elf +// RUN: llvm-objdump --triple=armv7 --no-show-raw-insn -d unsigned2.elf | FileCheck %s --check-prefix=UNSIGNED2 +// RUN: ld.lld -T signed1.ld code.o -o signed1.elf +// RUN: llvm-objdump --triple=armv7 --no-show-raw-insn -d signed1.elf | FileCheck %s --check-prefix=SIGNED1 +// RUN: ld.lld -T signed2.ld code.o -o signed2.elf +// RUN: llvm-objdump --triple=armv7 --no-show-raw-insn -d signed2.elf | FileCheck %s --check-prefix=SIGNED2 + +/// The aim of this test is to ensure that a BL instruction near one end of the +/// address space can reach a function at the extreme other end, directly, +/// using a branch offset that makes the address wrap round. We check this at +/// both the unsigned wraparound point (one address near 0 and the other near +/// 0xFFFFFFFF) and the signed wraparound point (addresses either side of +/// 0x80000000), crossing the boundary in both directions. In all four cases we +/// expect a direct branch with no veneer. 
+ +// UNSIGNED1: Disassembly of section .text.lowaddr: +// UNSIGNED1: : +// UNSIGNED1: 10000: bx lr +// +// UNSIGNED1: Disassembly of section .text.highaddr: +// UNSIGNED1: <_start>: +// UNSIGNED1: ffff0000: bl 0x10000 +// UNSIGNED1-NEXT: bx lr + +// UNSIGNED2: Disassembly of section .text.lowaddr: +// UNSIGNED2: <_start>: +// UNSIGNED2: 10000: bl 0xffff0000 +// UNSIGNED2-NEXT: bx lr +// +// UNSIGNED2: Disassembly of section .text.highaddr: +// UNSIGNED2: : +// UNSIGNED2: ffff0000: bx lr + +// SIGNED1: Disassembly of section .text.posaddr: +// SIGNED1: <_start>: +// SIGNED1: 7fff0000: bl 0x80010000 +// SIGNED1-NEXT: bx lr +// +// SIGNED1: Disassembly of section .text.negaddr: +// SIGNED1: : +// SIGNED1: 80010000: bx lr + +// SIGNED2: Disassembly of section .text.posaddr: +// SIGNED2: : +// SIGNED2: 7fff0000: bx lr +// +// SIGNED2: Disassembly of section .text.negaddr: +// SIGNED2: <_start>: +// SIGNED2: 80010000: bl 0x7fff0000 +// SIGNED2-NEXT: bx lr + +//--- code.s + + .section .text.callee, "ax", %progbits + .global func + .type func, %function +func: + bx lr + + .section .text.caller, "ax", %progbits + .global _start + .type _start, %function +_start: + bl func + bx lr + +//--- unsigned1.ld + +ENTRY(_start) +SECTIONS { + .text.lowaddr 0x00010000 : AT(0x00010000) { *(.text.callee) } + .text.highaddr 0xffff0000 : AT(0xffff0000) { *(.text.caller) } +} + +//--- unsigned2.ld + +ENTRY(_start) +SECTIONS { + .text.lowaddr 0x00010000 : AT(0x00010000) { *(.text.caller) } + .text.highaddr 0xffff0000 : AT(0xffff0000) { *(.text.callee) } +} + +//--- signed1.ld + +ENTRY(_start) +SECTIONS { + .text.posaddr 0x7fff0000 : AT(0x7fff0000) { *(.text.caller) } + .text.negaddr 0x80010000 : AT(0x80010000) { *(.text.callee) } +} + +//--- signed2.ld + +ENTRY(_start) +SECTIONS { + .text.posaddr 0x7fff0000 : AT(0x7fff0000) { *(.text.callee) } + .text.negaddr 0x80010000 : AT(0x80010000) { *(.text.caller) } +} From 99afa70ddc3f66329468536d9597d8740c23e676 Mon Sep 17 00:00:00 2001 From: nerix 
Date: Fri, 31 Oct 2025 10:33:37 +0100 Subject: [PATCH 310/539] [LLDB][NativePDB] Estimate symbol sizes (#165727) In #165604, a test was skipped on Windows, because the native PDB plugin didn't set sizes on symbols. While the test isn't compiled with debug info, it's linked with `-gdwarf`, causing a PDB to be created on Windows. This PDB will only contain the public symbols (written by the linker) and section information. The symbols themselves don't have a size, however the DIA SDK sets a size for them. It seems like, for these data symbols, the size given from DIA is the distance to the next symbol (or the section end). This PR implements the naive approach for the native plugin. The main difference is in function/code symbols. There, DIA searches for a corresponding `S_GPROC32` which have a "code size" that is sometimes slightly smaller than the difference to the next symbol. --- .../NativePDB/SymbolFileNativePDB.cpp | 67 ++++++++++++++----- .../multiple-slides/TestMultipleSlides.py | 7 +- .../Shell/SymbolFile/NativePDB/symtab.cpp | 30 ++++----- 3 files changed, 69 insertions(+), 35 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp index e76b7a3cf274a..aaec1600dacff 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp +++ b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp @@ -1130,7 +1130,35 @@ void SymbolFileNativePDB::AddSymbols(Symtab &symtab) { if (!section_list) return; - for (auto pid : m_index->publics().getPublicsTable()) { + PublicSym32 last_sym; + size_t last_sym_idx = 0; + lldb::SectionSP section_sp; + + // To estimate the size of a symbol, we use the difference to the next symbol. + // If there's no next symbol or the section/segment changed, the symbol will + // take the remaining space. The estimate can be too high in case there's + // padding between symbols. 
This similar to the algorithm used by the DIA + // SDK. + auto finish_last_symbol = [&](const PublicSym32 *next) { + if (!section_sp) + return; + Symbol *last = symtab.SymbolAtIndex(last_sym_idx); + if (!last) + return; + + if (next && last_sym.Segment == next->Segment) { + assert(last_sym.Offset <= next->Offset); + last->SetByteSize(next->Offset - last_sym.Offset); + } else { + // the last symbol was the last in its section + assert(section_sp->GetByteSize() >= last_sym.Offset); + assert(!next || next->Segment > last_sym.Segment); + last->SetByteSize(section_sp->GetByteSize() - last_sym.Offset); + } + }; + + // The address map is sorted by the address of a symbol. + for (auto pid : m_index->publics().getAddressMap()) { PdbGlobalSymId global{pid, true}; CVSymbol sym = m_index->ReadSymbolRecord(global); auto kind = sym.kind(); @@ -1138,8 +1166,11 @@ void SymbolFileNativePDB::AddSymbols(Symtab &symtab) { continue; PublicSym32 pub = llvm::cantFail(SymbolDeserializer::deserializeAs(sym)); + finish_last_symbol(&pub); + + if (!section_sp || last_sym.Segment != pub.Segment) + section_sp = section_list->FindSectionByID(pub.Segment); - auto section_sp = section_list->FindSectionByID(pub.Segment); if (!section_sp) continue; @@ -1148,20 +1179,24 @@ void SymbolFileNativePDB::AddSymbols(Symtab &symtab) { (pub.Flags & PublicSymFlags::Code) != PublicSymFlags::None) type = eSymbolTypeCode; - symtab.AddSymbol(Symbol(/*symID=*/pid, - /*name=*/pub.Name, - /*type=*/type, - /*external=*/true, - /*is_debug=*/true, - /*is_trampoline=*/false, - /*is_artificial=*/false, - /*section_sp=*/section_sp, - /*value=*/pub.Offset, - /*size=*/0, - /*size_is_valid=*/false, - /*contains_linker_annotations=*/false, - /*flags=*/0)); - } + last_sym_idx = + symtab.AddSymbol(Symbol(/*symID=*/pid, + /*name=*/pub.Name, + /*type=*/type, + /*external=*/true, + /*is_debug=*/true, + /*is_trampoline=*/false, + /*is_artificial=*/false, + /*section_sp=*/section_sp, + /*value=*/pub.Offset, + /*size=*/0, + 
/*size_is_valid=*/false, + /*contains_linker_annotations=*/false, + /*flags=*/0)); + last_sym = pub; + } + + finish_last_symbol(nullptr); } size_t SymbolFileNativePDB::ParseFunctions(CompileUnit &comp_unit) { diff --git a/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py b/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py index 7fd2ff4229004..5fd2b767a6237 100644 --- a/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py +++ b/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py @@ -12,10 +12,6 @@ class MultipleSlidesTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True - # The intermediate object main.o is compiled without debug info, but - # a.out is linked with `-gdwarf` on Windows. This creates a PDB. - # However, in the native PDB plugin, the symbols don't have a size. - @expectedFailureWindows def test_mulitple_slides(self): """Test that a binary can be slid multiple times correctly.""" self.build() @@ -33,10 +29,13 @@ def test_mulitple_slides(self): first_sym.GetEndAddress().GetOffset() - first_sym.GetStartAddress().GetOffset() ) + int_size = target.FindFirstType("int").GetByteSize() + self.assertGreaterEqual(first_size, 2048 * int_size) second_size = ( second_sym.GetEndAddress().GetOffset() - second_sym.GetStartAddress().GetOffset() ) + self.assertGreaterEqual(second_size, 2048 * int_size) # View the first element of `first` and `second` while # they have no load address set. 
diff --git a/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp b/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp index beb5ae2f90256..75c59c560fad9 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp @@ -42,18 +42,18 @@ int main(int argc, char **argv) { return ns::a_function() + b.b_func(); } -// CHECK-DAG: Code {{.*}} main -// CHECK-DAG: Code {{.*}} ?b_func@?$B@F@ns@@QEBAHXZ -// CHECK-DAG: Code {{.*}} ?something@A@@QEAAXXZ -// CHECK-DAG: Code {{.*}} ??_GDyn@ns@@UEAAPEAXI@Z -// CHECK-DAG: Code {{.*}} ??2@YAPEAX_K@Z -// CHECK-DAG: Code {{.*}} ??3@YAXPEAX_K@Z -// CHECK-DAG: Code {{.*}} ?static_fn@C@?$B@H@ns@@SAHXZ -// CHECK-DAG: Code {{.*}} ?a_function@ns@@YAHXZ -// CHECK-DAG: Code {{.*}} ?static_fn@C@?$B@_N@ns@@SAHXZ -// CHECK-DAG: Code {{.*}} ??1Dyn@ns@@UEAA@XZ -// CHECK-DAG: Code {{.*}} ??0Dyn@ns@@QEAA@XZ -// CHECK-DAG: Data {{.*}} ?global_int@@3HA -// CHECK-DAG: Data {{.*}} ??_7Dyn@ns@@6B@ -// CHECK-DAG: Data {{.*}} ?global_a@@3UA@@A -// CHECK-DAG: Data {{.*}} ?global_c@@3UC@?$B@_J@ns@@A +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 main +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?b_func@?$B@F@ns@@QEBAHXZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?something@A@@QEAAXXZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??_GDyn@ns@@UEAAPEAXI@Z +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??2@YAPEAX_K@Z +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??3@YAXPEAX_K@Z +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?static_fn@C@?$B@H@ns@@SAHXZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?a_function@ns@@YAHXZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?static_fn@C@?$B@_N@ns@@SAHXZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 
??1Dyn@ns@@UEAA@XZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??0Dyn@ns@@QEAA@XZ +// CHECK-DAG: Data 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?global_int@@3HA +// CHECK-DAG: Data 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??_7Dyn@ns@@6B@ +// CHECK-DAG: Data 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?global_a@@3UA@@A +// CHECK-DAG: Data 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?global_c@@3UC@?$B@_J@ns@@A From 87ad9dda3e4815e10d3a6c19d2d94019894bcdf4 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Fri, 31 Oct 2025 09:44:13 +0000 Subject: [PATCH 311/539] [llvm][dwarfdump] Show name of referenced DW_TAG_APPLE_property (#165537) This patch makes `dwarfdump` show the `DW_AT_APPLE_property_name` of a referenced `DW_TAG_APPLE_property` (similar to how we show the name of a referenced `DW_AT_type`). Eventually we'll extend this to the DWARFv6 property tags too. Before: ``` 0x00000013: DW_TAG_APPLE_property DW_AT_APPLE_property_name ("propertyName") 0x0000001b: DW_TAG_member DW_AT_name ("_ivar") DW_AT_APPLE_property (0x00000013) ``` After: ``` 0x00000013: DW_TAG_APPLE_property DW_AT_APPLE_property_name ("propertyName") 0x0000001b: DW_TAG_member DW_AT_name ("_ivar") DW_AT_APPLE_property (0x00000013 "propertyName") ``` --- llvm/lib/DebugInfo/DWARF/DWARFDie.cpp | 28 ++++ .../AArch64/DW_AT_APPLE_property.s | 126 ++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 llvm/test/tools/llvm-dwarfdump/AArch64/DW_AT_APPLE_property.s diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index db5cc37c93f90..6c78ef05e1b61 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -129,6 +129,25 @@ prettyLanguageVersionString(const DWARFAttribute &AttrValue, static_cast(*LName), *LVersion); } +static llvm::Expected +getApplePropertyName(const DWARFDie &PropDIE) { + if (!PropDIE) + return llvm::createStringError("invalid 
DIE"); + + if (PropDIE.getTag() != DW_TAG_APPLE_property) + return llvm::createStringError("not referencing a DW_TAG_APPLE_property"); + + auto PropNameForm = PropDIE.find(DW_AT_APPLE_property_name); + if (!PropNameForm) + return ""; + + auto NameOrErr = PropNameForm->getAsCString(); + if (!NameOrErr) + return NameOrErr.takeError(); + + return *NameOrErr; +} + static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, const DWARFAttribute &AttrValue, unsigned Indent, DIDumpOptions DumpOpts) { @@ -233,6 +252,15 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, Die.getAttributeValueAsReferencedDie(FormValue).getName( DINameKind::LinkageName)) OS << Space << "\"" << Name << '\"'; + } else if (Attr == DW_AT_APPLE_property) { + auto PropDIE = Die.getAttributeValueAsReferencedDie(FormValue); + if (auto PropNameOrErr = getApplePropertyName(PropDIE)) + OS << Space << "\"" << *PropNameOrErr << '\"'; + else + DumpOpts.RecoverableErrorHandler(createStringError( + errc::invalid_argument, + llvm::formatv("decoding DW_AT_APPLE_property_name: {}", + toString(PropNameOrErr.takeError())))); } else if (Attr == DW_AT_type || Attr == DW_AT_containing_type) { DWARFDie D = resolveReferencedType(Die, FormValue); if (D && !D.isNULL()) { diff --git a/llvm/test/tools/llvm-dwarfdump/AArch64/DW_AT_APPLE_property.s b/llvm/test/tools/llvm-dwarfdump/AArch64/DW_AT_APPLE_property.s new file mode 100644 index 0000000000000..6c38791b0a083 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/AArch64/DW_AT_APPLE_property.s @@ -0,0 +1,126 @@ +# Checks that we correctly display the DW_AT_APPLE_property_name of a +# referenced DW_TAG_APPLE_property. 
+# +# RUN: llvm-mc -triple=aarch64--darwin -filetype=obj -o %t.o < %s +# RUN: not llvm-dwarfdump %t.o 2> %t.errs.txt | FileCheck %s +# RUN: FileCheck %s --check-prefix=ERRORS < %t.errs.txt + +# CHECK: 0x[[PROP_REF:[0-9a-f]+]]: DW_TAG_APPLE_property +# CHECK-NEXT: DW_AT_APPLE_property_name ("autoSynthProp") +# +# CHECK: 0x[[NO_NAME_PROP:[0-9a-f]+]]: DW_TAG_APPLE_property +# CHECK-NOT: DW_AT_APPLE_property_name +# +# CHECK: 0x[[INVALID_STRP:[0-9a-f]+]]: DW_TAG_APPLE_property +# CHECK-NEXT: DW_AT_APPLE_property_name +# +# CHECK: DW_TAG_member +# CHECK: DW_AT_APPLE_property (0x[[PROP_REF]] "autoSynthProp") +# CHECK: DW_AT_APPLE_property (0x[[NO_NAME_PROP]] "") +# CHECK: DW_AT_APPLE_property (0x{{.*}}) +# CHECK: DW_AT_APPLE_property (0x{{.*}}) +# CHECK: DW_AT_APPLE_property (0x[[INVALID_STRP]]) + +# ERRORS: error: decoding DW_AT_APPLE_property_name: not referencing a DW_TAG_APPLE_property +# ERRORS: error: decoding DW_AT_APPLE_property_name: invalid DIE +# ERRORS: error: decoding DW_AT_APPLE_property_name: DW_FORM_strp offset 102 is beyond .debug_str bounds + + .section __DWARF,__debug_abbrev,regular,debug +Lsection_abbrev: + .byte 1 ; Abbreviation Code + .byte 17 ; DW_TAG_compile_unit + .byte 1 ; DW_CHILDREN_yes + .byte 114 ; DW_AT_str_offsets_base + .byte 23 ; DW_FORM_sec_offset + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 2 ; Abbreviation Code + .byte 19 ; DW_TAG_structure_type + .byte 1 ; DW_CHILDREN_yes + .byte 3 ; DW_AT_name + .byte 37 ; DW_FORM_strx1 + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 3 ; Abbreviation Code + .ascii "\200\204\001" ; DW_TAG_APPLE_property + .byte 0 ; DW_CHILDREN_no + .ascii "\350\177" ; DW_AT_APPLE_property_name + .byte 37 ; DW_FORM_strx1 + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 4 ; Abbreviation Code + .ascii "\200\204\001" ; DW_TAG_APPLE_property + .byte 0 ; DW_CHILDREN_no + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 5 ; Abbreviation Code + .ascii "\200\204\001" ; DW_TAG_APPLE_property + .byte 0 ; DW_CHILDREN_no + .ascii 
"\350\177" ; DW_AT_APPLE_property_name + .byte 14 ; DW_FORM_strp + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 6 ; Abbreviation Code + .byte 13 ; DW_TAG_member + .byte 0 ; DW_CHILDREN_no + .byte 3 ; DW_AT_name + .byte 37 ; DW_FORM_strx1 + .ascii "\355\177" ; DW_AT_APPLE_property + .byte 19 ; DW_FORM_ref4 + .ascii "\355\177" ; DW_AT_APPLE_property + .byte 19 ; DW_FORM_ref4 + .ascii "\355\177" ; DW_AT_APPLE_property + .byte 19 ; DW_FORM_ref4 + .ascii "\355\177" ; DW_AT_APPLE_property + .byte 19 ; DW_FORM_ref4 + .ascii "\355\177" ; DW_AT_APPLE_property + .byte 19 ; DW_FORM_ref4 + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 0 ; EOM(3) + .section __DWARF,__debug_info,regular,debug +Lsection_info: +Lcu_begin0: +Lset0 = Ldebug_info_end0-Ldebug_info_start0 ; Length of Unit + .long Lset0 +Ldebug_info_start0: + .short 5 ; DWARF version number + .byte 1 ; DWARF Unit Type + .byte 8 ; Address Size (in bytes) +Lset1 = Lsection_abbrev-Lsection_abbrev ; Offset Into Abbrev. Section + .long Lset1 + .byte 1 ; Abbrev [1] DW_TAG_compile_unit +Lset2 = Lstr_offsets_base0-Lsection_str_off ; DW_AT_str_offsets_base + .long Lset2 + .byte 2 ; Abbrev [2] DW_TAG_structure_type + .byte 2 ; DW_AT_name + .byte 3 ; Abbrev [3] DW_TAG_APPLE_property + .byte 0 ; DW_AT_APPLE_property_name + .byte 4 ; Abbrev [4] DW_TAG_APPLE_property + .byte 5 ; Abbrev [5] DW_TAG_APPLE_property + .long 102 ; DW_AT_APPLE_property_name + .byte 6 ; Abbrev [6] DW_TAG_member + .byte 1 ; DW_AT_name + .long 19 ; DW_AT_APPLE_property + .long 21 ; DW_AT_APPLE_property + .long 17 ; DW_AT_APPLE_property + .long 0 ; DW_AT_APPLE_property + .long 22 ; DW_AT_APPLE_property + .byte 0 ; End Of Children Mark + .byte 0 ; End Of Children Mark +Ldebug_info_end0: + .section __DWARF,__debug_str_offs,regular,debug +Lsection_str_off: + .long 16 ; Length of String Offsets Set + .short 5 + .short 0 +Lstr_offsets_base0: + .section __DWARF,__debug_str,regular,debug +Linfo_string: + .asciz "autoSynthProp" ; string offset=0 + .asciz "_var" ; string 
offset=14 + .asciz "Foo" ; string offset=19 + .section __DWARF,__debug_str_offs,regular,debug + .long 0 + .long 14 + .long 19 From 28cec4dbebea175e6ea80e785ddc2ff9c3540bda Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 31 Oct 2025 09:45:55 +0000 Subject: [PATCH 312/539] [llvm][clang] Remove handling of Python2 ConfigParser module name (#163727) LLVM now requires Python >= 3.8, and ConfigParser was renamed to configparser in 3.0: https://docs.python.org/3/whatsnew/3.0.html#library-changes A few places imported it under the Python2 name even for Python3, I have swapped those to the Python3 name. This was reported by https://pypi.org/project/vermin/ as the file having incompatible versions. Since once import is 2.x and one is 3.x. --- clang/tools/scan-view/share/ScanView.py | 6 +----- clang/utils/check_cfc/check_cfc.py | 6 +----- .../lit/tests/Inputs/test-data-micro/dummy_format.py | 8 ++------ llvm/utils/lit/tests/Inputs/test-data/dummy_format.py | 8 ++------ llvm/utils/lit/tests/Inputs/xunit-output/dummy_format.py | 8 ++------ 5 files changed, 8 insertions(+), 28 deletions(-) diff --git a/clang/tools/scan-view/share/ScanView.py b/clang/tools/scan-view/share/ScanView.py index a89bf3f24fc5a..c395b9590e0ee 100644 --- a/clang/tools/scan-view/share/ScanView.py +++ b/clang/tools/scan-view/share/ScanView.py @@ -29,11 +29,7 @@ import itertools import Reporter - -try: - import configparser -except ImportError: - import ConfigParser as configparser +import configparser ### # Various patterns matched or replaced by server. 
diff --git a/clang/utils/check_cfc/check_cfc.py b/clang/utils/check_cfc/check_cfc.py index 8d42ec532bbb7..7658f6c27009b 100755 --- a/clang/utils/check_cfc/check_cfc.py +++ b/clang/utils/check_cfc/check_cfc.py @@ -56,11 +56,7 @@ import subprocess import sys import tempfile - -try: - import configparser -except ImportError: - import ConfigParser as configparser +import configparser import io import obj_diff diff --git a/llvm/utils/lit/tests/Inputs/test-data-micro/dummy_format.py b/llvm/utils/lit/tests/Inputs/test-data-micro/dummy_format.py index b400083a0d967..27b738edf8e14 100644 --- a/llvm/utils/lit/tests/Inputs/test-data-micro/dummy_format.py +++ b/llvm/utils/lit/tests/Inputs/test-data-micro/dummy_format.py @@ -1,9 +1,5 @@ import os - -try: - import ConfigParser -except ImportError: - import configparser as ConfigParser +import configparser import lit.formats import lit.Test @@ -16,7 +12,7 @@ def execute(self, test, lit_config): source_path = test.getSourcePath() - cfg = ConfigParser.ConfigParser() + cfg = configparser.ConfigParser() cfg.read(source_path) # Create the basic test result. diff --git a/llvm/utils/lit/tests/Inputs/test-data/dummy_format.py b/llvm/utils/lit/tests/Inputs/test-data/dummy_format.py index 30bd1814a6a42..b4c1b92637d01 100644 --- a/llvm/utils/lit/tests/Inputs/test-data/dummy_format.py +++ b/llvm/utils/lit/tests/Inputs/test-data/dummy_format.py @@ -1,9 +1,5 @@ import os - -try: - import ConfigParser -except ImportError: - import configparser as ConfigParser +import configparser import lit.formats import lit.Test @@ -16,7 +12,7 @@ def execute(self, test, lit_config): source_path = test.getSourcePath() - cfg = ConfigParser.ConfigParser() + cfg = configparser.ConfigParser() cfg.read(source_path) # Create the basic test result. 
diff --git a/llvm/utils/lit/tests/Inputs/xunit-output/dummy_format.py b/llvm/utils/lit/tests/Inputs/xunit-output/dummy_format.py index efac0b561c44b..43da0973df614 100644 --- a/llvm/utils/lit/tests/Inputs/xunit-output/dummy_format.py +++ b/llvm/utils/lit/tests/Inputs/xunit-output/dummy_format.py @@ -1,9 +1,5 @@ import os - -try: - import ConfigParser -except ImportError: - import configparser as ConfigParser +import configparser import lit.formats import lit.Test @@ -16,7 +12,7 @@ def execute(self, test, lit_config): source_path = test.getSourcePath() - cfg = ConfigParser.ConfigParser() + cfg = configparser.ConfigParser() cfg.read(source_path) # Create the basic test result. From 35146363ae0418eeb09b4cf41a601888eaac948b Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Fri, 31 Oct 2025 09:46:22 +0000 Subject: [PATCH 313/539] [lldb][TypeSystem] Fix GetTypeInfo for vector and complex types (#165837) We were setting these bits inverted. Not sure how this bug actually manifests, I just noticed when working on https://github.com/llvm/llvm-project/pull/165707. I suspect these types just aren't very frequently used. 
--- .../TypeSystem/Clang/TypeSystemClang.cpp | 21 +++++++++------- lldb/unittests/Symbol/TestTypeSystemClang.cpp | 24 +++++++++++++++++++ 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index f5a8d84a3ce50..4ec987c8d0103 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -3965,9 +3965,9 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type, if (complex_type) { clang::QualType complex_element_type(complex_type->getElementType()); if (complex_element_type->isIntegerType()) - complex_type_flags |= eTypeIsFloat; - else if (complex_element_type->isFloatingType()) complex_type_flags |= eTypeIsInteger; + else if (complex_element_type->isFloatingType()) + complex_type_flags |= eTypeIsFloat; } return complex_type_flags; } break; @@ -4062,12 +4062,17 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type, uint32_t vector_type_flags = eTypeHasChildren | eTypeIsVector; const clang::VectorType *vector_type = llvm::dyn_cast( qual_type->getCanonicalTypeInternal()); - if (vector_type) { - if (vector_type->isIntegerType()) - vector_type_flags |= eTypeIsFloat; - else if (vector_type->isFloatingType()) - vector_type_flags |= eTypeIsInteger; - } + if (!vector_type) + return 0; + + QualType element_type = vector_type->getElementType(); + if (element_type.isNull()) + return 0; + + if (element_type->isIntegerType()) + vector_type_flags |= eTypeIsInteger; + else if (element_type->isFloatingType()) + vector_type_flags |= eTypeIsFloat; return vector_type_flags; } default: diff --git a/lldb/unittests/Symbol/TestTypeSystemClang.cpp b/lldb/unittests/Symbol/TestTypeSystemClang.cpp index 1981e912fa4fa..4de595fd62825 100644 --- a/lldb/unittests/Symbol/TestTypeSystemClang.cpp +++ b/lldb/unittests/Symbol/TestTypeSystemClang.cpp @@ -1123,6 +1123,30 @@ 
TEST_F(TestTypeSystemClang, AddMethodToCXXRecordType_ParmVarDecls) { EXPECT_EQ(method_it->getParamDecl(1)->getDeclContext(), *method_it); } +TEST_F(TestTypeSystemClang, TestGetTypeInfo) { + // Tests TypeSystemClang::GetTypeInfo + + const ASTContext &ast = m_ast->getASTContext(); + + CompilerType complex_int = m_ast->GetType(ast.getComplexType(ast.IntTy)); + EXPECT_EQ(complex_int.GetTypeInfo(), + (eTypeIsInteger | eTypeIsComplex | eTypeIsBuiltIn | eTypeHasValue)); + + CompilerType complex_float = m_ast->GetType(ast.getComplexType(ast.FloatTy)); + EXPECT_EQ(complex_float.GetTypeInfo(), + (eTypeIsFloat | eTypeIsComplex | eTypeIsBuiltIn | eTypeHasValue)); + + CompilerType vector_of_int = + m_ast->GetType(ast.getVectorType(ast.IntTy, 1, VectorKind::Generic)); + EXPECT_EQ(vector_of_int.GetTypeInfo(), + (eTypeIsInteger | eTypeIsVector | eTypeHasChildren)); + + CompilerType vector_of_float = + m_ast->GetType(ast.getVectorType(ast.FloatTy, 1, VectorKind::Generic)); + EXPECT_EQ(vector_of_float.GetTypeInfo(), + (eTypeIsFloat | eTypeIsVector | eTypeHasChildren)); +} + TEST_F(TestTypeSystemClang, AsmLabel_CtorDtor) { // Tests TypeSystemClang::DeclGetMangledName for constructors/destructors // with and without AsmLabels. From 557682ed3ca17f025c69cbd9824f4faea81f383d Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 31 Oct 2025 09:46:36 +0000 Subject: [PATCH 314/539] [clang][docs] Remove Python2 import handler in dump_ast_matchers.py (#163730) LLVM requires Python >=3.8 and in Python 3.0 urllib2 was renamed to urllib. 
https://docs.python.org/3/whatsnew/3.0.html#library-changes --- clang/docs/tools/dump_ast_matchers.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/clang/docs/tools/dump_ast_matchers.py b/clang/docs/tools/dump_ast_matchers.py index 46b7bb718ba08..5db6826070934 100755 --- a/clang/docs/tools/dump_ast_matchers.py +++ b/clang/docs/tools/dump_ast_matchers.py @@ -6,11 +6,8 @@ import collections import re import os +from urllib.request import urlopen -try: - from urllib.request import urlopen -except ImportError: - from urllib2 import urlopen CLASS_INDEX_PAGE_URL = "https://clang.llvm.org/doxygen/classes.html" try: From b9775a4d7a23074f762a5724b679a1b85e5e976a Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 31 Oct 2025 09:49:16 +0000 Subject: [PATCH 315/539] [clang][utils] Make CmpDriver Python3 compatible (#163740) The majority of this is running 2to3 on it: * print is a function in 3.x * next(it) instead of it.next() Then there was a use of "map(None, iterables..)" which in Python 2 was a way of saying "combine these iterables, and if one is shorter, pad with None". This no longer works in Python3, the equivalent is zip_longest: https://docs.python.org/3/library/itertools.html#itertools.zip_longest fillvalue defaults to None but I made it explicit since it may help someone debugging this script in future. (I doubt it has been used for a very long time) --- clang/utils/CmpDriver | 63 ++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/clang/utils/CmpDriver b/clang/utils/CmpDriver index 12ce7a3250f66..0732baa76d01c 100755 --- a/clang/utils/CmpDriver +++ b/clang/utils/CmpDriver @@ -5,6 +5,7 @@ A simple utility that compares tool invocations and exit codes issued by compiler drivers that support -### (e.g. gcc and clang). 
""" +from itertools import zip_longest import subprocess def splitArgs(s): @@ -22,7 +23,7 @@ def splitArgs(s): elif inQuote: if c == '\\': current += c - current += it.next() + current += next(it) else: current += c elif not c.isspace(): @@ -135,77 +136,77 @@ def main(): # Compare stdout. if infoA.stdout != infoB.stdout: - print '-- STDOUT DIFFERS -' - print 'A OUTPUT: ',infoA.stdout - print 'B OUTPUT: ',infoB.stdout - print + print('-- STDOUT DIFFERS -') + print('A OUTPUT: ',infoA.stdout) + print('B OUTPUT: ',infoB.stdout) + print() diff = ZipperDiff(infoA.stdout.split('\n'), infoB.stdout.split('\n')) for i,(aElt,bElt) in enumerate(diff.getDiffs()): if aElt is None: - print 'A missing: %s' % bElt + print('A missing: %s' % bElt) elif bElt is None: - print 'B missing: %s' % aElt + print('B missing: %s' % aElt) else: - print 'mismatch: A: %s' % aElt - print ' B: %s' % bElt + print('mismatch: A: %s' % aElt) + print(' B: %s' % bElt) differ = True # Compare stderr. if infoA.stderr != infoB.stderr: - print '-- STDERR DIFFERS -' - print 'A STDERR: ',infoA.stderr - print 'B STDERR: ',infoB.stderr - print + print('-- STDERR DIFFERS -') + print('A STDERR: ',infoA.stderr) + print('B STDERR: ',infoB.stderr) + print() diff = ZipperDiff(infoA.stderr.split('\n'), infoB.stderr.split('\n')) for i,(aElt,bElt) in enumerate(diff.getDiffs()): if aElt is None: - print 'A missing: %s' % bElt + print('A missing: %s' % bElt) elif bElt is None: - print 'B missing: %s' % aElt + print('B missing: %s' % aElt) else: - print 'mismatch: A: %s' % aElt - print ' B: %s' % bElt + print('mismatch: A: %s' % aElt) + print(' B: %s' % bElt) differ = True # Compare commands. 
- for i,(a,b) in enumerate(map(None, infoA.commands, infoB.commands)): + for i,(a,b) in enumerate(zip_longest(infoA.commands, infoB.commands, fillvalue=None)): if a is None: - print 'A MISSING:',' '.join(b) + print('A MISSING:',' '.join(b)) differ = True continue elif b is None: - print 'B MISSING:',' '.join(a) + print('B MISSING:',' '.join(a)) differ = True continue diff = DriverZipperDiff(a,b) diffs = list(diff.getDiffs()) if diffs: - print '-- COMMAND %d DIFFERS -' % i - print 'A COMMAND:',' '.join(a) - print 'B COMMAND:',' '.join(b) - print + print('-- COMMAND %d DIFFERS -' % i) + print('A COMMAND:',' '.join(a)) + print('B COMMAND:',' '.join(b)) + print() for i,(aElt,bElt) in enumerate(diffs): if aElt is None: - print 'A missing: %s' % bElt + print('A missing: %s' % bElt) elif bElt is None: - print 'B missing: %s' % aElt + print('B missing: %s' % aElt) else: - print 'mismatch: A: %s' % aElt - print ' B: %s' % bElt + print('mismatch: A: %s' % aElt) + print(' B: %s' % bElt) differ = True # Compare result codes. if infoA.exitCode != infoB.exitCode: - print '-- EXIT CODES DIFFER -' - print 'A: ',infoA.exitCode - print 'B: ',infoB.exitCode + print('-- EXIT CODES DIFFER -') + print('A: ',infoA.exitCode) + print('B: ',infoB.exitCode) differ = True if differ: From d71a302f46919ee429f4e1fa767c2361e9d8a29c Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 31 Oct 2025 09:50:14 +0000 Subject: [PATCH 316/539] [llvm][tools][opt-viewer] Remove Python2 compatability code in optrecord.py (#163744) LLVM requires Python >= 3.8. itervalues was unused so I have removed it. 
--- llvm/tools/opt-viewer/optrecord.py | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/llvm/tools/opt-viewer/optrecord.py b/llvm/tools/opt-viewer/optrecord.py index 8014204a64f45..d49ebae12cd5f 100644 --- a/llvm/tools/opt-viewer/optrecord.py +++ b/llvm/tools/opt-viewer/optrecord.py @@ -19,35 +19,14 @@ from multiprocessing import Lock import os, os.path import subprocess - -try: - # The previously builtin function `intern()` was moved - # to the `sys` module in Python 3. - from sys import intern -except: - pass - +from sys import intern import re import optpmap -try: - dict.iteritems -except AttributeError: - # Python 3 - def itervalues(d): - return iter(d.values()) - - def iteritems(d): - return iter(d.items()) - -else: - # Python 2 - def itervalues(d): - return d.itervalues() - - def iteritems(d): - return d.iteritems() + +def iteritems(d): + return iter(d.items()) def html_file_name(filename): From 84bb030d77c1f13adf2be77b044282b4827c1304 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 31 Oct 2025 10:11:15 +0000 Subject: [PATCH 317/539] [llvm][tools][opt-viewer] Put back missing function Fixes 35e1a2f0a375b4cdc809d2bab911fdb197284f55 itervalues is in fact used in opt-viewer.py. --- llvm/tools/opt-viewer/optrecord.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/tools/opt-viewer/optrecord.py b/llvm/tools/opt-viewer/optrecord.py index d49ebae12cd5f..b9244fd1ae739 100644 --- a/llvm/tools/opt-viewer/optrecord.py +++ b/llvm/tools/opt-viewer/optrecord.py @@ -25,6 +25,10 @@ import optpmap +def itervalues(d): + return iter(d.values()) + + def iteritems(d): return iter(d.items()) From 237df89d9281ee17aaf310d97255365e7ae16c5b Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Fri, 31 Oct 2025 10:18:13 +0000 Subject: [PATCH 318/539] [lldb][TypeSystem] Remove count parameter from TypeSystem::IsFloatingPointType (#165707) Similar motivation to https://github.com/llvm/llvm-project/pull/165702. 
It was unused in all callsites and inconsistent with other APIs like `IsIntegerType` (which doesn't take a `count` parameter). If we ever need a "how many elements does this type represent", we can implement one with a new TypeSystem API that does exactly that. Some callsites checked for `count == 1` previously, but I suspect what they intended to do is check for whether it's a vector type or complex type, before reading the FP register. I'm somewhat confident that's the case because the `TypeSystemClang::GetTypeInfo` currently incorrectly sets the integer and floating point bits for complex and vector types (will fix separately). But some architectures might choose to pass single-element vectors in scalar registers. I should probably changes these to check the vector element size. All the `count == 2 && is_complex` were redundant because `count == 2` iff `is_complex == true`. So I just removed the count check there. --- lldb/include/lldb/Symbol/CompilerType.h | 2 +- lldb/include/lldb/Symbol/TypeSystem.h | 2 +- lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp | 3 +-- lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp | 20 +++++++++---------- .../ABI/LoongArch/ABISysV_loongarch.cpp | 5 ++--- lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp | 10 ++++------ .../Plugins/ABI/Mips/ABISysV_mips64.cpp | 5 ++--- .../Plugins/ABI/PowerPC/ABISysV_ppc.cpp | 6 ++---- .../Plugins/ABI/PowerPC/ABISysV_ppc64.cpp | 3 +-- .../Plugins/ABI/RISCV/ABISysV_riscv.cpp | 5 ++--- .../Plugins/ABI/SystemZ/ABISysV_s390x.cpp | 3 +-- .../source/Plugins/ABI/X86/ABIMacOSX_i386.cpp | 3 +-- .../source/Plugins/ABI/X86/ABISysV_x86_64.cpp | 9 +++------ .../Plugins/ABI/X86/ABIWindows_x86_64.cpp | 9 +++------ .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 3 +-- .../TypeSystem/Clang/TypeSystemClang.cpp | 10 +++------- .../TypeSystem/Clang/TypeSystemClang.h | 2 +- lldb/source/Symbol/CompilerType.cpp | 9 +++------ 18 files changed, 42 insertions(+), 67 deletions(-) diff --git a/lldb/include/lldb/Symbol/CompilerType.h 
b/lldb/include/lldb/Symbol/CompilerType.h index 1fcf255123d9f..869c5076ee0a7 100644 --- a/lldb/include/lldb/Symbol/CompilerType.h +++ b/lldb/include/lldb/Symbol/CompilerType.h @@ -144,7 +144,7 @@ class CompilerType { bool IsDefined() const; - bool IsFloatingPointType(uint32_t &count, bool &is_complex) const; + bool IsFloatingPointType(bool &is_complex) const; bool IsFunctionType() const; diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h index 40a80d8d09286..25b208a65349b 100644 --- a/lldb/include/lldb/Symbol/TypeSystem.h +++ b/lldb/include/lldb/Symbol/TypeSystem.h @@ -163,7 +163,7 @@ class TypeSystem : public PluginInterface, virtual bool IsDefined(lldb::opaque_compiler_type_t type) = 0; virtual bool IsFloatingPointType(lldb::opaque_compiler_type_t type, - uint32_t &count, bool &is_complex) = 0; + bool &is_complex) = 0; virtual bool IsFunctionType(lldb::opaque_compiler_type_t type) = 0; diff --git a/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp b/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp index 5b5f6facc924c..8e690218843fa 100644 --- a/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp +++ b/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp @@ -1695,7 +1695,6 @@ Status ABIMacOSX_arm::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -1767,7 +1766,7 @@ Status ABIMacOSX_arm::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); diff --git a/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp b/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp index 
bb0c4ba3f1b57..7258f5cc9acb5 100644 --- a/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp +++ b/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp @@ -1550,7 +1550,6 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl( bool is_signed; bool is_complex; - uint32_t float_count; bool is_vfp_candidate = false; uint8_t vfp_count = 0; uint8_t vfp_byte_size = 0; @@ -1634,8 +1633,9 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl( if (!GetReturnValuePassedInMemory(thread, reg_ctx, *byte_size, value)) return return_valobj_sp; } - } else if (compiler_type.IsFloatingPointType(float_count, is_complex)) { - if (float_count == 1 && !is_complex) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { + // Vector types are handled above. + if (!is_complex) { switch (*bit_width) { default: return return_valobj_sp; @@ -1681,7 +1681,7 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl( break; } } - } else if (is_complex && float_count == 2) { + } else if (is_complex) { if (IsArmHardFloat(thread)) { is_vfp_candidate = true; vfp_byte_size = *byte_size / 2; @@ -1709,8 +1709,9 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl( vfp_count = (*base_byte_size == 8 ? homogeneous_count : homogeneous_count * 2); } - } else if (base_type.IsFloatingPointType(float_count, is_complex)) { - if (float_count == 1 && !is_complex) { + } else if (base_type.IsFloatingPointType(is_complex)) { + // Vector types are handled above. 
+ if (!is_complex) { is_vfp_candidate = true; if (base_byte_size) vfp_byte_size = *base_byte_size; @@ -1727,10 +1728,10 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl( base_type = compiler_type.GetFieldAtIndex(index, name, nullptr, nullptr, nullptr); - if (base_type.IsFloatingPointType(float_count, is_complex)) { + if (base_type.IsFloatingPointType(is_complex)) { std::optional base_byte_size = llvm::expectedToOptional(base_type.GetByteSize(&thread)); - if (float_count == 2 && is_complex) { + if (is_complex) { if (index != 0 && base_byte_size && vfp_byte_size != *base_byte_size) break; @@ -1841,7 +1842,6 @@ Status ABISysV_arm::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -1884,7 +1884,7 @@ Status ABISysV_arm::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); diff --git a/lldb/source/Plugins/ABI/LoongArch/ABISysV_loongarch.cpp b/lldb/source/Plugins/ABI/LoongArch/ABISysV_loongarch.cpp index 7bf99ce7bddee..4f5e29c0eaac7 100644 --- a/lldb/source/Plugins/ABI/LoongArch/ABISysV_loongarch.cpp +++ b/lldb/source/Plugins/ABI/LoongArch/ABISysV_loongarch.cpp @@ -510,11 +510,10 @@ ValueObjectSP ABISysV_loongarch::GetReturnValueObjectSimple( value, ConstString("")); } if (type_flags & eTypeIsFloat) { - uint32_t float_count = 0; bool is_complex = false; - if (compiler_type.IsFloatingPointType(float_count, is_complex) && - float_count == 1 && !is_complex) { + if (compiler_type.IsFloatingPointType(is_complex) && + !(type_flags & eTypeIsVector) && !is_complex) { return_valobj_sp = 
GetValObjFromFPRegs(thread, reg_ctx, machine, type_flags, byte_size); return return_valobj_sp; diff --git a/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp b/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp index dd91a05534e37..e03604467ceec 100644 --- a/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp +++ b/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp @@ -708,7 +708,6 @@ Status ABISysV_mips::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -750,7 +749,7 @@ Status ABISysV_mips::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); @@ -797,7 +796,6 @@ ValueObjectSP ABISysV_mips::GetReturnValueObjectImpl( bool is_signed = false; bool is_complex = false; - uint32_t count = 0; // In MIPS register "r2" (v0) holds the integer function return values const RegisterInfo *r2_reg_info = reg_ctx->GetRegisterInfoByName("r2", 0); @@ -860,10 +858,10 @@ ValueObjectSP ABISysV_mips::GetReturnValueObjectImpl( return_valobj_sp = ValueObjectMemory::Create( &thread, "", Address(mem_address, nullptr), return_compiler_type); return return_valobj_sp; - } else if (return_compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (return_compiler_type.IsFloatingPointType(is_complex)) { if (IsSoftFloat(fp_flag)) { uint64_t raw_value = reg_ctx->ReadRegisterAsUnsigned(r2_reg_info, 0); - if (count != 1 && is_complex) + if (is_complex) return return_valobj_sp; switch (*bit_width) { default: @@ -896,7 +894,7 @@ ValueObjectSP ABISysV_mips::GetReturnValueObjectImpl( f0_value.GetData(f0_data); lldb::offset_t offset = 0; - if 
(count == 1 && !is_complex) { + if (!return_compiler_type.IsVectorType() && !is_complex) { switch (*bit_width) { default: return return_valobj_sp; diff --git a/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp b/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp index baefbfc363d99..0dd9db0948220 100644 --- a/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp +++ b/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp @@ -923,7 +923,6 @@ ValueObjectSP ABISysV_mips64::GetReturnValueObjectImpl( bool sucess = false; std::string name; bool is_complex; - uint32_t count; const uint32_t num_children = return_compiler_type.GetNumFields(); // A structure consisting of one or two FP values (and nothing else) will @@ -937,7 +936,7 @@ ValueObjectSP ABISysV_mips64::GetReturnValueObjectImpl( return_compiler_type.GetFieldAtIndex(idx, name, &field_bit_offset, nullptr, nullptr); - if (field_compiler_type.IsFloatingPointType(count, is_complex)) + if (field_compiler_type.IsFloatingPointType(is_complex)) use_fp_regs = true; else found_non_fp_field = true; @@ -1044,7 +1043,7 @@ ValueObjectSP ABISysV_mips64::GetReturnValueObjectImpl( if (field_compiler_type.IsIntegerOrEnumerationType(is_signed) || field_compiler_type.IsPointerType() || - field_compiler_type.IsFloatingPointType(count, is_complex)) { + field_compiler_type.IsFloatingPointType(is_complex)) { padding = field_byte_offset - integer_bytes; if (integer_bytes < 8) { diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp index e4bdc44c59c10..0d25faef1c659 100644 --- a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp +++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp @@ -426,7 +426,6 @@ Status ABISysV_ppc::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -454,7 +453,7 @@ Status 
ABISysV_ppc::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); @@ -695,7 +694,6 @@ ValueObjectSP ABISysV_ppc::GetReturnValueObjectImpl( uint64_t field_bit_offset = 0; bool is_signed; bool is_complex; - uint32_t count; CompilerType field_compiler_type = return_compiler_type.GetFieldAtIndex( idx, name, &field_bit_offset, nullptr, nullptr); @@ -741,7 +739,7 @@ ValueObjectSP ABISysV_ppc::GetReturnValueObjectImpl( // return a nullptr return value object. return return_valobj_sp; } - } else if (field_compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (field_compiler_type.IsFloatingPointType(is_complex)) { // Structs with long doubles are always passed in memory. 
if (*field_bit_width == 128) { is_memory = true; diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp index f5327a1f403c0..63357618774d4 100644 --- a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp +++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp @@ -309,7 +309,6 @@ Status ABISysV_ppc64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -339,7 +338,7 @@ Status ABISysV_ppc64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); diff --git a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp index 822c93dbbec3d..53f11b55427aa 100644 --- a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp +++ b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp @@ -643,11 +643,10 @@ ABISysV_riscv::GetReturnValueObjectSimple(Thread &thread, } // Floating point return type. 
else if (type_flags & eTypeIsFloat) { - uint32_t float_count = 0; bool is_complex = false; - if (compiler_type.IsFloatingPointType(float_count, is_complex) && - float_count == 1 && !is_complex) { + if (compiler_type.IsFloatingPointType(is_complex) && + !(type_flags & eTypeIsVector) && !is_complex) { const uint32_t arch_fp_flags = arch.GetFlags() & ArchSpec::eRISCV_float_abi_mask; return_valobj_sp = GetValObjFromFPRegs( diff --git a/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp b/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp index 5e52b6e4db499..301c3b309ffd5 100644 --- a/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp +++ b/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp @@ -393,7 +393,6 @@ Status ABISysV_s390x::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -423,7 +422,7 @@ Status ABISysV_s390x::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); diff --git a/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp b/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp index eaeed6c04590c..ee79abe55ead0 100644 --- a/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp +++ b/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp @@ -198,7 +198,6 @@ Status ABIMacOSX_i386::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -240,7 +239,7 @@ Status ABIMacOSX_i386::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer 
than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); diff --git a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp index effb3de8215d6..29fd9f0eceb93 100644 --- a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp +++ b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp @@ -307,7 +307,6 @@ Status ABISysV_x86_64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -337,7 +336,7 @@ Status ABISysV_x86_64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); @@ -587,7 +586,6 @@ static bool FlattenAggregateType( for (uint32_t idx = 0; idx < num_children; ++idx) { std::string name; bool is_signed; - uint32_t count; bool is_complex; uint64_t field_bit_offset = 0; @@ -606,7 +604,7 @@ static bool FlattenAggregateType( const uint32_t field_type_flags = field_compiler_type.GetTypeInfo(); if (field_compiler_type.IsIntegerOrEnumerationType(is_signed) || field_compiler_type.IsPointerType() || - field_compiler_type.IsFloatingPointType(count, is_complex)) { + field_compiler_type.IsFloatingPointType(is_complex)) { aggregate_field_offsets.push_back(field_byte_offset); aggregate_compiler_types.push_back(field_compiler_type); } else if (field_type_flags & eTypeHasChildren) { @@ -696,7 +694,6 @@ ValueObjectSP ABISysV_x86_64::GetReturnValueObjectImpl( 
is_memory = false; for (uint32_t idx = 0; idx < num_children; idx++) { bool is_signed; - uint32_t count; bool is_complex; CompilerType field_compiler_type = aggregate_compiler_types[idx]; @@ -736,7 +733,7 @@ ValueObjectSP ABISysV_x86_64::GetReturnValueObjectImpl( // return a nullptr return value object. return return_valobj_sp; } - } else if (field_compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (field_compiler_type.IsFloatingPointType(is_complex)) { // Structs with long doubles are always passed in memory. if (field_bit_width == 128) { is_memory = true; diff --git a/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp b/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp index 339012cffb688..6520af2f643ee 100644 --- a/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp +++ b/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp @@ -312,7 +312,6 @@ Status ABIWindows_x86_64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -342,7 +341,7 @@ Status ABIWindows_x86_64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); @@ -558,7 +557,6 @@ static bool FlattenAggregateType( for (uint32_t idx = 0; idx < num_children; ++idx) { std::string name; bool is_signed; - uint32_t count; bool is_complex; uint64_t field_bit_offset = 0; @@ -582,7 +580,7 @@ static bool FlattenAggregateType( const uint32_t field_type_flags = field_compiler_type.GetTypeInfo(); if (field_compiler_type.IsIntegerOrEnumerationType(is_signed) || field_compiler_type.IsPointerType() || - 
field_compiler_type.IsFloatingPointType(count, is_complex)) { + field_compiler_type.IsFloatingPointType(is_complex)) { aggregate_field_offsets.push_back(field_byte_offset); aggregate_compiler_types.push_back(field_compiler_type); } else if (field_type_flags & eTypeHasChildren) { @@ -672,7 +670,6 @@ ValueObjectSP ABIWindows_x86_64::GetReturnValueObjectImpl( for (uint32_t idx = 0; idx < num_children; idx++) { bool is_signed; bool is_complex; - uint32_t count; CompilerType field_compiler_type = aggregate_compiler_types[idx]; uint32_t field_byte_width = @@ -691,7 +688,7 @@ ValueObjectSP ABIWindows_x86_64::GetReturnValueObjectImpl( uint32_t copy_from_offset = 0; if (field_compiler_type.IsIntegerOrEnumerationType(is_signed) || field_compiler_type.IsPointerType() || - field_compiler_type.IsFloatingPointType(count, is_complex)) { + field_compiler_type.IsFloatingPointType(is_complex)) { copy_from_extractor = &rax_data; copy_from_offset = used_bytes; used_bytes += field_byte_width; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index c049829f37219..47fa27b0a81a7 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -2047,11 +2047,10 @@ static std::optional MakeAPValue(const clang::ASTContext &ast, if (is_integral) return clang::APValue(apint); - uint32_t count; bool is_complex; // FIXME: we currently support a limited set of floating point types. // E.g., 16-bit floats are not supported. 
- if (!clang_type.IsFloatingPointType(count, is_complex)) + if (!clang_type.IsFloatingPointType(is_complex)) return std::nullopt; return clang::APValue(llvm::APFloat( diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 4ec987c8d0103..67186542fb705 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -3488,7 +3488,7 @@ bool TypeSystemClang::IsReferenceType(lldb::opaque_compiler_type_t type, } bool TypeSystemClang::IsFloatingPointType(lldb::opaque_compiler_type_t type, - uint32_t &count, bool &is_complex) { + bool &is_complex) { if (type) { clang::QualType qual_type(GetCanonicalQualType(type)); @@ -3497,30 +3497,26 @@ bool TypeSystemClang::IsFloatingPointType(lldb::opaque_compiler_type_t type, clang::BuiltinType::Kind kind = BT->getKind(); if (kind >= clang::BuiltinType::Float && kind <= clang::BuiltinType::LongDouble) { - count = 1; is_complex = false; return true; } } else if (const clang::ComplexType *CT = llvm::dyn_cast( qual_type->getCanonicalTypeInternal())) { - if (IsFloatingPointType(CT->getElementType().getAsOpaquePtr(), count, + if (IsFloatingPointType(CT->getElementType().getAsOpaquePtr(), is_complex)) { - count = 2; is_complex = true; return true; } } else if (const clang::VectorType *VT = llvm::dyn_cast( qual_type->getCanonicalTypeInternal())) { - if (IsFloatingPointType(VT->getElementType().getAsOpaquePtr(), count, + if (IsFloatingPointType(VT->getElementType().getAsOpaquePtr(), is_complex)) { - count = VT->getNumElements(); is_complex = false; return true; } } } - count = 0; is_complex = false; return false; } diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h index 11107c0fea4f6..375891b3cfd2f 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h +++ 
b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h @@ -651,7 +651,7 @@ class TypeSystemClang : public TypeSystem { bool IsDefined(lldb::opaque_compiler_type_t type) override; - bool IsFloatingPointType(lldb::opaque_compiler_type_t type, uint32_t &count, + bool IsFloatingPointType(lldb::opaque_compiler_type_t type, bool &is_complex) override; unsigned GetPtrAuthKey(lldb::opaque_compiler_type_t type) override; diff --git a/lldb/source/Symbol/CompilerType.cpp b/lldb/source/Symbol/CompilerType.cpp index 73da3127a98a3..c999ab256fc98 100644 --- a/lldb/source/Symbol/CompilerType.cpp +++ b/lldb/source/Symbol/CompilerType.cpp @@ -240,13 +240,11 @@ bool CompilerType::ShouldTreatScalarValueAsAddress() const { return false; } -bool CompilerType::IsFloatingPointType(uint32_t &count, - bool &is_complex) const { +bool CompilerType::IsFloatingPointType(bool &is_complex) const { if (IsValid()) { if (auto type_system_sp = GetTypeSystem()) - return type_system_sp->IsFloatingPointType(m_type, count, is_complex); + return type_system_sp->IsFloatingPointType(m_type, is_complex); } - count = 0; is_complex = false; return false; } @@ -331,9 +329,8 @@ bool CompilerType::IsInteger() const { } bool CompilerType::IsFloat() const { - uint32_t count = 0; bool is_complex = false; - return IsFloatingPointType(count, is_complex); + return IsFloatingPointType(is_complex); } bool CompilerType::IsEnumerationType() const { From 5560116e0434b71057eb58bcf4bb5fea7d73939d Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 31 Oct 2025 10:19:10 +0000 Subject: [PATCH 319/539] [clang][tools][scan-view] Remove Python2 compatibility code in ScanView.py (#163747) All these modules got new names or were moved around in Python 3.0: https://docs.python.org/3/whatsnew/3.0.html#library-changes LLVM requires Python >= 3.8 so we don't need to try Python2 naming. 
--- clang/tools/scan-view/share/ScanView.py | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/clang/tools/scan-view/share/ScanView.py b/clang/tools/scan-view/share/ScanView.py index c395b9590e0ee..9c110130315ad 100644 --- a/clang/tools/scan-view/share/ScanView.py +++ b/clang/tools/scan-view/share/ScanView.py @@ -1,35 +1,18 @@ -from __future__ import print_function - -try: - from http.server import HTTPServer, SimpleHTTPRequestHandler -except ImportError: - from BaseHTTPServer import HTTPServer - from SimpleHTTPServer import SimpleHTTPRequestHandler +from http.server import HTTPServer, SimpleHTTPRequestHandler import os import sys - -try: - from urlparse import urlparse - from urllib import unquote -except ImportError: - from urllib.parse import urlparse, unquote - +from urllib.parse import urlparse, unquote import posixpath - -if sys.version_info.major >= 3: - from io import StringIO, BytesIO -else: - from io import BytesIO, BytesIO as StringIO - +from io import StringIO, BytesIO import re import shutil import threading import time import socket import itertools +import configparser import Reporter -import configparser ### # Various patterns matched or replaced by server. 
From c0e9ffaff2f8fcedefc239d5e7a98f09caccfb05 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Fri, 31 Oct 2025 10:23:03 +0000 Subject: [PATCH 320/539] [NFCI] Address post-merge review of #162503 (#165582) --- .../vplan-printing-reductions.ll | 488 ++++-------------- 1 file changed, 113 insertions(+), 375 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll index 291ada86cf797..ef678ff759943 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -804,9 +804,9 @@ exit: define i32 @print_mulacc_extended_const(ptr %start, ptr %end) { ; CHECK-LABEL: 'print_mulacc_extended_const' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<%0> = VF -; CHECK-NEXT: Live-in vp<%1> = VF * UF -; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<%3> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -814,107 +814,84 @@ define i32 @print_mulacc_extended_const(ptr %start, ptr %end) { ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> -; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1> +; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1> ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> -; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> -; CHECK-NEXT: EMIT 
vp<%next.gep> = ptradd ir<%start>, vp<%7> -; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> -; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> -; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul (ir<%l> zext to i32), (ir<63> zext to i32)) -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul (ir<%l> zext to i32), (ir<63> zext to i32)) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<[[RDX]]>, vp<[[RDX_NEXT]]> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb ] -; CHECK-NEXT: 
Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) -; CHECK-NEXT: IR %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) -; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 -; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 -; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 -; CHECK-NEXT: IR %red.next = add i32 %red, %mul -; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 -; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { -; CHECK-NEXT: Live-in ir<%1> = original trip-count +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = zext i8 %l to i32 + %mul = mul i32 %l.ext, 63 + %red.next = add i32 %red, %mul + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} + +define i32 @print_mulacc_extended_const_lhs(ptr %start, ptr %end) { +; CHECK-LABEL: 'print_mulacc_extended_const_lhs' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: vp<%3> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 -; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 -; CHECK-NEXT: IR %0 = add i64 %end1, 1 -; CHECK-NEXT: IR %1 = sub i64 %0, %start2 -; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> -; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> -; CHECK-NEXT: Successor(s): ir-bb, vector.ph +; CHECK-NEXT: EMIT vp<%3> = 
EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> -; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> -; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> -; CHECK-NEXT: Successor(s): vector.body -; CHECK-EMPTY: -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> -; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> -; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> -; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = zext ir<%l> to i32 -; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<63> -; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>) -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> -; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-NEXT: vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1> +; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = zext ir<%l> to i32 +; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul ir<63>, ir<%l.ext>) +; CHECK-NEXT: EMIT 
vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<[[RDX]]>, vp<[[RDX_NEXT]]> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb, ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block) -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb ] -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb) -; CHECK-NEXT: IR %red = phi i32 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb) -; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 -; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 -; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 -; CHECK-NEXT: IR %red.next = add i32 %red, %mul -; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 -; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end -; CHECK-NEXT: No successors -; CHECK-NEXT: } entry: br label %loop @@ -923,7 +900,7 @@ loop: %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] %l = load i8, ptr %ptr.iv, align 1 %l.ext = zext i8 %l to i32 - %mul = mul i32 %l.ext, 63 + %mul = mul i32 63, %l.ext %red.next = add i32 %red, %mul %gep.iv.next = getelementptr i8, ptr 
%ptr.iv, i64 1 %ec = icmp eq ptr %ptr.iv, %end @@ -937,9 +914,9 @@ exit: define i32 @print_mulacc_not_extended_const(ptr %start, ptr %end) { ; CHECK-LABEL: 'print_mulacc_not_extended_const' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<%0> = VF -; CHECK-NEXT: Live-in vp<%1> = VF * UF -; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<%3> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -947,108 +924,30 @@ define i32 @print_mulacc_not_extended_const(ptr %start, ptr %end) { ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> -; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1> +; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1> ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> -; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> -; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> -; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> -; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> ; 
CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 -; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul ir<%l.ext>, ir<128>) -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul ir<%l.ext>, ir<128>) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<[[RDX:%.+]]>, vp<[[RDX_NEXT]]> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb ] -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) -; CHECK-NEXT: IR %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) -; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 -; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 -; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 -; CHECK-NEXT: IR %red.next = add i32 %red, %mul -; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 -; 
CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { -; CHECK-NEXT: Live-in ir<%1> = original trip-count -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 -; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 -; CHECK-NEXT: IR %0 = add i64 %end1, 1 -; CHECK-NEXT: IR %1 = sub i64 %0, %start2 -; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> -; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> -; CHECK-NEXT: Successor(s): ir-bb, vector.ph -; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: -; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> -; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> -; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> -; CHECK-NEXT: Successor(s): vector.body -; CHECK-EMPTY: -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> -; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> -; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> -; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 -; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128> -; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>) -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> -; CHECK-NEXT: Successor(s): middle.block, vector.body -; CHECK-EMPTY: -; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> -; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb, ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block) -; CHECK-NEXT: No successors -; 
CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb ] -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb) -; CHECK-NEXT: IR %red = phi i32 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb) -; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 -; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 -; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 -; CHECK-NEXT: IR %red.next = add i32 %red, %mul -; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 -; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end -; CHECK-NEXT: No successors -; CHECK-NEXT: } entry: br label %loop @@ -1071,9 +970,9 @@ exit: define i64 @print_ext_mulacc_extended_const(ptr %start, ptr %end) { ; CHECK-LABEL: 'print_ext_mulacc_extended_const' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<%0> = VF -; CHECK-NEXT: Live-in vp<%1> = VF * UF -; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<%3> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -1081,109 +980,29 @@ define i64 @print_ext_mulacc_extended_const(ptr %start, ptr %end) { ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> -; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1> +; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1> ; CHECK-NEXT: Successor(s): 
vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> -; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> -; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> -; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> -; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> -; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul (ir<%l> zext to i64), (ir<63> zext to i64)) -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul (ir<%l> zext to i64), (ir<63> zext to i64)) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<[[RDX]]>, vp<[[RDX_NEXT]]> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%11> 
from middle.block) -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb ] -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) -; CHECK-NEXT: IR %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) -; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 -; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 -; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 -; CHECK-NEXT: IR %mul.ext = zext i32 %mul to i64 -; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext -; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 -; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { -; CHECK-NEXT: Live-in ir<%1> = original trip-count -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 -; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 -; CHECK-NEXT: IR %0 = add i64 %end1, 1 -; CHECK-NEXT: IR %1 = sub i64 %0, %start2 -; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> -; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> -; CHECK-NEXT: Successor(s): ir-bb, vector.ph -; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: -; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> -; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> -; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> -; CHECK-NEXT: Successor(s): vector.body -; CHECK-EMPTY: -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> -; 
CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> -; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> -; CHECK-NEXT: WIDEN-CAST vp<%4> = zext ir<%l> to i64 -; CHECK-NEXT: WIDEN ir<%mul> = mul vp<%4>, ir<63> -; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>) -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> -; CHECK-NEXT: Successor(s): middle.block, vector.body -; CHECK-EMPTY: -; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%6> = compute-reduction-result ir<%red>, ir<%red.next> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> -; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb, ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%6> from middle.block) -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%6>, middle.block ], [ ir<0>, ir-bb ] -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb) -; CHECK-NEXT: IR %red = phi i64 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb) -; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 -; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 -; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 -; CHECK-NEXT: IR %mul.ext = zext i32 %mul to i64 -; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext -; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 -; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end -; CHECK-NEXT: No successors -; CHECK-NEXT: } entry: br label %loop @@ -1207,9 +1026,9 @@ exit: define i64 @print_ext_mulacc_not_extended_const(ptr %start, ptr 
%end) { ; CHECK-LABEL: 'print_ext_mulacc_not_extended_const' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<%0> = VF -; CHECK-NEXT: Live-in vp<%1> = VF * UF -; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<%3> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -1217,112 +1036,31 @@ define i64 @print_ext_mulacc_not_extended_const(ptr %start, ptr %end) { ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> -; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1> +; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1> ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> -; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> -; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> -; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> -; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128> -; CHECK-NEXT: 
EXPRESSION vp<%9> = ir<%red> + reduce.add (ir<%mul> sext to i64) -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (ir<%mul> sext to i64) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> -; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb ] -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) -; CHECK-NEXT: IR %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) -; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 -; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 -; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 -; CHECK-NEXT: IR %mul.ext = sext i32 %mul to i64 -; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext -; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 -; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { -; CHECK-NEXT: Live-in ir<%1> = 
original trip-count -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 -; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 -; CHECK-NEXT: IR %0 = add i64 %end1, 1 -; CHECK-NEXT: IR %1 = sub i64 %0, %start2 -; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> -; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> -; CHECK-NEXT: Successor(s): ir-bb, vector.ph -; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: -; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> -; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> -; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> -; CHECK-NEXT: Successor(s): vector.body -; CHECK-EMPTY: -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> -; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> -; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> -; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 -; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128> -; CHECK-NEXT: WIDEN-CAST ir<%mul.ext> = sext ir<%mul> to i64 -; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul.ext>) -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> -; CHECK-NEXT: Successor(s): middle.block, vector.body -; CHECK-EMPTY: -; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<[[RDX]]>, vp<[[RDX_NEXT]]> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb, ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%5> from 
middle.block) -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb ] -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb) -; CHECK-NEXT: IR %red = phi i64 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb) -; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 -; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 -; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 -; CHECK-NEXT: IR %mul.ext = sext i32 %mul to i64 -; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext -; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 -; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end -; CHECK-NEXT: No successors -; CHECK-NEXT: } entry: br label %loop From db3deea0770be84d4401939d3df37aa98f5f9310 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Fri, 31 Oct 2025 10:25:58 +0000 Subject: [PATCH 321/539] [llvm][DebugInfo][ObjC] Make sure we link backing ivars to their DW_TAG_APPLE_property (#165409) Depends on: * https://github.com/llvm/llvm-project/pull/165373 When an Objective-C property has a backing ivar, we would previously not add a `DW_AT_APPLE_property` to the ivar's `DW_TAG_member`. This is what was intended based on the [Objective-C DebugInfo docs](https://github.com/llvm/llvm-project/blob/main/llvm/docs/SourceLevelDebugging.rst#proposal) but is not what LLVM currently generates. LLDB currently doesn't ever try linking the `ObjCPropertyDecl`s to their `ObjCIvarDecl`s, but if we wanted to, this debug-info patch is a pre-requisite. 
--- llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 2 +- llvm/test/DebugInfo/Generic/objc-property.ll | 26 ++++++++++++-------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 555c56fd322bb..b16e131529ac3 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -1120,7 +1120,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { constructMemberDIE(Buffer, DDTy); } } else if (auto *Property = dyn_cast(Element)) { - DIE &ElemDie = createAndAddDIE(Property->getTag(), Buffer); + DIE &ElemDie = createAndAddDIE(Property->getTag(), Buffer, Property); StringRef PropertyName = Property->getName(); addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName); if (Property->getType()) diff --git a/llvm/test/DebugInfo/Generic/objc-property.ll b/llvm/test/DebugInfo/Generic/objc-property.ll index 007d1fe698b30..1ee792941bcbb 100644 --- a/llvm/test/DebugInfo/Generic/objc-property.ll +++ b/llvm/test/DebugInfo/Generic/objc-property.ll @@ -5,33 +5,33 @@ ; CHECK: DW_TAG_structure_type ; CHECK: DW_AT_name ("Foo") ; -; CHECK: DW_TAG_APPLE_property +; CHECK: 0x[[AUTO_SYNTH:[0-9a-f]+]]: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("autoSynthProp") ; CHECK: DW_AT_APPLE_property_attribute ; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; CHECK: DW_TAG_APPLE_property +; CHECK: 0x[[SYNTH:[0-9a-f]+]]: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("synthProp") ; CHECK: DW_AT_APPLE_property_attribute ; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; CHECK: DW_TAG_APPLE_property +; CHECK: 0x[[GET:[0-9a-f]+]]: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("customGetterProp") ; 
CHECK: DW_AT_APPLE_property_getter ("customGetter") ; CHECK: DW_AT_APPLE_property_attribute ; CHECK-SAME: DW_APPLE_PROPERTY_getter, DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; CHECK: DW_TAG_APPLE_property +; CHECK: 0x[[SET:[0-9a-f]+]]: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("customSetterProp") ; CHECK: DW_AT_APPLE_property_setter ("customSetter:") ; CHECK: DW_AT_APPLE_property_attribute ; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_setter, DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; CHECK: DW_TAG_APPLE_property +; CHECK: 0x[[ACCESSORS:[0-9a-f]+]]: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("customAccessorsProp") ; CHECK: DW_AT_APPLE_property_getter ("customGetter") ; CHECK: DW_AT_APPLE_property_setter ("customSetter:") @@ -39,15 +39,21 @@ ; CHECK-SAME: DW_APPLE_PROPERTY_getter, DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_setter, DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; FIXME: missing link between DW_TAG_member and the associated DW_TAG_APPLE_property ; CHECK: DW_TAG_member -; CHECK-NOT: DW_AT_APPLE_property +; CHECK: DW_AT_name ("someBackingIvar") +; CHECK: DW_AT_APPLE_property (0x[[SYNTH]] "synthProp") +; ; CHECK: DW_TAG_member -; CHECK-NOT: DW_AT_APPLE_property +; CHECK: DW_AT_name ("_autoSynthProp") +; CHECK: DW_AT_APPLE_property (0x[[AUTO_SYNTH]] "autoSynthProp") +; ; CHECK: DW_TAG_member -; CHECK-NOT: DW_AT_APPLE_property +; CHECK: DW_AT_name ("_customGetterProp") +; CHECK: DW_AT_APPLE_property (0x[[GET]] "customGetterProp") +; ; CHECK: DW_TAG_member -; CHECK-NOT: DW_AT_APPLE_property +; CHECK: DW_AT_name ("_customSetterProp") +; CHECK: DW_AT_APPLE_property (0x[[SET]] "customSetterProp") !llvm.module.flags = !{!0, !1} !llvm.dbg.cu = !{!2} From ba6980b50b320d5a6b3e14c2de0426b7d5dec7cb Mon Sep 
17 00:00:00 2001 From: Fabian Ritter Date: Fri, 31 Oct 2025 11:27:55 +0100 Subject: [PATCH 322/539] [SDAG] Set InBounds when when computing offsets into memory objects (#165425) When a load or store accesses N bytes starting from a pointer P, and we want to compute an offset pointer within these N bytes after P, we know that the arithmetic to add the offset must be inbounds. This is for example relevant when legalizing too-wide memory accesses, when lowering memcpy&Co., or when optimizing "vector-load -> extractelement" into an offset load. For SWDEV-516125. --- llvm/include/llvm/CodeGen/SelectionDAG.h | 12 +++++--- llvm/include/llvm/CodeGen/TargetLowering.h | 30 +++++++++++++++---- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 +++- .../CodeGen/SelectionDAG/TargetLowering.cpp | 25 +++++++++------- 4 files changed, 50 insertions(+), 22 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index df6ce0fe1b037..1a5ffb38f2568 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1113,7 +1113,8 @@ class SelectionDAG { SDValue Mask, SDValue EVL); /// Returns sum of the base pointer and offset. - /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap by default. + /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap and InBounds by + /// default. LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags = SDNodeFlags()); @@ -1123,15 +1124,18 @@ class SelectionDAG { /// Create an add instruction with appropriate flags when used for /// addressing some offset of an object. i.e. if a load is split into multiple - /// components, create an add nuw from the base pointer to the offset. + /// components, create an add nuw (or ptradd nuw inbounds) from the base + /// pointer to the offset. 
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset) { - return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap); + return getMemBasePlusOffset( + Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds); } SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, SDValue Offset) { // The object itself can't wrap around the address space, so it shouldn't be // possible for the adds of the offsets to the split parts to overflow. - return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap); + return getMemBasePlusOffset( + Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds); } /// Return a new CALLSEQ_START node, that starts new call frame, in which diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 1920b98c8a1ef..78f63b4406eb0 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5649,17 +5649,35 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase { /// Get a pointer to vector element \p Idx located in memory for a vector of /// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of /// bounds the returned pointer is unspecified, but will be within the vector - /// bounds. - SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, - SDValue Index) const; + /// bounds. \p PtrArithFlags can be used to mark that arithmetic within the + /// vector in memory is known to not wrap or to be inbounds. + SDValue getVectorElementPointer( + SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index, + const SDNodeFlags PtrArithFlags = SDNodeFlags()) const; + + /// Get a pointer to vector element \p Idx located in memory for a vector of + /// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of + /// bounds the returned pointer is unspecified, but will be within the vector + /// bounds. 
\p VecPtr is guaranteed to point to the beginning of a memory + /// location large enough for the vector. + SDValue getInboundsVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, + EVT VecVT, SDValue Index) const { + return getVectorElementPointer(DAG, VecPtr, VecVT, Index, + SDNodeFlags::NoUnsignedWrap | + SDNodeFlags::InBounds); + } /// Get a pointer to a sub-vector of type \p SubVecVT at index \p Idx located /// in memory for a vector of type \p VecVT starting at a base address of /// \p VecPtr. If \p Idx plus the size of \p SubVecVT is out of bounds the /// returned pointer is unspecified, but the value returned will be such that - /// the entire subvector would be within the vector bounds. - SDValue getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, - EVT SubVecVT, SDValue Index) const; + /// the entire subvector would be within the vector bounds. \p PtrArithFlags + /// can be used to mark that arithmetic within the vector in memory is known + /// to not wrap or to be inbounds. + SDValue + getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, + EVT SubVecVT, SDValue Index, + const SDNodeFlags PtrArithFlags = SDNodeFlags()) const; /// Method for building the DAG expansion of ISD::[US][MIN|MAX]. This /// method accepts integers as its arguments. diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 893556bd85240..bdd6bf025b645 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -22760,7 +22760,10 @@ SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) { NewPtr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(COffset), DL); PointerInfo = ST->getPointerInfo().getWithOffset(COffset); } else { - NewPtr = TLI.getVectorElementPointer(DAG, Ptr, Value.getValueType(), Idx); + // The original DAG loaded the entire vector from memory, so arithmetic + // within it must be inbounds. 
+ NewPtr = TLI.getInboundsVectorElementPointer(DAG, Ptr, Value.getValueType(), + Idx); } return DAG.getStore(Chain, DL, Elt, NewPtr, PointerInfo, ST->getAlign(), diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index da4e40953b39a..9bdf82210fed1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -10668,19 +10668,20 @@ static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, SDValue Idx, DAG.getConstant(MaxIndex, dl, IdxVT)); } -SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG, - SDValue VecPtr, EVT VecVT, - SDValue Index) const { +SDValue +TargetLowering::getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, + EVT VecVT, SDValue Index, + const SDNodeFlags PtrArithFlags) const { return getVectorSubVecPointer( DAG, VecPtr, VecVT, EVT::getVectorVT(*DAG.getContext(), VecVT.getVectorElementType(), 1), - Index); + Index, PtrArithFlags); } -SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, - SDValue VecPtr, EVT VecVT, - EVT SubVecVT, - SDValue Index) const { +SDValue +TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, + EVT VecVT, EVT SubVecVT, SDValue Index, + const SDNodeFlags PtrArithFlags) const { SDLoc dl(Index); // Make sure the index type is big enough to compute in. 
Index = DAG.getZExtOrTrunc(Index, dl, VecPtr.getValueType()); @@ -10704,7 +10705,7 @@ SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index, DAG.getConstant(EltSize, dl, IdxVT)); - return DAG.getMemBasePlusOffset(VecPtr, Index, dl); + return DAG.getMemBasePlusOffset(VecPtr, Index, dl, PtrArithFlags); } //===----------------------------------------------------------------------===// @@ -12382,8 +12383,10 @@ SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT, !IsFast) return SDValue(); - SDValue NewPtr = - getVectorElementPointer(DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo); + // The original DAG loaded the entire vector from memory, so arithmetic + // within it must be inbounds. + SDValue NewPtr = getInboundsVectorElementPointer( + DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo); // We are replacing a vector load with a scalar load. The new load must have // identical memory op ordering to the original. From d492e6ecbf5755278fb815a41b8363196795356d Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Fri, 31 Oct 2025 11:14:08 +0000 Subject: [PATCH 323/539] [LLVM][ConstantFolding] Extend constantFoldVectorReduce to include scalable vectors. 
(#165437) --- llvm/lib/Analysis/ConstantFolding.cpp | 44 ++++++-- .../InstSimplify/ConstProp/vecreduce.ll | 104 +++++++++--------- 2 files changed, 86 insertions(+), 62 deletions(-) diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index e9e2e7d0316c7..da32542cf7870 100755 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -2163,18 +2163,42 @@ Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), } Constant *constantFoldVectorReduce(Intrinsic::ID IID, Constant *Op) { - FixedVectorType *VT = dyn_cast(Op->getType()); - if (!VT) - return nullptr; - - // This isn't strictly necessary, but handle the special/common case of zero: - // all integer reductions of a zero input produce zero. - if (isa(Op)) - return ConstantInt::get(VT->getElementType(), 0); + auto *OpVT = cast(Op->getType()); // This is the same as the underlying binops - poison propagates. - if (isa(Op) || Op->containsPoisonElement()) - return PoisonValue::get(VT->getElementType()); + if (Op->containsPoisonElement()) + return PoisonValue::get(OpVT->getElementType()); + + // Shortcut non-accumulating reductions. 
+ if (Constant *SplatVal = Op->getSplatValue()) { + switch (IID) { + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_umax: + return SplatVal; + case Intrinsic::vector_reduce_add: + if (SplatVal->isNullValue()) + return SplatVal; + break; + case Intrinsic::vector_reduce_mul: + if (SplatVal->isNullValue() || SplatVal->isOneValue()) + return SplatVal; + break; + case Intrinsic::vector_reduce_xor: + if (SplatVal->isNullValue()) + return SplatVal; + if (OpVT->getElementCount().isKnownMultipleOf(2)) + return Constant::getNullValue(OpVT->getElementType()); + break; + } + } + + FixedVectorType *VT = dyn_cast(OpVT); + if (!VT) + return nullptr; // TODO: Handle undef. auto *EltC = dyn_cast_or_null(Op->getAggregateElement(0U)); diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll index 77a7f0d4e4acf..479b3f8ea4128 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll @@ -12,8 +12,7 @@ define i32 @add_0() { define i32 @add_0_scalable_vector() { ; CHECK-LABEL: @add_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.add.nxv8i32( zeroinitializer) ret i32 %x @@ -89,8 +88,7 @@ define i32 @add_poison() { define i32 @add_poison_scalable_vector() { ; CHECK-LABEL: @add_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.add.nxv8i32( poison) ret i32 %x @@ -123,8 +121,7 @@ define i32 @mul_0() { define i32 @mul_0_scalable_vector() { ; CHECK-LABEL: @mul_0_scalable_vector( -; CHECK-NEXT: 
[[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32( zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.mul.nxv8i32( zeroinitializer) ret i32 %x @@ -140,13 +137,29 @@ define i32 @mul_1() { define i32 @mul_1_scalable_vector() { ; CHECK-LABEL: @mul_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32( splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.mul.nxv8i32( splat (i32 1)) ret i32 %x } +define i32 @mul_2() { +; CHECK-LABEL: @mul_2( +; CHECK-NEXT: ret i32 256 +; + %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> ) + ret i32 %x +} + +define i32 @mul_2_scalable_vector() { +; CHECK-LABEL: @mul_2_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32( splat (i32 2)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.mul.nxv8i32( splat (i32 2)) + ret i32 %x +} + define i32 @mul_inc() { ; CHECK-LABEL: @mul_inc( ; CHECK-NEXT: ret i32 40320 @@ -200,8 +213,7 @@ define i32 @mul_poison() { define i32 @mul_poison_scalable_vector() { ; CHECK-LABEL: @mul_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32( poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.mul.nxv8i32( poison) ret i32 %x @@ -225,8 +237,7 @@ define i32 @and_0() { define i32 @and_0_scalable_vector() { ; CHECK-LABEL: @and_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32( zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.and.nxv8i32( zeroinitializer) ret i32 %x @@ -242,8 +253,7 @@ define i32 @and_1() { define i32 @and_1_scalable_vector() { ; CHECK-LABEL: @and_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32( splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 
@llvm.vector.reduce.and.nxv8i32( splat (i32 1)) ret i32 %x @@ -302,8 +312,7 @@ define i32 @and_poison() { define i32 @and_poison_scalable_vector() { ; CHECK-LABEL: @and_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32( poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.and.nxv8i32( poison) ret i32 %x @@ -327,8 +336,7 @@ define i32 @or_0() { define i32 @or_0_scalable_vector() { ; CHECK-LABEL: @or_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32( zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.or.nxv8i32( zeroinitializer) ret i32 %x @@ -344,8 +352,7 @@ define i32 @or_1() { define i32 @or_1_scalable_vector() { ; CHECK-LABEL: @or_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32( splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.or.nxv8i32( splat (i32 1)) ret i32 %x @@ -404,8 +411,7 @@ define i32 @or_poison() { define i32 @or_poison_scalable_vector() { ; CHECK-LABEL: @or_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32( poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.or.nxv8i32( poison) ret i32 %x @@ -429,8 +435,7 @@ define i32 @xor_0() { define i32 @xor_0_scalable_vector() { ; CHECK-LABEL: @xor_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32( zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.xor.nxv8i32( zeroinitializer) ret i32 %x @@ -446,13 +451,21 @@ define i32 @xor_1() { define i32 @xor_1_scalable_vector() { ; CHECK-LABEL: @xor_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32( splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 
@llvm.vector.reduce.xor.nxv8i32( splat(i32 1)) ret i32 %x } +define i32 @xor_1_scalable_vector_lane_count_not_known_even() { +; CHECK-LABEL: @xor_1_scalable_vector_lane_count_not_known_even( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv1i32( splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.xor.nxv8i32( splat(i32 1)) + ret i32 %x +} + define i32 @xor_inc() { ; CHECK-LABEL: @xor_inc( ; CHECK-NEXT: ret i32 10 @@ -506,8 +519,7 @@ define i32 @xor_poison() { define i32 @xor_poison_scalable_vector() { ; CHECK-LABEL: @xor_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32( poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.xor.nxv8i32( poison) ret i32 %x @@ -531,8 +543,7 @@ define i32 @smin_0() { define i32 @smin_0_scalable_vector() { ; CHECK-LABEL: @smin_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32( zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.smin.nxv8i32( zeroinitializer) ret i32 %x @@ -548,8 +559,7 @@ define i32 @smin_1() { define i32 @smin_1_scalable_vector() { ; CHECK-LABEL: @smin_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32( splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.smin.nxv8i32( splat(i32 1)) ret i32 %x @@ -608,8 +618,7 @@ define i32 @smin_poison() { define i32 @smin_poison_scalable_vector() { ; CHECK-LABEL: @smin_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32( poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.smin.nxv8i32( poison) ret i32 %x @@ -633,8 +642,7 @@ define i32 @smax_0() { define i32 @smax_0_scalable_vector() { ; CHECK-LABEL: @smax_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 
@llvm.vector.reduce.smax.nxv8i32( zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.smax.nxv8i32( zeroinitializer) ret i32 %x @@ -650,8 +658,7 @@ define i32 @smax_1() { define i32 @smax_1_scalable_vector() { ; CHECK-LABEL: @smax_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32( splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.smax.nxv8i32( splat(i32 1)) ret i32 %x @@ -710,8 +717,7 @@ define i32 @smax_poison() { define i32 @smax_poison_scalable_vector() { ; CHECK-LABEL: @smax_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32( poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.smax.nxv8i32( poison) ret i32 %x @@ -735,8 +741,7 @@ define i32 @umin_0() { define i32 @umin_0_scalable_vector() { ; CHECK-LABEL: @umin_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32( zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.umin.nxv8i32( zeroinitializer) ret i32 %x @@ -752,8 +757,7 @@ define i32 @umin_1() { define i32 @umin_1_scalable_vector() { ; CHECK-LABEL: @umin_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32( splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.umin.nxv8i32( splat (i32 1)) ret i32 %x @@ -812,8 +816,7 @@ define i32 @umin_poison() { define i32 @umin_poison_scalable_vector() { ; CHECK-LABEL: @umin_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32( poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.umin.nxv8i32( poison) ret i32 %x @@ -837,8 +840,7 @@ define i32 @umax_0() { define i32 @umax_0_scalable_vector() { ; CHECK-LABEL: 
@umax_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32( zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.umax.nxv8i32( zeroinitializer) ret i32 %x @@ -854,8 +856,7 @@ define i32 @umax_1() { define i32 @umax_1_scalable_vector() { ; CHECK-LABEL: @umax_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32( splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.umax.nxv8i32( splat(i32 1)) ret i32 %x @@ -914,8 +915,7 @@ define i32 @umax_poison() { define i32 @umax_poison_scalable_vector() { ; CHECK-LABEL: @umax_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32( poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.umax.nxv8i32( poison) ret i32 %x From 1ab672aff404d5794ed21ba7b68ae63e3b4c7344 Mon Sep 17 00:00:00 2001 From: Abhinav Garg <39309352+abhigargrepo@users.noreply.github.com> Date: Fri, 31 Oct 2025 16:45:40 +0530 Subject: [PATCH 324/539] [AMDGPU][GlobalISel] Add register bank legalization for G_FADD (#163407) This patch adds register bank legalization support for G_FADD opcodes in the AMDGPU GlobalISel pipeline. Added new reg bank type UniInVgprS64. This patch also adds a combine logic for ReadAnyLane + Trunc + AnyExt. 
--------- Co-authored-by: Abhinav Garg --- .../Target/AMDGPU/AMDGPURegBankLegalize.cpp | 21 ++- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 22 +++ .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 2 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 13 +- .../AMDGPU/AMDGPURegBankLegalizeRules.h | 5 + llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll | 165 ++++++++++++++++++ 6 files changed, 225 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index e1879598f098a..907f8300de6d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" @@ -34,9 +35,17 @@ using namespace llvm; using namespace AMDGPU; +using namespace llvm::MIPatternMatch; namespace { +// AMDGPU-specific pattern matchers +template +inline UnaryOp_match +m_GAMDGPUReadAnyLane(const SrcTy &Src) { + return UnaryOp_match(Src); +} + class AMDGPURegBankLegalize : public MachineFunctionPass { public: static char ID; @@ -160,10 +169,18 @@ AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) { Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { // Src = G_AMDGPU_READANYLANE RALSrc - auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); - if (RAL) + Register RALSrc; + if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))) return RALSrc; + // TruncSrc = G_AMDGPU_READANYLANE RALSrc + // AextSrc = G_TRUNC TruncSrc + // Src = G_ANYEXT AextSrc + if (mi_match(Src, MRI, + 
m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) { + return RALSrc; + } + // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc // LoSgpr = G_AMDGPU_READANYLANE LoVgpr // HiSgpr = G_AMDGPU_READANYLANE HiVgpr diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index b84c30ecaac0b..dc8fa7f0eef49 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -626,6 +626,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { MI.eraseFromParent(); } +void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + assert(MRI.getType(Dst) == V2S16); + auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg()); + auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg()); + unsigned Opc = MI.getOpcode(); + auto Flags = MI.getFlags(); + auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32); + auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32); + auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32); + auto Op2Hi = B.buildTrunc(SgprRB_S16, Op2Hi32); + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags); + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); @@ -698,6 +715,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, return lowerUnpackBitShift(MI); case UnpackMinMax: return lowerUnpackMinMax(MI); + case ScalarizeToS16: + return lowerSplitTo16(MI); case Ext32To64: { const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); MachineInstrBuilder Hi; @@ -849,6 +868,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { return LLT::scalar(32); case Sgpr64: case Vgpr64: + case UniInVgprS64: return LLT::scalar(64); 
case Sgpr128: case Vgpr128: @@ -972,6 +992,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case UniInVcc: case UniInVgprS16: case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: case UniInVgprV4S32: case UniInVgprB32: @@ -1104,6 +1125,7 @@ void RegBankLegalizeHelper::applyMappingDst( break; } case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: case UniInVgprV4S32: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index ad3ff1d374ec1..e7598f888e4b5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -72,6 +72,7 @@ class RegBankLegalizeHelper { static constexpr LLT P6 = LLT::pointer(6, 32); MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32}; + MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16}; MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32}; MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1}; @@ -121,6 +122,7 @@ class RegBankLegalizeHelper { void lowerV_BFE(MachineInstr &MI); void lowerS_BFE(MachineInstr &MI); void lowerSplitTo32(MachineInstr &MI); + void lowerSplitTo16(MachineInstr &MI); void lowerSplitTo32Select(MachineInstr &MI); void lowerSplitTo32SExtInReg(MachineInstr &MI); void lowerUnpackMinMax(MachineInstr &MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 01abd358ff595..b22e9bdc334d7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -918,9 +918,20 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, bool hasSALUFloat = ST->hasSALUFloatInsts(); addRulesForGOpcs({G_FADD}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat) + .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat) + .Div(S16, 
{{Vgpr16}, {Vgpr16, Vgpr16}}) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat) .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat) - .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16}, + hasSALUFloat) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}}) + .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}}); addRulesForGOpcs({G_FPTOUI}) .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 030bd75f8cd10..e6df5d87a2edc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -92,8 +92,10 @@ enum UniformityLLTOpPredicateID { V4S32, UniV2S16, + UniV2S32, DivV2S16, + DivV2S32, // B types B32, @@ -178,7 +180,9 @@ enum RegBankLLTMappingApplyID { UniInVcc, UniInVgprS16, UniInVgprS32, + UniInVgprS64, UniInVgprV2S16, + UniInVgprV2S32, UniInVgprV4S32, UniInVgprB32, UniInVgprB64, @@ -217,6 +221,7 @@ enum LoweringMethodID { V_BFE, VgprToVccCopy, SplitTo32, + ScalarizeToS16, SplitTo32Select, SplitTo32SExtInReg, Ext32To64, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll new file mode 100644 index 0000000000000..e440beed1da79 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll @@ -0,0 +1,165 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s +; RUN: llc 
-global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s + +define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) { +; GFX11-FAKE16-LABEL: fadd_s16_uniform: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: fadd_s16_uniform: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_s16_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_add_f16 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd half %a, %b + ret half %fadd +} + +define amdgpu_ps half @fadd_s16_div(half %a, half %b) { +; GFX11-FAKE16-LABEL: fadd_s16_div: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: fadd_s16_div: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: fadd_s16_div: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX12-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: fadd_s16_div: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: ; return to shader part epilog + %fadd = fadd half %a, %b + ret half %fadd +} + +define amdgpu_ps float 
@fadd_s32_uniform(float inreg %a, float inreg %b) { +; GFX11-LABEL: fadd_s32_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_s32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_add_f32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd float %a, %b + ret float %fadd +} + +define amdgpu_ps float @fadd_s32_div(float %a, float %b) { +; GCN-LABEL: fadd_s32_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %fadd = fadd float %a, %b + ret float %fadd +} + +define amdgpu_ps void @fadd_s64_uniform(double inreg %a, double inreg %b, ptr addrspace(1) %ptr) { +; GFX11-LABEL: fadd_s64_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[2:3], s[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fadd_s64_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_f64_e64 v[2:3], s[0:1], s[2:3] +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %fadd = fadd double %a, %b + store double %fadd, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_ps void @fadd_s64_div(double %a, double %b, ptr addrspace(1) %ptr) { +; GFX11-LABEL: fadd_s64_div: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fadd_s64_div: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX12-NEXT: s_endpgm + %fadd = fadd double %a, %b + store double %fadd, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_ps <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) { +; GFX11-LABEL: fadd_v2s16_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_add_f16 v0, s0, s1 +; GFX11-NEXT: ; return to shader 
part epilog +; +; GFX12-LABEL: fadd_v2s16_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-NEXT: s_lshr_b32 s3, s1, 16 +; GFX12-NEXT: s_add_f16 s0, s0, s1 +; GFX12-NEXT: s_add_f16 s1, s2, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd <2 x half> %a, %b + ret <2 x half> %fadd +} + +define amdgpu_ps <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) { +; GCN-LABEL: fadd_v2s16_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_add_f16 v0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %fadd = fadd <2 x half> %a, %b + ret <2 x half> %fadd +} + +define amdgpu_ps <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) { +; GFX11-LABEL: fadd_v2s32_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, s0, s2 +; GFX11-NEXT: v_add_f32_e64 v1, s1, s3 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_v2s32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_add_f32 s0, s0, s2 +; GFX12-NEXT: s_add_f32 s1, s1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd <2 x float> %a, %b + ret <2 x float> %fadd +} + +define amdgpu_ps <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) { +; GCN-LABEL: fadd_v2s32_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 +; GCN-NEXT: ; return to shader part epilog + %fadd = fadd <2 x float> %a, %b + ret <2 x float> %fadd +} From 5e0544e6c84ded4a743eb9c55db4d11472cf6e57 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Fri, 31 Oct 2025 12:16:24 +0100 Subject: [PATCH 325/539] [libc++] Update our documentation on the supported compilers (#165684) --- libcxx/docs/index.rst | 2 +- libcxx/include/__configuration/compiler.h | 12 
++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index 495ccceb31cef..03dfb9d41aa1a 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -132,7 +132,7 @@ velocity, libc++ drops support for older compilers as newer ones are released. ============ =================== ========================== ===================== Compiler Versions Restrictions Support policy ============ =================== ========================== ===================== -Clang 19, 20, 21-git latest two stable releases per `LLVM's release page `_ and the development version +Clang 20, 21, 22-git latest two stable releases per `LLVM's release page `_ and the development version AppleClang 26.0 latest stable release per `Xcode's release page `_ Open XL 17.1.3 (AIX) latest stable release per `Open XL's documentation page `_ GCC 15 In C++11 or later only latest stable release per `GCC's release page `_ diff --git a/libcxx/include/__configuration/compiler.h b/libcxx/include/__configuration/compiler.h index 11c07ed0dc474..7cd81e03b05ba 100644 --- a/libcxx/include/__configuration/compiler.h +++ b/libcxx/include/__configuration/compiler.h @@ -33,16 +33,16 @@ // Warn if a compiler version is used that is not supported anymore // LLVM RELEASE Update the minimum compiler versions # if defined(_LIBCPP_CLANG_VER) -# if _LIBCPP_CLANG_VER < 1900 -# warning "Libc++ only supports Clang 19 and later" +# if _LIBCPP_CLANG_VER < 2001 +# warning "Libc++ only supports Clang 20 and later" # endif # elif defined(_LIBCPP_APPLE_CLANG_VER) -# if _LIBCPP_APPLE_CLANG_VER < 1600 -# warning "Libc++ only supports AppleClang 15 and later" +# if _LIBCPP_APPLE_CLANG_VER < 1700 +# warning "Libc++ only supports AppleClang 26 and later" # endif # elif defined(_LIBCPP_GCC_VER) -# if _LIBCPP_GCC_VER < 1400 -# warning "Libc++ only supports GCC 14 and later" +# if _LIBCPP_GCC_VER < 1500 +# warning "Libc++ only supports GCC 15 and later" # endif # endif 
From 00bf3fd971a44278528becb9fab595c70e2442ab Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Fri, 31 Oct 2025 11:25:15 +0000 Subject: [PATCH 326/539] [llvm][docs] Remove guidance on adding release:reviewed label (#164395) "How To Release LLVM To The Public" [1] mentions to add the release:reviewed label once a bug has been reviewed, but looking at the label [2] it seems this hasn't been followed for quite a long time, so I propose we remove it. [1] https://llvm.org/docs/HowToReleaseLLVM.html#triaging-bug-reports-for-releases [2] https://github.com/llvm/llvm-project/issues?q=label%3Arelease%3Areviewed --- llvm/docs/HowToReleaseLLVM.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/docs/HowToReleaseLLVM.rst b/llvm/docs/HowToReleaseLLVM.rst index 171bf889256cd..c269cc4c54bcc 100644 --- a/llvm/docs/HowToReleaseLLVM.rst +++ b/llvm/docs/HowToReleaseLLVM.rst @@ -311,10 +311,10 @@ This section describes how to triage bug reports: to backport. You should also review the bug yourself to ensure that it meets the requirements for committing to the release branch. -#. Once a bug has been reviewed, add the release:reviewed label and update the - issue's status to "Needs Merge". Check the pull request associated with the - issue. If all the tests pass, then the pull request can be merged. If not, - then add a comment on the issue asking someone to take a look at the failures. +#. Once a bug has been reviewed, update the status to "Needs Merge". Check the + pull request associated with the issue. If all the tests pass, then the pull + request can be merged. If not, then add a comment on the issue asking + someone to take a look at the failures. 
Release Patch Rules From b4b5b364d2b4bcc5f8548830a0a44e9958e1fa6b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 31 Oct 2025 11:32:01 +0000 Subject: [PATCH 327/539] [X86] combineTruncate - trunc(srl(load(p),amt)) -> load(p+amt/8) - ensure there isn't an interdependency between the load and amt (#165850) Fixes #165755 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 +++- llvm/test/CodeGen/X86/pr165755.ll | 26 +++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/X86/pr165755.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 49beadae63f03..9525e03baa167 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -54768,9 +54768,11 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); // Check the shift amount is byte aligned. // Check the truncation doesn't use any shifted in (zero) top bits. + // Check the shift amount doesn't depend on the original load. 
if (KnownAmt.countMinTrailingZeros() >= 3 && KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() - - VT.getSizeInBits())) { + VT.getSizeInBits()) && + !Ld->isPredecessorOf(ShAmt.getNode())) { EVT PtrVT = Ld->getBasePtr().getValueType(); SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT); SDValue PtrByteOfs = diff --git a/llvm/test/CodeGen/X86/pr165755.ll b/llvm/test/CodeGen/X86/pr165755.ll new file mode 100644 index 0000000000000..3ab484f676c45 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr165755.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=X64 + +define i32 @PR165755(ptr %p0) { +; X86-LABEL: PR165755: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %eax +; X86-NEXT: movb $0, (%ecx) +; X86-NEXT: retl +; +; X64-LABEL: PR165755: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movb $0, (%rdi) +; X64-NEXT: retq + %ld64 = load i64, ptr %p0, align 8 + store i8 0, ptr %p0, align 1 + %ld32 = load i32, ptr %p0, align 8 + %mask = and i32 %ld32, 32 + %zext = zext i32 %mask to i64 + %srl = lshr i64 %ld64, %zext + %res = trunc i64 %srl to i32 + ret i32 %res +} From eb7f5b8b2d92d00443a7c6a5a14f06155966c927 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 31 Oct 2025 11:32:37 +0000 Subject: [PATCH 328/539] [lld][test] Fix file cleanup in aarch64-build-attributes.s (#164396) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This test seems to have taken the lit documentation at its word: https://llvm.org/docs/CommandGuide/lit.html#substitutions "%t temporary file name unique to the test" %t is in fact the **path** of a file. As suggested by the line below that describing %basename_t. 
This test (I assume) assumed it was just the filename itself and so left a layout of: ``` $ tree tools/lld/test/ tools/lld/test/ ├── CMakeFiles ├── ELF │   └── Output │   ├── aarch64-build-attributes.s.tmp │   │   ├── pauth-bti-gcs.s │   │   └── pauth-bti-pac.s │   ├── aarch64-build-attributes.s.tmp.merged.o │   ├── aarch64-build-attributes.s.tmp1.o │   ├── aarch64-build-attributes.s.tmp2.o │   └── aarch64-build-attributes.s.tmp3.o ├── Unit │   └── lit.site.cfg.py ├── cmake_install.cmake └── lit.site.cfg.py ``` Note how the 2 .s files are in the temp dir but the .o files are not. This is fine, it works, but it's going to cost someone time to unpick when this test actually does fail. To fix this, remove %t from all the temp file names so they are created in the temp dir, which is cleaned before each run. New layout: ``` $ tree tools/lld/test/ tools/lld/test/ ├── CMakeFiles ├── ELF │   └── Output │   └── aarch64-build-attributes.s.tmp │   ├── 1.o │   ├── 2.o │   ├── 3.o │   ├── merged.o │   ├── pauth-bti-gcs.s │   └── pauth-bti-pac.s ├── Unit │   └── lit.site.cfg.py ├── cmake_install.cmake └── lit.site.cfg.py ``` --- lld/test/ELF/aarch64-build-attributes.s | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lld/test/ELF/aarch64-build-attributes.s b/lld/test/ELF/aarch64-build-attributes.s index f2d542150897e..3d333bf6ccf2f 100644 --- a/lld/test/ELF/aarch64-build-attributes.s +++ b/lld/test/ELF/aarch64-build-attributes.s @@ -1,11 +1,11 @@ // REQUIRES: aarch64 // RUN: rm -rf %t && split-file %s %t && cd %t -// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o %t1.o -// RUN: llvm-mc -triple=aarch64 -filetype=obj pauth-bti-gcs.s -o %t2.o -// RUN: llvm-mc -triple=aarch64 -filetype=obj pauth-bti-pac.s -o %t3.o -// RUN: ld.lld -r %t1.o %t2.o %t3.o -o %t.merged.o -// RUN: llvm-readelf -n %t.merged.o | FileCheck %s --check-prefix=NOTE +// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o 1.o +// RUN: llvm-mc -triple=aarch64 -filetype=obj pauth-bti-gcs.s 
-o 2.o +// RUN: llvm-mc -triple=aarch64 -filetype=obj pauth-bti-pac.s -o 3.o +// RUN: ld.lld -r 1.o 2.o 3.o -o merged.o +// RUN: llvm-readelf -n merged.o | FileCheck %s --check-prefix=NOTE /// This test merges three object files with AArch64 build attributes. /// All contain identical PAuth ABI info (platform/version), which must be preserved. From 066939eb0d99ca5720c920b51175f93995f39807 Mon Sep 17 00:00:00 2001 From: Morris Hafner Date: Fri, 31 Oct 2025 19:42:48 +0800 Subject: [PATCH 329/539] [CIR] Fix multiple returns in switch statements (#164468) Add support for multiple return statements in switch statements. Cases in switch statements don't have their own scopes but are distinct regions nonetheless. Insert multiple return blocks for each case and handle them in the cleanup code. --- clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 36 ++++++---- clang/lib/CIR/CodeGen/CIRGenFunction.h | 81 ++++++++++++++-------- clang/test/CIR/CodeGen/switch.cpp | 87 ++++++++++++++++++++++++ 3 files changed, 164 insertions(+), 40 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index 71ff20a3b0e43..5d5209b9ffb60 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -242,12 +242,19 @@ void CIRGenFunction::LexicalScope::cleanup() { } }; - if (returnBlock != nullptr) { - // Write out the return block, which loads the value from `__retval` and - // issues the `cir.return`. + // Cleanup are done right before codegen resumes a scope. This is where + // objects are destroyed. Process all return blocks. + // TODO(cir): Handle returning from a switch statement through a cleanup + // block. We can't simply jump to the cleanup block, because the cleanup block + // is not part of the case region. Either reemit all cleanups in the return + // block or wait for MLIR structured control flow to support early exits. 
+ llvm::SmallVector retBlocks; + for (mlir::Block *retBlock : localScope->getRetBlocks()) { mlir::OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToEnd(returnBlock); - (void)emitReturn(*returnLoc); + builder.setInsertionPointToEnd(retBlock); + retBlocks.push_back(retBlock); + mlir::Location retLoc = localScope->getRetLoc(retBlock); + emitReturn(retLoc); } auto insertCleanupAndLeave = [&](mlir::Block *insPt) { @@ -274,19 +281,22 @@ void CIRGenFunction::LexicalScope::cleanup() { if (localScope->depth == 0) { // Reached the end of the function. - if (returnBlock != nullptr) { - if (returnBlock->getUses().empty()) { - returnBlock->erase(); + // Special handling only for single return block case + if (localScope->getRetBlocks().size() == 1) { + mlir::Block *retBlock = localScope->getRetBlocks()[0]; + mlir::Location retLoc = localScope->getRetLoc(retBlock); + if (retBlock->getUses().empty()) { + retBlock->erase(); } else { // Thread return block via cleanup block. if (cleanupBlock) { - for (mlir::BlockOperand &blockUse : returnBlock->getUses()) { + for (mlir::BlockOperand &blockUse : retBlock->getUses()) { cir::BrOp brOp = mlir::cast(blockUse.getOwner()); brOp.setSuccessor(cleanupBlock); } } - cir::BrOp::create(builder, *returnLoc, returnBlock); + cir::BrOp::create(builder, retLoc, retBlock); return; } } @@ -324,8 +334,10 @@ void CIRGenFunction::LexicalScope::cleanup() { bool entryBlock = builder.getInsertionBlock()->isEntryBlock(); if (!entryBlock && curBlock->empty()) { curBlock->erase(); - if (returnBlock != nullptr && returnBlock->getUses().empty()) - returnBlock->erase(); + for (mlir::Block *retBlock : retBlocks) { + if (retBlock->getUses().empty()) + retBlock->erase(); + } return; } diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index c3fcd1a69a88e..e5cecaa573a6e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1103,44 +1103,69 @@ class 
CIRGenFunction : public CIRGenTypeCache { // --- private: - // `returnBlock`, `returnLoc`, and all the functions that deal with them - // will change and become more complicated when `switch` statements are - // upstreamed. `case` statements within the `switch` are in the same scope - // but have their own regions. Therefore the LexicalScope will need to - // keep track of multiple return blocks. - mlir::Block *returnBlock = nullptr; - std::optional returnLoc; - - // See the comment on `getOrCreateRetBlock`. + // On switches we need one return block per region, since cases don't + // have their own scopes but are distinct regions nonetheless. + + // TODO: This implementation should change once we have support for early + // exits in MLIR structured control flow (llvm-project#161575) + llvm::SmallVector retBlocks; + llvm::DenseMap retLocs; + llvm::DenseMap retBlockInCaseIndex; + std::optional normalRetBlockIndex; + + // There's usually only one ret block per scope, but this needs to be + // get or create because of potential unreachable return statements, note + // that for those, all source location maps to the first one found. mlir::Block *createRetBlock(CIRGenFunction &cgf, mlir::Location loc) { - assert(returnBlock == nullptr && "only one return block per scope"); - // Create the cleanup block but don't hook it up just yet. + assert((isa_and_nonnull( + cgf.builder.getBlock()->getParentOp()) || + retBlocks.size() == 0) && + "only switches can hold more than one ret block"); + + // Create the return block but don't hook it up just yet. 
mlir::OpBuilder::InsertionGuard guard(cgf.builder); - returnBlock = - cgf.builder.createBlock(cgf.builder.getBlock()->getParent()); - updateRetLoc(returnBlock, loc); - return returnBlock; + auto *b = cgf.builder.createBlock(cgf.builder.getBlock()->getParent()); + retBlocks.push_back(b); + updateRetLoc(b, loc); + return b; } cir::ReturnOp emitReturn(mlir::Location loc); void emitImplicitReturn(); public: - mlir::Block *getRetBlock() { return returnBlock; } - mlir::Location getRetLoc(mlir::Block *b) { return *returnLoc; } - void updateRetLoc(mlir::Block *b, mlir::Location loc) { returnLoc = loc; } - - // Create the return block for this scope, or return the existing one. - // This get-or-create logic is necessary to handle multiple return - // statements within the same scope, which can happen if some of them are - // dead code or if there is a `goto` into the middle of the scope. + llvm::ArrayRef getRetBlocks() { return retBlocks; } + mlir::Location getRetLoc(mlir::Block *b) { return retLocs.at(b); } + void updateRetLoc(mlir::Block *b, mlir::Location loc) { + retLocs.insert_or_assign(b, loc); + } + mlir::Block *getOrCreateRetBlock(CIRGenFunction &cgf, mlir::Location loc) { - if (returnBlock == nullptr) { - returnBlock = createRetBlock(cgf, loc); - return returnBlock; + // Check if we're inside a case region + if (auto caseOp = mlir::dyn_cast_if_present( + cgf.builder.getBlock()->getParentOp())) { + auto iter = retBlockInCaseIndex.find(caseOp); + if (iter != retBlockInCaseIndex.end()) { + // Reuse existing return block + mlir::Block *ret = retBlocks[iter->second]; + updateRetLoc(ret, loc); + return ret; + } + // Create new return block + mlir::Block *ret = createRetBlock(cgf, loc); + retBlockInCaseIndex[caseOp] = retBlocks.size() - 1; + return ret; } - updateRetLoc(returnBlock, loc); - return returnBlock; + + if (normalRetBlockIndex) { + mlir::Block *ret = retBlocks[*normalRetBlockIndex]; + updateRetLoc(ret, loc); + return ret; + } + + mlir::Block *ret = 
createRetBlock(cgf, loc); + normalRetBlockIndex = retBlocks.size() - 1; + return ret; } mlir::Block *getEntryBlock() { return entryBlock; } diff --git a/clang/test/CIR/CodeGen/switch.cpp b/clang/test/CIR/CodeGen/switch.cpp index e13aa8f4f4953..3824be0d08c2f 100644 --- a/clang/test/CIR/CodeGen/switch.cpp +++ b/clang/test/CIR/CodeGen/switch.cpp @@ -1183,3 +1183,90 @@ int nested_switch(int a) { // OGCG: [[IFEND10]]: // OGCG: br label %[[EPILOG]] // OGCG: [[EPILOG]]: + +int sw_return_multi_cases(int x) { + switch (x) { + case 0: + return 0; + case 1: + return 1; + case 2: + return 2; + default: + return -1; + } +} + +// CIR-LABEL: cir.func{{.*}} @_Z21sw_return_multi_casesi +// CIR: cir.switch (%{{.*}} : !s32i) { +// CIR-NEXT: cir.case(equal, [#cir.int<0> : !s32i]) { +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.store{{.*}} %[[ZERO]], %{{.*}} : !s32i, !cir.ptr +// CIR: %[[RET0:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i +// CIR-NEXT: cir.return %[[RET0]] : !s32i +// CIR-NEXT: } +// CIR-NEXT: cir.case(equal, [#cir.int<1> : !s32i]) { +// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i +// CIR: cir.store{{.*}} %[[ONE]], %{{.*}} : !s32i, !cir.ptr +// CIR: %[[RET1:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i +// CIR-NEXT: cir.return %[[RET1]] : !s32i +// CIR-NEXT: } +// CIR-NEXT: cir.case(equal, [#cir.int<2> : !s32i]) { +// CIR: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i +// CIR: cir.store{{.*}} %[[TWO]], %{{.*}} : !s32i, !cir.ptr +// CIR: %[[RET2:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i +// CIR-NEXT: cir.return %[[RET2]] : !s32i +// CIR-NEXT: } +// CIR-NEXT: cir.case(default, []) { +// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i +// CIR: %[[NEG:.*]] = cir.unary(minus, %[[ONE]]) {{.*}} : !s32i, !s32i +// CIR: cir.store{{.*}} %[[NEG]], %{{.*}} : !s32i, !cir.ptr +// CIR: %[[RETDEF:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i +// CIR-NEXT: cir.return %[[RETDEF]] : !s32i +// CIR-NEXT: } +// CIR-NEXT: cir.yield + +// LLVM-LABEL: 
define{{.*}} i32 @_Z21sw_return_multi_casesi +// LLVM: switch i32 %{{.*}}, label %[[DEFAULT:.*]] [ +// LLVM-DAG: i32 0, label %[[CASE0:.*]] +// LLVM-DAG: i32 1, label %[[CASE1:.*]] +// LLVM-DAG: i32 2, label %[[CASE2:.*]] +// LLVM: ] +// LLVM: [[CASE0]]: +// LLVM: store i32 0, ptr %{{.*}}, align 4 +// LLVM: %{{.*}} = load i32, ptr %{{.*}}, align 4 +// LLVM: ret i32 %{{.*}} +// LLVM: [[CASE1]]: +// LLVM: store i32 1, ptr %{{.*}}, align 4 +// LLVM: %{{.*}} = load i32, ptr %{{.*}}, align 4 +// LLVM: ret i32 %{{.*}} +// LLVM: [[CASE2]]: +// LLVM: store i32 2, ptr %{{.*}}, align 4 +// LLVM: %{{.*}} = load i32, ptr %{{.*}}, align 4 +// LLVM: ret i32 %{{.*}} +// LLVM: [[DEFAULT]]: +// LLVM: store i32 -1, ptr %{{.*}}, align 4 +// LLVM: %{{.*}} = load i32, ptr %{{.*}}, align 4 +// LLVM: ret i32 %{{.*}} + +// OGCG-LABEL: define{{.*}} i32 @_Z21sw_return_multi_casesi +// OGCG: entry: +// OGCG: %[[RETVAL:.*]] = alloca i32, align 4 +// OGCG: %[[X_ADDR:.*]] = alloca i32, align 4 +// OGCG: %[[X_VAL:.*]] = load i32, ptr %[[X_ADDR]], align 4 +// OGCG: switch i32 %[[X_VAL]], label %[[DEFAULT:.*]] [ +// OGCG-DAG: i32 0, label %[[SW0:.*]] +// OGCG-DAG: i32 1, label %[[SW1:.*]] +// OGCG-DAG: i32 2, label %[[SW2:.*]] +// OGCG: ] +// OGCG: [[SW0]]: +// OGCG: br label %[[RETURN:.*]] +// OGCG: [[SW1]]: +// OGCG: br label %[[RETURN]] +// OGCG: [[SW2]]: +// OGCG: br label %[[RETURN]] +// OGCG: [[DEFAULT]]: +// OGCG: br label %[[RETURN]] +// OGCG: [[RETURN]]: +// OGCG: %[[RETVAL_LOAD:.*]] = load i32, ptr %[[RETVAL]], align 4 +// OGCG: ret i32 %[[RETVAL_LOAD]] From da5fd70284862238d8e65bbd2d48c04a77fea4b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Pir=C3=B3g?= Date: Fri, 31 Oct 2025 12:50:21 +0100 Subject: [PATCH 330/539] [X86] Remove AMX-TRANSPOSE (#165556) Per Intel Architecture Instruction Set Extensions Programming Reference rev. 
59 (https://cdrdv2.intel.com/v1/dl/getContent/671368), Revision History entry for revision -59, AMX-TRANSPOSE was removed --- clang/include/clang/Basic/BuiltinsX86_64.td | 89 - clang/include/clang/Driver/Options.td | 2 - clang/lib/Basic/Targets/X86.cpp | 6 - clang/lib/Basic/Targets/X86.h | 1 - clang/lib/CodeGen/TargetBuiltins/X86.cpp | 68 - clang/lib/Headers/CMakeLists.txt | 6 - clang/lib/Headers/amxbf16transposeintrin.h | 94 - clang/lib/Headers/amxcomplextransposeintrin.h | 303 - clang/lib/Headers/amxfp16transposeintrin.h | 94 - clang/lib/Headers/amxintrin.h | 2 - clang/lib/Headers/amxmovrstransposeintrin.h | 200 - clang/lib/Headers/amxtf32transposeintrin.h | 105 - clang/lib/Headers/amxtransposeintrin.h | 248 - clang/lib/Headers/immintrin.h | 12 - clang/lib/Sema/SemaX86.cpp | 17 - clang/test/CodeGen/X86/amx_movrs_tranpose.c | 53 - .../test/CodeGen/X86/amx_movrs_tranpose_api.c | 81 - .../CodeGen/X86/amx_movrs_transpose_errors.c | 22 - clang/test/CodeGen/X86/amx_tf32.c | 5 - clang/test/CodeGen/X86/amx_tf32_api.c | 7 - clang/test/CodeGen/X86/amx_tf32_errors.c | 8 - clang/test/CodeGen/X86/amx_transpose.c | 75 - clang/test/CodeGen/X86/amx_transpose_api.c | 114 - clang/test/CodeGen/X86/amx_transpose_errors.c | 75 - clang/test/Driver/x86-target-features.c | 7 - .../Preprocessor/predefined-arch-macros.c | 2 - clang/test/Preprocessor/x86_target_features.c | 12 - llvm/include/llvm/CodeGen/TileShapeInfo.h | 88 +- llvm/include/llvm/IR/IntrinsicsX86.td | 104 - .../Support/X86DisassemblerDecoderCommon.h | 1 - .../llvm/TargetParser/X86TargetParser.def | 1 - llvm/lib/Target/X86/AsmParser/X86Operand.h | 31 - .../X86/Disassembler/X86Disassembler.cpp | 5 - .../X86/Disassembler/X86DisassemblerDecoder.h | 7 - .../X86/MCTargetDesc/X86InstPrinterCommon.cpp | 19 - .../X86/MCTargetDesc/X86InstPrinterCommon.h | 1 - llvm/lib/Target/X86/X86.td | 6 +- llvm/lib/Target/X86/X86ExpandPseudo.cpp | 190 +- llvm/lib/Target/X86/X86FastPreTileConfig.cpp | 40 +- llvm/lib/Target/X86/X86FastTileConfig.cpp 
| 25 +- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 78 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 169 +- llvm/lib/Target/X86/X86InstrAMX.td | 208 - llvm/lib/Target/X86/X86InstrInfo.cpp | 13 +- llvm/lib/Target/X86/X86InstrOperands.td | 7 - llvm/lib/Target/X86/X86InstrPredicates.td | 1 - llvm/lib/Target/X86/X86LowerAMXType.cpp | 203 +- llvm/lib/Target/X86/X86PreTileConfig.cpp | 26 +- llvm/lib/Target/X86/X86RegisterInfo.cpp | 70 +- llvm/lib/Target/X86/X86RegisterInfo.td | 9 - llvm/lib/Target/X86/X86TileConfig.cpp | 83 +- llvm/lib/TargetParser/Host.cpp | 1 - llvm/lib/TargetParser/X86TargetParser.cpp | 3 +- .../Inputs/reference_x86_vocab_print.txt | 22 - .../reference_x86_vocab_wo=0.5_print.txt | 22 - llvm/test/CodeGen/X86/amx-tf32-internal.ll | 7 +- llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll | 12 +- .../X86/amx_movrs_transpose_intrinsics.ll | 122 - .../CodeGen/X86/amx_tile_pair_O2_to_O0.ll | 136 - .../X86/amx_tile_pair_configure_O0.mir | 165 - .../X86/amx_tile_pair_configure_O2.mir | 153 - llvm/test/CodeGen/X86/amx_tile_pair_copy.mir | 97 - .../X86/amx_tile_pair_lower_type_O0.ll | 87 - .../X86/amx_tile_pair_lower_type_O2.ll | 61 - .../X86/amx_tile_pair_preconfigure_O0.mir | 134 - .../X86/amx_tile_pair_preconfigure_O2.mir | 113 - .../CodeGen/X86/amx_transpose_intrinsics.ll | 371 - llvm/test/CodeGen/X86/ipra-reg-usage.ll | 4 +- .../Disassembler/X86/AMX/x86-64-amx-movrs.txt | 128 - .../Disassembler/X86/AMX/x86-64-amx-tf32.txt | 8 - .../MC/Disassembler/X86/amx-transpose-att.txt | 154 - llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s | 128 - llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s | 128 - llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s | 7 - llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s | 7 - llvm/test/MC/X86/amx-transpose-att.s | 153 - llvm/test/MC/X86/amx-transpose-intel.s | 153 - llvm/test/TableGen/x86-instr-mapping.inc | 8 - .../llvm-ir2vec/output/reference_triplets.txt | 52 +- .../output/reference_x86_entities.txt | 11444 ++++++++-------- 
llvm/unittests/CodeGen/InstrRefLDVTest.cpp | 2 +- llvm/utils/TableGen/X86RecognizableInstr.cpp | 1 - 82 files changed, 5835 insertions(+), 11141 deletions(-) delete mode 100644 clang/lib/Headers/amxbf16transposeintrin.h delete mode 100644 clang/lib/Headers/amxcomplextransposeintrin.h delete mode 100644 clang/lib/Headers/amxfp16transposeintrin.h delete mode 100644 clang/lib/Headers/amxmovrstransposeintrin.h delete mode 100644 clang/lib/Headers/amxtf32transposeintrin.h delete mode 100644 clang/lib/Headers/amxtransposeintrin.h delete mode 100755 clang/test/CodeGen/X86/amx_movrs_tranpose.c delete mode 100755 clang/test/CodeGen/X86/amx_movrs_tranpose_api.c delete mode 100755 clang/test/CodeGen/X86/amx_movrs_transpose_errors.c delete mode 100644 clang/test/CodeGen/X86/amx_transpose.c delete mode 100644 clang/test/CodeGen/X86/amx_transpose_api.c delete mode 100644 clang/test/CodeGen/X86/amx_transpose_errors.c delete mode 100755 llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll delete mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll delete mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir delete mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir delete mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_copy.mir delete mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll delete mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll delete mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir delete mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir delete mode 100644 llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll delete mode 100644 llvm/test/MC/Disassembler/X86/amx-transpose-att.txt delete mode 100644 llvm/test/MC/X86/amx-transpose-att.s delete mode 100644 llvm/test/MC/X86/amx-transpose-intel.s diff --git a/clang/include/clang/Basic/BuiltinsX86_64.td b/clang/include/clang/Basic/BuiltinsX86_64.td index 275278c5ac089..062060e6afbbe 100644 --- 
a/clang/include/clang/Basic/BuiltinsX86_64.td +++ b/clang/include/clang/Basic/BuiltinsX86_64.td @@ -239,57 +239,6 @@ let Features = "amx-complex", Attributes = [NoThrow] in { def tcmmrlfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; } -let Features = "amx-transpose", Attributes = [NoThrow] in { - def t2rpntlvwz0_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">; -} - -let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in { - def t2rpntlvwz0rs_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">; -} - -let Features = "amx-transpose", Attributes = [NoThrow] in { - def t2rpntlvwz0t1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">; -} - -let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in { - def t2rpntlvwz0rst1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">; -} - -let Features = "amx-transpose", Attributes = [NoThrow] in { - def t2rpntlvwz1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">; -} - -let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in { - def t2rpntlvwz1rs_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">; -} - -let Features = "amx-transpose", Attributes = [NoThrow] in { - def t2rpntlvwz1t1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">; -} - -let Features = 
"amx-movrs,amx-transpose", Attributes = [NoThrow] in { - def t2rpntlvwz1rst1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">; -} - -let Features = "amx-transpose", Attributes = [NoThrow] in { - def ttransposed_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">; -} - -let Features = "amx-bf16,amx-transpose", Attributes = [NoThrow] in { - def ttdpbf16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; -} - -let Features = "amx-fp16,amx-transpose", Attributes = [NoThrow] in { - def ttdpfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; -} - -let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in { - def ttcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; - def ttcmmrlfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; - def tconjtcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; - def tconjtfp16_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">; -} - let Features = "amx-avx512,avx10.2", Attributes = [NoThrow] in { def tcvtrowd2ps_internal : X86Builtin<"_Vector<16, float>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">; def tcvtrowps2bf16h_internal : X86Builtin<"_Vector<32, __bf16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">; @@ -303,10 +252,6 @@ let Features = "amx-tf32", Attributes = [NoThrow] in { def 
tmmultf32ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; } -let Features = "amx-tf32,amx-transpose", Attributes = [NoThrow] in { - def ttmmultf32ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; -} - let Features = "amx-fp8", Attributes = [NoThrow] in { def tdpbf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; def tdpbhf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">; @@ -321,13 +266,6 @@ let Features = "amx-tile", Attributes = [NoThrow] in { def tilezero : X86Builtin<"void(unsigned char)">; } -let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in { - def t2rpntlvwz0rs : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; - def t2rpntlvwz0rst1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; - def t2rpntlvwz1rs : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; - def t2rpntlvwz1rst1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; -} - let Features = "amx-movrs", Attributes = [NoThrow] in { def tileloaddrs64 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; def tileloaddrst164 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; @@ -359,29 +297,6 @@ let Features = "amx-complex", Attributes = [NoThrow] in { def tcmmrlfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; } -let Features = "amx-transpose", Attributes = [NoThrow] in { - def t2rpntlvwz0 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; - def t2rpntlvwz0t1 : X86Builtin<"void(_Constant unsigned char, void const 
*, size_t)">; - def t2rpntlvwz1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; - def t2rpntlvwz1t1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">; - def ttransposed : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">; -} - -let Features = "amx-bf16,amx-transpose", Attributes = [NoThrow] in { - def ttdpbf16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; -} - -let Features = "amx-fp16,amx-transpose", Attributes = [NoThrow] in { - def ttdpfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; -} - -let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in { - def ttcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; - def ttcmmrlfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; - def tconjtcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; - def tconjtfp16 : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">; -} - let Features = "amx-avx512,avx10.2", Attributes = [NoThrow] in { def tcvtrowd2ps : X86Builtin<"_Vector<16, float>(_Constant unsigned char, unsigned int)">; def tcvtrowps2bf16h : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, unsigned int)">; @@ -406,10 +321,6 @@ let Features = "amx-tf32", Attributes = [NoThrow] in { def tmmultf32ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; } -let Features = "amx-tf32,amx-transpose", Attributes = [NoThrow] in { - def ttmmultf32ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">; -} - let Features = "prefetchi", Attributes = [NoThrow, Const] in { def prefetchi : X86Builtin<"void(void const *, unsigned int)">; } diff --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td index cb5cb888c6da7..7f33f31eeea67 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6695,8 +6695,6 @@ def mamx_tf32 : Flag<["-"], "mamx-tf32">, Group; def mno_amx_tf32 : Flag<["-"], "mno-amx-tf32">, Group; def mamx_tile : Flag<["-"], "mamx-tile">, Group; def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group; -def mamx_transpose : Flag<["-"], "mamx-transpose">, Group; -def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group; def mamx_movrs: Flag<["-"], "mamx-movrs">, Group; def mno_amx_movrs: Flag<["-"], "mno-amx-movrs">, Group; def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index e71f10c4c16fc..7a90c89dd7dc0 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -396,8 +396,6 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, HasAMXFP8 = true; } else if (Feature == "+amx-movrs") { HasAMXMOVRS = true; - } else if (Feature == "+amx-transpose") { - HasAMXTRANSPOSE = true; } else if (Feature == "+amx-avx512") { HasAMXAVX512 = true; } else if (Feature == "+amx-tf32") { @@ -925,8 +923,6 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AMX_FP8__"); if (HasAMXMOVRS) Builder.defineMacro("__AMX_MOVRS__"); - if (HasAMXTRANSPOSE) - Builder.defineMacro("__AMX_TRANSPOSE__"); if (HasAMXAVX512) Builder.defineMacro("__AMX_AVX512__"); if (HasAMXTF32) @@ -1068,7 +1064,6 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("amx-movrs", true) .Case("amx-tf32", true) .Case("amx-tile", true) - .Case("amx-transpose", true) .Case("avx", true) .Case("avx10.1", true) .Case("avx10.2", true) @@ -1189,7 +1184,6 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("amx-movrs", HasAMXMOVRS) .Case("amx-tf32", HasAMXTF32) .Case("amx-tile", HasAMXTILE) - .Case("amx-transpose", HasAMXTRANSPOSE) 
.Case("avx", SSELevel >= AVX) .Case("avx10.1", HasAVX10_1) .Case("avx10.2", HasAVX10_2) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index be3a473174370..e7da2622e78b5 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -160,7 +160,6 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAMXCOMPLEX = false; bool HasAMXFP8 = false; bool HasAMXMOVRS = false; - bool HasAMXTRANSPOSE = false; bool HasAMXAVX512 = false; bool HasAMXTF32 = false; bool HasSERIALIZE = false; diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index b924407b6ddd7..2381b2e7cf2cf 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -2931,74 +2931,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // instruction, but it will create a memset that won't be optimized away. return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true); } - // Corresponding to intrisics which will return 2 tiles (tile0_tile1). 
- case X86::BI__builtin_ia32_t2rpntlvwz0_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: { - Intrinsic::ID IID; - switch (BuiltinID) { - default: - llvm_unreachable("Unsupported intrinsic!"); - case X86::BI__builtin_ia32_t2rpntlvwz0_internal: - IID = Intrinsic::x86_t2rpntlvwz0_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: - IID = Intrinsic::x86_t2rpntlvwz0rs_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: - IID = Intrinsic::x86_t2rpntlvwz0t1_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: - IID = Intrinsic::x86_t2rpntlvwz0rst1_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz1_internal: - IID = Intrinsic::x86_t2rpntlvwz1_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: - IID = Intrinsic::x86_t2rpntlvwz1rs_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: - IID = Intrinsic::x86_t2rpntlvwz1t1_internal; - break; - case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: - IID = Intrinsic::x86_t2rpntlvwz1rst1_internal; - break; - } - - // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride) - Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), - {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]}); - - auto *PtrTy = E->getArg(3)->getType()->getAs(); - assert(PtrTy && "arg3 must be of pointer type"); - QualType PtreeTy = PtrTy->getPointeeType(); - llvm::Type *TyPtee = ConvertType(PtreeTy); - - // Bitcast amx type (x86_amx) to vector type (256 x i32) - // Then store tile0 into DstPtr0 - Value *T0 = Builder.CreateExtractValue(Call, 0); - Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, - 
{TyPtee}, {T0}); - Builder.CreateDefaultAlignedStore(VecT0, Ops[3]); - - // Then store tile1 into DstPtr1 - Value *T1 = Builder.CreateExtractValue(Call, 1); - Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, - {TyPtee}, {T1}); - Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]); - - // Note: Here we escape directly use x86_tilestored64_internal to store - // the results due to it can't make sure the Mem written scope. This may - // cause shapes reloads after first amx intrinsic, which current amx reg- - // ister allocation has no ability to handle it. - - return Store; - } case X86::BI__ud2: // llvm.trap makes a ud2a instruction on x86. return EmitTrapCall(Intrinsic::trap); diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 18589125697b0..33fff7645df65 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -162,18 +162,12 @@ set(x86_files adxintrin.h ammintrin.h amxavx512intrin.h - amxbf16transposeintrin.h amxcomplexintrin.h - amxcomplextransposeintrin.h amxfp16intrin.h - amxfp16transposeintrin.h amxfp8intrin.h amxintrin.h amxmovrsintrin.h - amxmovrstransposeintrin.h amxtf32intrin.h - amxtf32transposeintrin.h - amxtransposeintrin.h avx10_2_512bf16intrin.h avx10_2_512convertintrin.h avx10_2_512minmaxintrin.h diff --git a/clang/lib/Headers/amxbf16transposeintrin.h b/clang/lib/Headers/amxbf16transposeintrin.h deleted file mode 100644 index 86f09f2ad8db2..0000000000000 --- a/clang/lib/Headers/amxbf16transposeintrin.h +++ /dev/null @@ -1,94 +0,0 @@ -/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===------------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error \ - "Never use directly; use instead." -#endif /* __IMMINTRIN_H */ - -#ifndef __AMX_BF16TRANSPOSEINTRIN_H -#define __AMX_BF16TRANSPOSEINTRIN_H -#ifdef __x86_64__ - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-bf16,amx-transpose"))) - -/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in -/// tiles \a a and \a b, accumulating the intermediate single-precision -/// (32-bit) floating-point elements with elements in \a dst, and store the -/// 32-bit result back to tile \a dst. -/// -/// \headerfile -/// -/// \code -/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b) -/// \endcode -/// -/// \code{.operation} -/// FOR m := 0 TO dst.rows - 1 -/// tmp := dst.row[m] -/// FOR k := 0 TO (a.colsb / 4) - 1 -/// FOR n := 0 TO (dst.colsb / 4) - 1 -/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) * -/// FP32(b.row[k].bf16[2*n+0]) -/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) * -/// FP32(b.row[k].bf16[2*n+1]) -/// ENDFOR -/// ENDFOR -/// write_row_and_zero(dst, m, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TTDPBF16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps((dst), (a), (b)) - -/// This is internal intrinsic. C/C++ user should avoid calling it directly. 
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS -_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k, - _tile1024i dst, _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2); -} - -/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in -/// tiles src0 and src1, accumulating the intermediate single-precision -/// (32-bit) floating-point elements with elements in "dst", and store the -/// 32-bit result back to tile "dst". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TTDPBF16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile, - src0.tile, src1.tile); -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* __x86_64__ */ -#endif /* __AMX_BF16TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h deleted file mode 100644 index 11abaf98e9371..0000000000000 --- a/clang/lib/Headers/amxcomplextransposeintrin.h +++ /dev/null @@ -1,303 +0,0 @@ -/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===------------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error \ - "Never use directly; include instead." 
-#endif // __IMMINTRIN_H - -#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H -#define __AMX_COMPLEXTRANSPOSEINTRIN_H -#ifdef __x86_64__ - -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-complex,amx-transpose"))) - -/// Perform matrix multiplication of two tiles containing complex elements and -/// accumulate the results into a packed single precision tile. Each dword -/// element in input tiles \a a and \a b is interpreted as a complex number -/// with FP16 real part and FP16 imaginary part. -/// Calculates the imaginary part of the result. For each possible combination -/// of (transposed column of \a a, column of \a b), it performs a set of -/// multiplication and accumulations on all corresponding complex numbers -/// (one from \a a and one from \a b). The imaginary part of the \a a element -/// is multiplied with the real part of the corresponding \a b element, and -/// the real part of the \a a element is multiplied with the imaginary part -/// of the corresponding \a b elements. The two accumulated results are -/// added, and then accumulated into the corresponding row and column of -/// \a dst. -/// -/// \headerfile -/// -/// \code -/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b); -/// \endcode -/// -/// \code{.operation} -/// FOR m := 0 TO dst.rows - 1 -/// tmp := dst.row[m] -/// FOR k := 0 TO a.rows - 1 -/// FOR n := 0 TO (dst.colsb / 4) - 1 -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1]) -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0]) -/// ENDFOR -/// ENDFOR -/// write_row_and_zero(dst, m, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. 
Max size is 1024 Bytes. -#define _tile_tcmmimfp16ps(dst, a, b) \ - __builtin_ia32_ttcmmimfp16ps((dst), (a), (b)) - -/// Perform matrix multiplication of two tiles containing complex elements and -/// accumulate the results into a packed single precision tile. Each dword -/// element in input tiles \a a and \a b is interpreted as a complex number -/// with FP16 real part and FP16 imaginary part. -/// Calculates the real part of the result. For each possible combination -/// of (rtransposed colum of \a a, column of \a b), it performs a set of -/// multiplication and accumulations on all corresponding complex numbers -/// (one from \a a and one from \a b). The real part of the \a a element is -/// multiplied with the real part of the corresponding \a b element, and the -/// negated imaginary part of the \a a element is multiplied with the -/// imaginary part of the corresponding \a b elements. The two accumulated -/// results are added, and then accumulated into the corresponding row and -/// column of \a dst. -/// -/// \headerfile -/// -/// \code -/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b); -/// \endcode -/// -/// \code{.operation} -/// FOR m := 0 TO dst.rows - 1 -/// tmp := dst.row[m] -/// FOR k := 0 TO a.rows - 1 -/// FOR n := 0 TO (dst.colsb / 4) - 1 -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0]) -/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1]) -/// ENDFOR -/// ENDFOR -/// write_row_and_zero(dst, m, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. 
-#define _tile_tcmmrlfp16ps(dst, a, b) \ - __builtin_ia32_ttcmmrlfp16ps((dst), (a), (b)) - -/// Perform matrix conjugate transpose and multiplication of two tiles -/// containing complex elements and accumulate the results into a packed -/// single precision tile. Each dword element in input tiles \a a and \a b -/// is interpreted as a complex number with FP16 real part and FP16 imaginary -/// part. -/// Calculates the imaginary part of the result. For each possible combination -/// of (transposed column of \a a, column of \a b), it performs a set of -/// multiplication and accumulations on all corresponding complex numbers -/// (one from \a a and one from \a b). The negated imaginary part of the \a a -/// element is multiplied with the real part of the corresponding \a b -/// element, and the real part of the \a a element is multiplied with the -/// imaginary part of the corresponding \a b elements. The two accumulated -/// results are added, and then accumulated into the corresponding row and -/// column of \a dst. -/// -/// \headerfile -/// -/// \code -/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b); -/// \endcode -/// -/// \code{.operation} -/// FOR m := 0 TO dst.rows - 1 -/// tmp := dst.row[m] -/// FOR k := 0 TO a.rows - 1 -/// FOR n := 0 TO (dst.colsb / 4) - 1 -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1]) -/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0]) -/// ENDFOR -/// ENDFOR -/// write_row_and_zero(dst, m, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. 
-#define _tile_conjtcmmimfp16ps(dst, a, b) \ - __builtin_ia32_tconjtcmmimfp16ps((dst), (a), (b)) - -/// Perform conjugate transpose of an FP16-pair of complex elements from \a a -/// and writes the result to \a dst. -/// -/// \headerfile -/// -/// \code -/// void _tile_conjtfp16(__tile dst, __tile a); -/// \endcode -/// -/// \code{.operation} -/// FOR i := 0 TO dst.rows - 1 -/// FOR j := 0 TO (dst.colsb / 4) - 1 -/// tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0] -/// tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1] -/// ENDFOR -/// write_row_and_zero(dst, i, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TCONJTFP16 instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The source tile. Max size is 1024 Bytes. -#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16((dst), (a)) - -static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal( - unsigned short m, unsigned short n, unsigned short k, _tile1024i dst, - _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2); -} - -static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal( - unsigned short m, unsigned short n, unsigned short k, _tile1024i dst, - _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2); -} - -static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal( - unsigned short m, unsigned short n, unsigned short k, _tile1024i dst, - _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2); -} - -static __inline__ _tile1024i __DEFAULT_FN_ATTRS -_tile_conjtfp16_internal(unsigned short m, unsigned short n, _tile1024i src) { - return __builtin_ia32_tconjtfp16_internal(m, n, src); -} - -/// Perform matrix multiplication of two tiles containing complex 
elements and -/// accumulate the results into a packed single precision tile. Each dword -/// element in input tiles src0 and src1 is interpreted as a complex number -/// with FP16 real part and FP16 imaginary part. -/// This function calculates the imaginary part of the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TTCMMIMFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col, - dst->tile, src0.tile, src1.tile); -} - -/// Perform matrix multiplication of two tiles containing complex elements and -/// accumulate the results into a packed single precision tile. Each dword -/// element in input tiles src0 and src1 is interpreted as a complex number -/// with FP16 real part and FP16 imaginary part. -/// This function calculates the real part of the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TTCMMRLFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col, - dst->tile, src0.tile, src1.tile); -} - -/// Perform matrix conjugate transpose and multiplication of two tiles -/// containing complex elements and accumulate the results into a packed -/// single precision tile. Each dword element in input tiles src0 and src1 -/// is interpreted as a complex number with FP16 real part and FP16 imaginary -/// part. 
-/// This function calculates the imaginary part of the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TCONJTCMMIMFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col, - dst->tile, src0.tile, src1.tile); -} - -/// Perform conjugate transpose of an FP16-pair of complex elements from src and -/// writes the result to dst. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TCONJTFP16 instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src -/// The source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) { - dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile); -} - -#undef __DEFAULT_FN_ATTRS - -#endif // __x86_64__ -#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H diff --git a/clang/lib/Headers/amxfp16transposeintrin.h b/clang/lib/Headers/amxfp16transposeintrin.h deleted file mode 100644 index 191f8c6097a2c..0000000000000 --- a/clang/lib/Headers/amxfp16transposeintrin.h +++ /dev/null @@ -1,94 +0,0 @@ -/*===----- amxfp16transposeintrin.h - AMX-FP16 and AMX-TRANSPOSE ------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===------------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error \ - "Never use directly; use instead." 
-#endif /* __IMMINTRIN_H */ - -#ifndef __AMX_FP16TRANSPOSEINTRIN_H -#define __AMX_FP16TRANSPOSEINTRIN_H -#ifdef __x86_64__ - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-fp16,amx-transpose"))) - -/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in -/// tiles \a a and \a b, accumulating the intermediate single-precision -/// (32-bit) floating-point elements with elements in \a dst, and store the -/// 32-bit result back to tile \a dst. -/// -/// \headerfile -/// -/// \code -/// void _tile_tdpfp16ps (__tile dst, __tile a, __tile b) -/// \endcode -/// -/// \code{.operation} -/// FOR m := 0 TO dst.rows - 1 -/// tmp := dst.row[m] -/// FOR k := 0 TO (a.colsb / 4) - 1 -/// FOR n := 0 TO (dst.colsb / 4) - 1 -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * -/// FP32(b.row[k].fp16[2*n+0]) -/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * -/// FP32(b.row[k].fp16[2*n+1]) -/// ENDFOR -/// ENDFOR -/// write_row_and_zero(dst, m, tmp, dst.colsb) -/// ENDFOR -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TTDPFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_tdpfp16ps(dst, a, b) __builtin_ia32_ttdpfp16ps((dst), (a), (b)) - -/// This is internal intrinsic. C/C++ user should avoid calling it directly. 
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS -_tile_tdpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k, - _tile1024i dst, _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_ttdpfp16ps_internal(m, n, k, dst, src1, src2); -} - -/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in -/// tiles src0 and src1, accumulating the intermediate single-precision -/// (32-bit) floating-point elements with elements in "dst", and store the -/// 32-bit result back to tile "dst". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TTDPFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS -static __inline__ void __tile_tdpfp16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_tdpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile, - src0.tile, src1.tile); -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* __x86_64__ */ -#endif /* __AMX_FP16TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h index a7da10d9951e7..208aa3580625f 100644 --- a/clang/lib/Headers/amxintrin.h +++ b/clang/lib/Headers/amxintrin.h @@ -230,8 +230,6 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { /// bytes. Since there is no 2D type in llvm IR, we use vector type to /// represent 2D tile and the fixed size is maximum amx tile register size. typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); -typedef int _tile1024i_1024a - __attribute__((__vector_size__(1024), __aligned__(1024))); /// This is internal intrinsic. C/C++ user should avoid calling it directly. 
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE diff --git a/clang/lib/Headers/amxmovrstransposeintrin.h b/clang/lib/Headers/amxmovrstransposeintrin.h deleted file mode 100644 index 5f48cba949f34..0000000000000 --- a/clang/lib/Headers/amxmovrstransposeintrin.h +++ /dev/null @@ -1,200 +0,0 @@ -/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * ===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error \ - "Never use directly; use instead." -#endif /* __IMMINTRIN_H */ - -#ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H -#define __AMX_MOVRS_TRANSPOSEINTRIN_H -#ifdef __x86_64__ - -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-transpose,amx-movrs"))) - -#define _tile_2rpntlvwz0rs(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride) -#define _tile_2rpntlvwz0rst1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride) -#define _tile_2rpntlvwz1rs(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride) -#define _tile_2rpntlvwz1rst1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride) - -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - // Use __tile1024i_1024a* to escape the alignment check in - // clang/test/Headers/x86-intrinsics-headers-clean.cpp - __builtin_ia32_t2rpntlvwz0rs_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal( - unsigned short 
row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz0rst1_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz1rs_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz1rst1_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. -/// Provides a hint to the implementation that the data will likely become -/// read shared in the near future and the data caching can be optimized. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the T2RPNTLVWZ0RS instruction. 
-/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS -static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the T2RPNTLVWZ0T1RS instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. 
-__DEFAULT_FN_ATTRS -static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. The last row will be not be read from memory but instead -/// filled with zeros. -/// Provides a hint to the implementation that the data will likely become -/// read shared in the near future and the data caching can be optimized. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the T2RPNTLVWZ1 instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS -static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. 
The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. The last row will be not be read from memory but instead -/// filled with zeros. -/// Provides a hint to the implementation that the data will likely become -/// read shared in the near future and the data caching can be optimized. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the T2RPNTLVWZ1T1RS instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS -static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -#undef __DEFAULT_FN_ATTRS -#endif /* __x86_64__ */ -#endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/amxtf32transposeintrin.h b/clang/lib/Headers/amxtf32transposeintrin.h deleted file mode 100644 index e1b90c1adfb22..0000000000000 --- a/clang/lib/Headers/amxtf32transposeintrin.h +++ /dev/null @@ -1,105 +0,0 @@ -/*===--------- amxtf32transposeintrin.h - AMX-TF32 and AMX-TRANSPOSE --------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
- * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===------------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error \ - "Never use directly; include instead." -#endif // __IMMINTRIN_H - -#ifndef __AMX_TF32TRANSPOSEINTRIN_H -#define __AMX_TF32TRANSPOSEINTRIN_H -#ifdef __x86_64__ - -#define __DEFAULT_FN_ATTRS_TF32_TRANSPOSE \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-tf32,amx-transpose"))) - -/// \code -/// void _tile_tmmultf32ps(constexpr int srcdst, constexpr int a, \ -/// constexpr int b); -/// \endcode -/// -/// This intrinsic corresponds to the TTMMULTF32PS instruction. -/// -/// \param srcdst -/// The destination tile. Max size is 1024 Bytes. -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source tile. Max size is 1024 Bytes. -/// -/// \code{.operation} -/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) { -/// dword[12:0] := 0 -/// dword[31:13] := x[31:13] -/// return dword -/// } -/// -/// DEFINE silence_snan_fp32(x[31:0]) { -/// IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0) -/// x.fraction[22] := 1 -/// return x -/// } -/// -/// elements_dest:= srcdst.colsb/4 -/// -/// FOR m := 0 TO (srcdst.rows-1) -/// tmp[511:0] := 0 -/// FOR k := 0 TO (a.rows-1) -/// FOR n := 0 TO (elements_dest-1) -/// a1e := silence_snan_fp32(a.row[k].fp32[m]) -/// a2e := silence_snan_fp32(b.row[k].fp32[n]) -/// s1e := zero_lower_mantissa_bits_fp32(a1e) -/// s2e := zero_lower_mantissa_bits_fp32(a2e) -/// tmp.fp32[n] += s1e * s2e -/// ENDFOR -/// ENDFOR -/// -/// FOR n := 0 TO (elements_dest-1) -/// tmp.fp32[n] += srcdst.row[m].fp32[n] -/// ENDFOR -/// write_row_and_zero(srcdst, m, tmp, srcdst.colsb) -/// -/// ENDFOR -/// -/// zero_upper_rows(srcdst, srcdst.rows) -/// zero_tileconfig_start() -/// \endcode -#define _tile_tmmultf32ps(srcdst, a, b) \ - 
__builtin_ia32_ttmmultf32ps((srcdst), (a), (b)) - -// dst = m x n (srcdest), src1 = k x m, src2 = k x n -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32_TRANSPOSE -_tile_tmmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k, - _tile1024i dst, _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_ttmmultf32ps_internal(m, n, k, dst, src1, src2); -} - -/// Compute transpose and do Matrix Multiplication of src0 and src1, and then do -/// Matrix Plus with dst. All the calculation is base on float32 but with the -/// lower 13-bit set to 0. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TTMMULTF32PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS_TF32_TRANSPOSE -static void __tile_tmmultf32ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_tmmultf32ps_internal(src0.row, src1.col, src0.col, - dst->tile, src0.tile, src1.tile); -} - -#endif // __x86_64__ -#endif // __AMX_TF32TRANSPOSEINTRIN_H diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h deleted file mode 100644 index b3fa37d766c45..0000000000000 --- a/clang/lib/Headers/amxtransposeintrin.h +++ /dev/null @@ -1,248 +0,0 @@ -/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * ===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; use instead." 
-#endif /* __IMMINTRIN_H */ - -#ifndef __AMX_TRANSPOSEINTRIN_H -#define __AMX_TRANSPOSEINTRIN_H -#ifdef __x86_64__ - -#define __DEFAULT_FN_ATTRS_TRANSPOSE \ - __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose"))) - -#define _tile_2rpntlvwz0(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz0(tdst, base, stride) -#define _tile_2rpntlvwz0t1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride) -#define _tile_2rpntlvwz1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz1(tdst, base, stride) -#define _tile_2rpntlvwz1t1(tdst, base, stride) \ - __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride) - -/// Transpose 32-bit elements from \a src and write the result to \a dst. -/// -/// \headerfile -/// -/// \code -/// void _tile_transposed(__tile dst, __tile src); -/// \endcode -/// -/// This intrinsic corresponds to the TTRANSPOSED instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src -/// The source tile. Max size is 1024 Bytes. 
-/// -/// \code{.operation} -/// -/// FOR i := 0 TO (dst.rows-1) -/// tmp[511:0] := 0 -/// FOR j := 0 TO (dst.colsb/4-1) -/// tmp.dword[j] := src.row[j].dword[i] -/// ENDFOR -/// dst.row[i] := tmp -/// ENDFOR -/// -/// zero_upper_rows(dst, dst.rows) -/// zero_tileconfig_start() -/// \endcode -#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src) - -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - // Use __tile1024i_1024a* to escape the alignment check in - // clang/test/Headers/x86-intrinsics-headers-clean.cpp - __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0, - (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz0t1_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0, - (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal( - unsigned short row, unsigned short col0, unsigned short col1, - _tile1024i *dst0, _tile1024i *dst1, const void *base, - __SIZE_TYPE__ stride) { - __builtin_ia32_t2rpntlvwz1t1_internal( - row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, - (__SIZE_TYPE__)(stride)); -} - -// 
This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE -_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) { - return __builtin_ia32_ttransposed_internal(m, n, src); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. -/// Provides a hint to the implementation that the data will likely not be -/// reused in the near future and the data caching can be optimized. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the T2RPNTLVWZ0 instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TRANSPOSE -static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. 
The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the T2RPNTLVWZ0T1 instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TRANSPOSE -static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. The last row will be not be read from memory but instead -/// filled with zeros. 
-/// Provides a hint to the implementation that the data will likely not be -/// reused in the near future and the data caching can be optimized. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the T2RPNTLVWZ1 instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TRANSPOSE -static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Converts a pair of tiles from memory into VNNI format, and places the -/// results in a pair of destinations specified by dst. The pair of tiles -/// in memory is specified via a tsib; the second tile is after the first -/// one, separated by the same stride that separates each row. -/// The tile configuration for the destination tiles indicates the amount -/// of data to read from memory. The instruction will load a number of rows -/// that is equal to twice the number of rows in tmm1. The size of each row -/// is equal to the average width of the destination tiles. If the second -/// tile is configured with zero rows and columns, only the first tile will -/// be written. The last row will be not be read from memory but instead -/// filled with zeros. -/// Provides a hint to the implementation that the data will likely not be -/// reused in the near future and the data caching can be optimized. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the T2RPNTLVWZ1T1 instruction. -/// -/// \param dst0 -/// First tile of destination tile pair. Max size is 1024i*2 Bytes. -/// \param dst1 -/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. 
-/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TRANSPOSE -static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1, - const void *base, __SIZE_TYPE__ stride) { - _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, - &dst1->tile, base, stride); -} - -/// Transpose 32-bit elements from src and write the result to dst. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TTRANSPOSED instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src -/// The source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS_TRANSPOSE -static void __tile_transposed(__tile1024i *dst, __tile1024i src) { - dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile); -} - -#endif /* __x86_64__ */ -#endif /* __AMX_TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index 35f012cc70043..19064a4ff5cea 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -475,24 +475,12 @@ _storebe_i64(void * __P, long long __D) { #include -#include - #include -#include - #include #include -#include - -#include - -#include - -#include - #include #include diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index 850bcb17bece1..2f61bdd9a6540 100644 --- a/clang/lib/Sema/SemaX86.cpp +++ b/clang/lib/Sema/SemaX86.cpp @@ -489,14 +489,6 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_tileloaddrst164: case X86::BI__builtin_ia32_tilestored64: case X86::BI__builtin_ia32_tilezero: - case X86::BI__builtin_ia32_t2rpntlvwz0: - case X86::BI__builtin_ia32_t2rpntlvwz0t1: - case X86::BI__builtin_ia32_t2rpntlvwz1: - case X86::BI__builtin_ia32_t2rpntlvwz1t1: - case X86::BI__builtin_ia32_t2rpntlvwz0rst1: - case X86::BI__builtin_ia32_t2rpntlvwz1rs: - case 
X86::BI__builtin_ia32_t2rpntlvwz1rst1: - case X86::BI__builtin_ia32_t2rpntlvwz0rs: case X86::BI__builtin_ia32_tcvtrowps2bf16h: case X86::BI__builtin_ia32_tcvtrowps2bf16l: case X86::BI__builtin_ia32_tcvtrowps2phh: @@ -516,17 +508,8 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_tdpbhf8ps: case X86::BI__builtin_ia32_tdphbf8ps: case X86::BI__builtin_ia32_tdphf8ps: - case X86::BI__builtin_ia32_ttdpbf16ps: - case X86::BI__builtin_ia32_ttdpfp16ps: - case X86::BI__builtin_ia32_ttcmmimfp16ps: - case X86::BI__builtin_ia32_ttcmmrlfp16ps: - case X86::BI__builtin_ia32_tconjtcmmimfp16ps: case X86::BI__builtin_ia32_tmmultf32ps: - case X86::BI__builtin_ia32_ttmmultf32ps: return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2}); - case X86::BI__builtin_ia32_ttransposed: - case X86::BI__builtin_ia32_tconjtfp16: - return CheckBuiltinTileArgumentsRange(TheCall, {0, 1}); } } static bool isX86_32Builtin(unsigned BuiltinID) { diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose.c b/clang/test/CodeGen/X86/amx_movrs_tranpose.c deleted file mode 100755 index 192c153835e1e..0000000000000 --- a/clang/test/CodeGen/X86/amx_movrs_tranpose.c +++ /dev/null @@ -1,53 +0,0 @@ -// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ -// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \ -// RUN: -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s - -#include -#include - -char buf[2048]; -#define STRIDE 32 - -// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz0rs_internal( -// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0 -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) -// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024 -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1 -// 
CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) -void test_tile_2rpntlvwz0rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) { - _tile_2rpntlvwz0rs_internal(row, col0, col1, D0, D1, B, 1); -} - -// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz0rst1_internal( -// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0 -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) -// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024 -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1 -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) -void test_tile_2rpntlvwz0rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) { - _tile_2rpntlvwz0rst1_internal(row, col0, col1, D0, D1, B, 1); -} - -// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz1rs_internal( -// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0 -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) -// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024 -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1 -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) -void test_tile_2rpntlvwz1rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) { - _tile_2rpntlvwz1rs_internal(row, col0, col1, D0, D1, B, 1); -} - -// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz1rst1_internal( -// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0 -// CHECK: call <256 x i32> 
@llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) -// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024 -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1 -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) -void test_tile_2rpntlvwz1rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) { - _tile_2rpntlvwz1rst1_internal(row, col0, col1, D0, D1, B, 1); -} diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c b/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c deleted file mode 100755 index b174cc5067bf3..0000000000000 --- a/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c +++ /dev/null @@ -1,81 +0,0 @@ -// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ -// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \ -// RUN: -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s - -#include -#include - -char buf[2048]; -#define STRIDE 32 - -void test_tile_2rpntlvwz0rs(const void *A, size_t B) { - // CHECK-LABEL: @test_tile_2rpntlvwz0rs - // CHECK: call void @llvm.x86.t2rpntlvwz0rs(i8 1, ptr %{{.*}}, i64 %{{.*}}) - _tile_2rpntlvwz0rs(1, A, B); -} - -void test_tile_2rpntlvwz0rst1(const void *A, size_t B) { - // CHECK-LABEL: @test_tile_2rpntlvwz0rst1 - // CHECK: call void @llvm.x86.t2rpntlvwz0rst1(i8 1, ptr %{{.*}}, i64 %{{.*}}) - _tile_2rpntlvwz0rst1(1, A, B); -} - -void test_tile_2rpntlvwz1rs(const void *A, size_t B) { - // CHECK-LABEL: @test_tile_2rpntlvwz1rs - // CHECK: call void @llvm.x86.t2rpntlvwz1rs(i8 1, ptr %{{.*}}, i64 %{{.*}}) - _tile_2rpntlvwz1rs(1, A, B); -} - -void test_tile_2rpntlvwz1rst1(const void *A, size_t B) { - // CHECK-LABEL: @test_tile_2rpntlvwz1rst1 - // CHECK: call void @llvm.x86.t2rpntlvwz1rst1(i8 1, ptr %{{.*}}, i64 %{{.*}}) - _tile_2rpntlvwz1rst1(1, A, B); -} - -void test__tile_2rpntlvwz0rs(__tile1024i dst0, __tile1024i dst1) { - //CHECK-LABEL: @test__tile_2rpntlvwz0rs - //CHECK: call { x86_amx, x86_amx } 
@llvm.x86.t2rpntlvwz0rs.internal - //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 - //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} - //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 - //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} - __tile_2rpntlvwz0rs(&dst0, &dst1, buf, STRIDE); -} - -void test__tile_2rpntlvwz0rst1(__tile1024i dst0, __tile1024i dst1) { - //CHECK-LABEL: @test__tile_2rpntlvwz0rst1 - //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal - //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 - //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} - //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 - //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} - __tile_2rpntlvwz0rst1(&dst0, &dst1, buf, STRIDE); -} - -void test__tile_2rpntlvwz1rs(__tile1024i dst0, __tile1024i dst1) { - //CHECK-LABEL: @test__tile_2rpntlvwz1rs - //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal - //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 - //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} - //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 - //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} - __tile_2rpntlvwz1rs(&dst0, &dst1, buf, STRIDE); -} - -void test__tile_2rpntlvwz1rst1(__tile1024i dst0, __tile1024i dst1) { - //CHECK-LABEL: 
@test__tile_2rpntlvwz1rst1 - //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal - //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 - //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} - //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 - //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} - __tile_2rpntlvwz1rst1(&dst0, &dst1, buf, STRIDE); -} diff --git a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c deleted file mode 100755 index 840b52bbb29bb..0000000000000 --- a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c +++ /dev/null @@ -1,22 +0,0 @@ -// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ -// RUN: -target-feature +amx-int8 -target-feature +amx-transpose -target-feature +amx-movrs \ -// RUN: -verify - -#include -#include - -void test_tile_2rpntlvwz0rs(const void *A, size_t B) { - _tile_2rpntlvwz0rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} -} - -void test_tile_2rpntlvwz0rst1(const void *A, size_t B) { - _tile_2rpntlvwz0rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} -} - -void test_tile_2rpntlvwz1rs(const void *A, size_t B) { - _tile_2rpntlvwz1rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} -} - -void test_tile_2rpntlvwz1rst1(const void *A, size_t B) { - _tile_2rpntlvwz1rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} -} diff --git a/clang/test/CodeGen/X86/amx_tf32.c b/clang/test/CodeGen/X86/amx_tf32.c index 661a9dfbc673b..54ad6bb714933 100644 --- a/clang/test/CodeGen/X86/amx_tf32.c +++ b/clang/test/CodeGen/X86/amx_tf32.c @@ -10,8 +10,3 @@ void test_tile_mmultf32ps(void) { 
_tile_mmultf32ps(1, 2, 3); } -void test_tile_tmmultf32ps(void) { - // CHECK-LABEL: @test_tile_tmmultf32ps( - // CHECK: call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3) - _tile_tmmultf32ps(1, 2, 3); -} diff --git a/clang/test/CodeGen/X86/amx_tf32_api.c b/clang/test/CodeGen/X86/amx_tf32_api.c index 2ac8489e3e0ba..8f574b7bc71dc 100644 --- a/clang/test/CodeGen/X86/amx_tf32_api.c +++ b/clang/test/CodeGen/X86/amx_tf32_api.c @@ -18,10 +18,3 @@ void test_tile_mmultf32ps(__tile1024i a, __tile1024i b, __tile1024i c) { __tile_mmultf32ps(&c, a, b); } -void test_tile_tmmultf32ps(__tile1024i a, __tile1024i b, __tile1024i c) { - //CHECK-LABEL: @test_tile_tmmultf32ps - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call x86_amx @llvm.x86.ttmmultf32ps.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - __tile_tmmultf32ps(&c, a, b); -} diff --git a/clang/test/CodeGen/X86/amx_tf32_errors.c b/clang/test/CodeGen/X86/amx_tf32_errors.c index 4502130692115..f0fdd060363cf 100644 --- a/clang/test/CodeGen/X86/amx_tf32_errors.c +++ b/clang/test/CodeGen/X86/amx_tf32_errors.c @@ -13,11 +13,3 @@ void test_tile_mmultf32ps() { _tile_mmultf32ps(1, 3, 3); // expected-error {{tile arguments must refer to different tiles}} } -void test_tile_tmmultf32ps() { - _tile_tmmultf32ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}} - _tile_tmmultf32ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}} - _tile_tmmultf32ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}} - _tile_tmmultf32ps(1, 1, 3); // expected-error {{tile arguments must refer to different tiles}} - _tile_tmmultf32ps(1, 2, 1); // expected-error {{tile arguments must refer to different tiles}} - _tile_tmmultf32ps(1, 2, 2); // expected-error {{tile arguments must refer to different tiles}} -} diff --git 
a/clang/test/CodeGen/X86/amx_transpose.c b/clang/test/CodeGen/X86/amx_transpose.c deleted file mode 100644 index 7e88fd80592d6..0000000000000 --- a/clang/test/CodeGen/X86/amx_transpose.c +++ /dev/null @@ -1,75 +0,0 @@ -// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-transpose \ -// RUN: -target-feature +amx-bf16 -target-feature +amx-fp16 -target-feature +amx-complex \ -// RUN: -target-feature +avx512f -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression| FileCheck %s - -#include -#include - -void test_tile_2rpntlvwz0(const void *A, size_t B) { - // CHECK-LABEL: @test_tile_2rpntlvwz0 - // CHECK: call void @llvm.x86.t2rpntlvwz0(i8 1, ptr %{{.*}}, i64 %{{.*}}) - _tile_2rpntlvwz0(1, A, B); -} - -void test_tile_2rpntlvwz0t1(const void *A, size_t B) { - // CHECK-LABEL: @test_tile_2rpntlvwz0t1 - // CHECK: call void @llvm.x86.t2rpntlvwz0t1(i8 1, ptr %{{.*}}, i64 %{{.*}}) - _tile_2rpntlvwz0t1(1, A, B); -} - -void test_tile_2rpntlvwz1(const void *A, size_t B) { - // CHECK-LABEL: @test_tile_2rpntlvwz1 - // CHECK: call void @llvm.x86.t2rpntlvwz1(i8 1, ptr %{{.*}}, i64 %{{.*}}) - _tile_2rpntlvwz1(1, A, B); -} - -void test_tile_2rpntlvwz1t1(const void *A, size_t B) { - // CHECK-LABEL: @test_tile_2rpntlvwz1t1 - // CHECK: call void @llvm.x86.t2rpntlvwz1t1(i8 1, ptr %{{.*}}, i64 %{{.*}}) - _tile_2rpntlvwz1t1(1, A, B); -} - -void test_tile_transposed(void) -{ - // CHECK-LABEL: @test_tile_transposed - // CHECK: call void @llvm.x86.ttransposed(i8 1, i8 2) - _tile_transposed(1, 2); -} - -void test_tile_tdpbf16ps(void) -{ - // CHECK-LABEL: @test_tile_tdpbf16ps - // CHECK: call void @llvm.x86.ttdpbf16ps(i8 1, i8 2, i8 3) - _tile_tdpbf16ps(1, 2, 3); -} - -void test_tile_tdpfp16ps(void) -{ - // CHECK-LABEL: @test_tile_tdpfp16ps - // CHECK: call void @llvm.x86.ttdpfp16ps(i8 4, i8 5, i8 6) - _tile_tdpfp16ps(4, 5, 6); -} - -void test_tile_tcmmimfp16ps(void) { - // CHECK-LABEL: @test_tile_tcmmimfp16ps - // CHECK: call void 
@llvm.x86.ttcmmimfp16ps(i8 1, i8 2, i8 3) - _tile_tcmmimfp16ps(1, 2, 3); -} - -void test_tile_tcmmrlfp16ps(void) { - // CHECK-LABEL: @test_tile_tcmmrlfp16ps - // CHECK: call void @llvm.x86.ttcmmrlfp16ps(i8 1, i8 2, i8 3) - _tile_tcmmrlfp16ps(1, 2, 3); -} - -void test_tile_conjtcmmimfp16ps(void) { - // CHECK-LABEL: @test_tile_conjtcmmimfp16ps - // CHECK: call void @llvm.x86.tconjtcmmimfp16ps(i8 1, i8 2, i8 3) - _tile_conjtcmmimfp16ps(1, 2, 3); -} - -void test_tile_conjtfp16(void) { - // CHECK-LABEL: @test_tile_conjtfp16 - // CHECK: call void @llvm.x86.tconjtfp16(i8 1, i8 2) - _tile_conjtfp16(1, 2); -} diff --git a/clang/test/CodeGen/X86/amx_transpose_api.c b/clang/test/CodeGen/X86/amx_transpose_api.c deleted file mode 100644 index dc3ef5104252c..0000000000000 --- a/clang/test/CodeGen/X86/amx_transpose_api.c +++ /dev/null @@ -1,114 +0,0 @@ -// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512f \ -// RUN: -target-feature +amx-transpose -target-feature +amx-bf16 -target-feature +amx-fp16 -target-feature +amx-complex \ -// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK - -#include - -char buf[2048]; -#define STRIDE 32 - -char buf2[2048]; - -void test_tile_2rpntlvwz0(__tile1024i dst0, __tile1024i dst1) { - //CHECK-LABEL: @test_tile_2rpntlvwz0 - //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal - //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 - //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} - //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 - //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} - __tile_2rpntlvwz0(&dst0, &dst1, buf, STRIDE); -} - -void test_tile_2rpntlvwz0t1(__tile1024i dst0, __tile1024i dst1) { - 
//CHECK-LABEL: @test_tile_2rpntlvwz0t1 - //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal - //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 - //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} - //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 - //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} - __tile_2rpntlvwz0t1(&dst0, &dst1, buf, STRIDE); -} - -void test_tile_2rpntlvwz1(__tile1024i dst0, __tile1024i dst1) { - //CHECK-LABEL: @test_tile_2rpntlvwz1 - //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal - //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 - //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} - //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 - //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} - __tile_2rpntlvwz1(&dst0, &dst1, buf, STRIDE); -} - -void test_tile_2rpntlvwz1t1(__tile1024i dst0, __tile1024i dst1) { - //CHECK-LABEL: @test_tile_2rpntlvwz1t1 - //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal - //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 - //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} - //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 - //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} - __tile_2rpntlvwz1t1(&dst0, &dst1, buf, STRIDE); -} - -void 
test_tile_transposed(__tile1024i dst, __tile1024i src) { - //CHECK-LABEL: @test_tile_transposed - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call x86_amx @llvm.x86.ttransposed.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - __tile_transposed(&dst, src); -} - -void test_tile_tdpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) { - //CHECK-LABEL: @test_tile_tdpbf16ps - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call x86_amx @llvm.x86.ttdpbf16ps.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - __tile_tdpbf16ps(&c, a, b); -} - -void test_tile_tdpfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) { - //CHECK-LABEL: @test_tile_tdpfp16ps - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call x86_amx @llvm.x86.ttdpfp16ps.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - __tile_tdpfp16ps(&c, a, b); -} - -void test_tile_tcmmimfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) { - //CHECK-LABEL: @test_tile_tcmmimfp16ps - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call x86_amx @llvm.x86.ttcmmimfp16ps.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - __tile_tcmmimfp16ps(&c, a, b); -} - -void test_tile_tcmmrlfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) { - //CHECK-LABEL: @test_tile_tcmmrlfp16ps - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call x86_amx @llvm.x86.ttcmmrlfp16ps.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - __tile_tcmmrlfp16ps(&c, a, b); -} - -void test_tile_conjtcmmimfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) { - //CHECK-LABEL: 
@test_tile_conjtcmmimfp16ps - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call x86_amx @llvm.x86.tconjtcmmimfp16ps.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - __tile_conjtcmmimfp16ps(&c, a, b); -} - -void test_tile_conjtfp16(__tile1024i dst, __tile1024i src) { - //CHECK-LABEL: @test_tile_conjtfp16 - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call x86_amx @llvm.x86.tconjtfp16.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - __tile_conjtfp16(&dst, src); -} diff --git a/clang/test/CodeGen/X86/amx_transpose_errors.c b/clang/test/CodeGen/X86/amx_transpose_errors.c deleted file mode 100644 index 80368c580c793..0000000000000 --- a/clang/test/CodeGen/X86/amx_transpose_errors.c +++ /dev/null @@ -1,75 +0,0 @@ -// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ -// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-transpose \ -// RUN: -target-feature +avx512f -target-feature +amx-fp16 -target-feature +amx-complex -verify - -#include -#include - -// Transpose -void test_tile_2rpntlvwz0(const void *A, size_t B) { - _tile_2rpntlvwz0(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} -} - -void test_tile_2rpntlvwz0t1(const void *A, size_t B) { - _tile_2rpntlvwz0t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} -} - -void test_tile_2rpntlvwz1(const void *A, size_t B) { - _tile_2rpntlvwz1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} -} - -void test_tile_2rpntlvwz1t1(const void *A, size_t B) { - _tile_2rpntlvwz1t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} -} - -void test_tile_tdpbf16ps() -{ - _tile_tdpbf16ps(8, 2, 3); // expected-error {{argument value 8 is outside the valid range 
[0, 7]}} - _tile_tdpbf16ps(1, 8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - _tile_tdpbf16ps(1, 2, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - _tile_tdpbf16ps(1, 1, 3); // expected-error {{tile arguments must refer to different tiles}} - _tile_tdpbf16ps(1, 2, 1); // expected-error {{tile arguments must refer to different tiles}} - _tile_tdpbf16ps(1, 2, 2); // expected-error {{tile arguments must refer to different tiles}} -} - -void test_tile_tdpfp16ps() -{ - _tile_tdpfp16ps(8, 5, 6); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - _tile_tdpfp16ps(1, 8, 6); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - _tile_tdpfp16ps(1, 5, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - _tile_tdpfp16ps(1, 1, 3); // expected-error {{tile arguments must refer to different tiles}} - _tile_tdpfp16ps(1, 2, 1); // expected-error {{tile arguments must refer to different tiles}} - _tile_tdpfp16ps(1, 2, 2); // expected-error {{tile arguments must refer to different tiles}} -} - -void test_tile_transposed() -{ - _tile_transposed(8, 2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - _tile_transposed(1, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} -} - -void test_tile_tcmmimfp16ps() { - _tile_tcmmimfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}} - _tile_tcmmimfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}} - _tile_tcmmimfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}} - _tile_tcmmimfp16ps(1, 1, 3); // expected-error {{tile arguments must refer to different tiles}} -} - -void test_tile_tcmmrlfp16ps() { - _tile_tcmmrlfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}} - _tile_tcmmrlfp16ps(1, 26, 3); // 
expected-error {{argument value 26 is outside the valid range [0, 7]}} - _tile_tcmmrlfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}} - _tile_tcmmrlfp16ps(1, 1, 3); // expected-error {{tile arguments must refer to different tiles}} -} - -void test_tile_conjtcmmimfp16ps() { - _tile_conjtcmmimfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}} - _tile_conjtcmmimfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}} - _tile_conjtcmmimfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}} - _tile_conjtcmmimfp16ps(1, 2, 1); // expected-error {{tile arguments must refer to different tiles}} -} - -void test_tile_conjtfp16() { - _tile_conjtfp16(16, 2); // expected-error {{argument value 16 is outside the valid range [0, 7]}} - _tile_conjtfp16(1, 26); // expected-error {{argument value 26 is outside the valid range [0, 7]}} -} diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index 3717c449d6601..f1660b1afb518 100644 --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -304,13 +304,6 @@ // AMX-COMPLEX: "-target-feature" "+amx-complex" // NO-AMX-COMPLEX: "-target-feature" "-amx-complex" -// RUN: %clang --target=x86_64-unknown-linux-gnu -mamx-transpose %s \ -// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-TRANSPOSE %s -// RUN: %clang --target=x86_64-unknown-linux-gnu -mno-amx-transpose %s \ -// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s -// AMX-TRANSPOSE: "-target-feature" "+amx-transpose" -// NO-AMX-TRANSPOSE: "-target-feature" "-amx-transpose" - // RUN: %clang --target=x86_64-unknown-linux-gnu -mamx-avx512 %s \ // RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-AVX512 %s // RUN: %clang --target=x86_64-unknown-linux-gnu -mno-amx-avx512 %s \ diff --git 
a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index cdb46326c2838..cf2cd4a10b056 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -1841,7 +1841,6 @@ // CHECK_DMR_M32: #define __AMX_MOVRS__ 1 // CHECK_DMR_M32: #define __AMX_TF32__ 1 // CHECK_GNR_M32: #define __AMX_TILE__ 1 -// CHECK_DMR_M32: #define __AMX_TRANSPOSE__ 1 // CHECK_DMR_M32: #define __AVX10_2_512__ 1 // CHECK_DMR_M32: #define __AVX10_2__ 1 // CHECK_GNR_M32: #define __AVX2__ 1 @@ -1947,7 +1946,6 @@ // CHECK_DMR_M64: #define __AMX_MOVRS__ 1 // CHECK_DMR_M64: #define __AMX_TF32__ 1 // CHECK_GNR_M64: #define __AMX_TILE__ 1 -// CHECK_DMR_M64: #define __AMX_TRANSPOSE__ 1 // CHECK_DMR_M64: #define __AVX10_2_512__ 1 // CHECK_DMR_M64: #define __AVX10_2__ 1 // CHECK_GNR_M64: #define __AVX2__ 1 diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c index 5f17641878761..78f8b19459c2f 100644 --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -526,18 +526,6 @@ // NO-AMX-COMPLEX-NOT: #define __AMX_COMPLEX__ 1 -// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-transpose -x c \ -// RUN: -E -dM -o - %s | FileCheck -check-prefix=AMX-TRANSPOSE %s - -// AMX-TRANSPOSE: #define __AMX_TRANSPOSE__ 1 - -// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mno-amx-transpose -x c \ -// RUN: -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s -// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-transpose -mno-amx-tile \ -// RUN: -x c -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s - -// NO-AMX-TRANSPOSE-NOT: #define __AMX_TRANSPOSE__ 1 - // RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-avx512 -x c \ // RUN: -E -dM -o - %s | FileCheck -check-prefix=AMX-AVX512 %s diff --git a/llvm/include/llvm/CodeGen/TileShapeInfo.h 
b/llvm/include/llvm/CodeGen/TileShapeInfo.h index 9cea327819895..24d9de842645a 100644 --- a/llvm/include/llvm/CodeGen/TileShapeInfo.h +++ b/llvm/include/llvm/CodeGen/TileShapeInfo.h @@ -34,30 +34,9 @@ class ShapeT { if (MRI) deduceImm(MRI); } - // When ShapeT has multiple shapes, we only use Shapes (never use Row and Col) - // and ImmShapes. Due to the most case is only one shape (just simply use - // Shape.Row or Shape.Col), so here we don't merge Row and Col into vector - // Shapes to keep the speed and code simplicity. - // TODO: The upper solution is a temporary way to minimize current tile - // register allocation code changes. It can not handle both Reg shape and - // Imm shape for different shapes (e.g. shape 1 is reg shape while shape 2 - // is imm shape). Refine me when we have more multi-tile shape instructions! - ShapeT(ArrayRef ShapesOperands, - const MachineRegisterInfo *MRI = nullptr) - : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape), - ColImm(InvalidImmShape) { - assert(ShapesOperands.size() % 2 == 0 && "Miss row or col!"); - - llvm::append_range(Shapes, ShapesOperands); - - if (MRI) - deduceImm(MRI); - } ShapeT() : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape), ColImm(InvalidImmShape) {} - // TODO: We need to extern cmp operator for multi-shapes if - // we have requirement in the future. 
bool operator==(const ShapeT &Shape) const { MachineOperand *R = Shape.Row; MachineOperand *C = Shape.Col; @@ -74,40 +53,11 @@ class ShapeT { bool operator!=(const ShapeT &Shape) const { return !(*this == Shape); } - MachineOperand *getRow(unsigned I = 0) const { - if (Shapes.empty()) - return Row; - assert(Shapes.size() / 2 >= I && "Get invalid row from id!"); - return Shapes[I * 2]; - } - - MachineOperand *getCol(unsigned I = 0) const { - if (Shapes.empty()) - return Col; - assert(Shapes.size() / 2 >= I && "Get invalid col from id!"); - return Shapes[I * 2 + 1]; - } - - int64_t getRowImm(unsigned I = 0) const { - if (ImmShapes.empty()) - return RowImm; - assert(ImmShapes.size() / 2 >= I && "Get invalid imm row from id!"); - return ImmShapes[I * 2]; - } - - int64_t getColImm(unsigned I = 0) const { - if (ImmShapes.empty()) - return ColImm; - assert(ImmShapes.size() / 2 >= I && "Get invalid imm col from id!"); - return ImmShapes[I * 2 + 1]; - } + MachineOperand *getRow() const { return Row; } + MachineOperand *getCol() const { return Col; } - unsigned getShapeNum() { - if (Shapes.empty()) - return isValid() ? 
1 : 0; - else - return Shapes.size() / 2; - } + int64_t getRowImm() const { return RowImm; } + int64_t getColImm() const { return ColImm; } bool isValid() { return (Row != nullptr) && (Col != nullptr); } @@ -120,35 +70,14 @@ class ShapeT { for (const MachineOperand &DefMO : MRI->def_operands(Reg)) { const auto *MI = DefMO.getParent(); if (MI->isMoveImmediate()) { - assert(MI->getNumOperands() == 2 && - "Unsupported number of operands in instruction for setting " - "row/column."); - if (MI->getOperand(1).isImm()) { - Imm = MI->getOperand(1).getImm(); - } else { - assert(MI->getOperand(1).isImplicit() && - "Operand 1 is assumed to be implicit."); - Imm = 0; - } + Imm = MI->getOperand(1).getImm(); break; } } return Imm; }; - if (Shapes.empty()) { // Single Shape - RowImm = GetImm(Row->getReg()); - ColImm = GetImm(Col->getReg()); - // The number of rows of 2nd destination buffer is assigned by the one of - // 1st destination buffer. If the column size is equal to zero, the row - // size should be reset to zero too. 
- if (ColImm == 0) - Row = Col; - } else { // Multiple Shapes - for (auto *Shape : Shapes) { - int64_t ImmShape = GetImm(Shape->getReg()); - ImmShapes.push_back(ImmShape); - } - } + RowImm = GetImm(Row->getReg()); + ColImm = GetImm(Col->getReg()); } private: @@ -157,9 +86,6 @@ class ShapeT { MachineOperand *Col; int64_t RowImm = -1; int64_t ColImm = -1; - // Multiple Shapes - SmallVector Shapes; - SmallVector ImmShapes; }; } // namespace llvm diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 81fbfbf0bb1b4..1dd23f60c7e1e 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5505,46 +5505,6 @@ let TargetPrefix = "x86" in { [ImmArg>, ImmArg>, ImmArg>]>; - // AMX-TRANSPOSE - def int_x86_t2rpntlvwz0 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0">, - Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], - [ImmArg>]>; - def int_x86_t2rpntlvwz0t1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0t1">, - Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], - [ImmArg>]>; - def int_x86_t2rpntlvwz1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1">, - Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], - [ImmArg>]>; - def int_x86_t2rpntlvwz1t1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1t1">, - Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], - [ImmArg>]>; - def int_x86_ttransposed : ClangBuiltin<"__builtin_ia32_ttransposed">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty], - [ImmArg>, ImmArg>]>; - def int_x86_ttdpbf16ps : ClangBuiltin<"__builtin_ia32_ttdpbf16ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg>, ImmArg>, - ImmArg>]>; - def int_x86_ttdpfp16ps : ClangBuiltin<"__builtin_ia32_ttdpfp16ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg>, ImmArg>, - ImmArg>]>; - def int_x86_ttcmmimfp16ps : ClangBuiltin<"__builtin_ia32_ttcmmimfp16ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg>, ImmArg>, - ImmArg>]>; - def int_x86_ttcmmrlfp16ps : 
ClangBuiltin<"__builtin_ia32_ttcmmrlfp16ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg>, ImmArg>, - ImmArg>]>; - def int_x86_tconjtcmmimfp16ps : ClangBuiltin<"__builtin_ia32_tconjtcmmimfp16ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg>, ImmArg>, - ImmArg>]>; - def int_x86_tconjtfp16 : ClangBuiltin<"__builtin_ia32_tconjtfp16">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty], - [ImmArg>, ImmArg>]>; - // AMX-MORVS, AMX-TRANSPOSE def int_x86_t2rpntlvwz0rs : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0rs">, Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], @@ -5685,61 +5645,6 @@ let TargetPrefix = "x86" in { [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], [IntrArgMemOnly]>; - def int_x86_t2rpntlvwz0_internal : - Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], - []>; - def int_x86_t2rpntlvwz0t1_internal : - Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], - []>; - def int_x86_t2rpntlvwz1_internal : - Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], - []>; - def int_x86_t2rpntlvwz1t1_internal : - Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], - []>; - def int_x86_ttransposed_internal : - ClangBuiltin<"__builtin_ia32_ttransposed_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty], []>; - def int_x86_ttdpbf16ps_internal : - ClangBuiltin<"__builtin_ia32_ttdpbf16ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, - llvm_x86amx_ty, llvm_x86amx_ty, - llvm_x86amx_ty], []>; - def int_x86_ttdpfp16ps_internal : - ClangBuiltin<"__builtin_ia32_ttdpfp16ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, - llvm_x86amx_ty, llvm_x86amx_ty, - llvm_x86amx_ty], []>; - def 
int_x86_ttcmmimfp16ps_internal : - ClangBuiltin<"__builtin_ia32_ttcmmimfp16ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, - llvm_x86amx_ty, llvm_x86amx_ty, - llvm_x86amx_ty], []>; - def int_x86_ttcmmrlfp16ps_internal : - ClangBuiltin<"__builtin_ia32_ttcmmrlfp16ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, - llvm_x86amx_ty, llvm_x86amx_ty, - llvm_x86amx_ty], []>; - def int_x86_tconjtcmmimfp16ps_internal : - ClangBuiltin<"__builtin_ia32_tconjtcmmimfp16ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, - llvm_x86amx_ty, llvm_x86amx_ty, - llvm_x86amx_ty], []>; - def int_x86_tconjtfp16_internal : - ClangBuiltin<"__builtin_ia32_tconjtfp16_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty], []>; - def int_x86_tcvtrowd2ps_internal : ClangBuiltin<"__builtin_ia32_tcvtrowd2ps_internal">, Intrinsic<[llvm_v16f32_ty], @@ -5775,20 +5680,11 @@ let TargetPrefix = "x86" in { Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], [ImmArg>, ImmArg>, ImmArg>]>; - def int_x86_ttmmultf32ps : ClangBuiltin<"__builtin_ia32_ttmmultf32ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg>, ImmArg>, - ImmArg>]>; def int_x86_tmmultf32ps_internal : ClangBuiltin<"__builtin_ia32_tmmultf32ps_internal">, Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_x86amx_ty, llvm_x86amx_ty], []>; - def int_x86_ttmmultf32ps_internal : - ClangBuiltin<"__builtin_ia32_ttmmultf32ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, - llvm_x86amx_ty, llvm_x86amx_ty], []>; def int_x86_tdpbf8ps_internal : ClangBuiltin<"__builtin_ia32_tdpbf8ps_internal">, diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h index 4aa6c01d29cc2..6f6f65dc075f3 100644 --- 
a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h +++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h @@ -511,7 +511,6 @@ enum OperandEncoding { ENCODINGS ENCODING_max }; ENUM_ENTRY(TYPE_VK, "mask register") \ ENUM_ENTRY(TYPE_VK_PAIR, "mask register pair") \ ENUM_ENTRY(TYPE_TMM, "tile") \ - ENUM_ENTRY(TYPE_TMM_PAIR, "tile pair") \ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \ ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \ diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index a94eab1d7ae34..78cf46406192e 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -268,7 +268,6 @@ X86_FEATURE_COMPAT(AVX10_2_512, "avx10.2-512", 0) X86_FEATURE (MOVRS, "movrs") X86_FEATURE (ZU, "zu") X86_FEATURE (AMX_FP8, "amx-fp8") -X86_FEATURE (AMX_TRANSPOSE, "amx-transpose") X86_FEATURE (AMX_MOVRS, "amx-movrs") X86_FEATURE (AMX_AVX512, "amx-avx512") X86_FEATURE (AMX_TF32, "amx-tf32") diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h index 89ac53e0ecac9..a92272573bacd 100644 --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -620,37 +620,6 @@ struct X86Operand final : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createReg(Reg)); } - bool isTILEPair() const { - return Kind == Register && - X86MCRegisterClasses[X86::TILERegClassID].contains(getReg()); - } - - void addTILEPairOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - MCRegister Reg = getReg(); - switch (Reg.id()) { - default: - llvm_unreachable("Invalid tile register!"); - case X86::TMM0: - case X86::TMM1: - Reg = X86::TMM0_TMM1; - break; - case X86::TMM2: - case X86::TMM3: - Reg = X86::TMM2_TMM3; - break; - case X86::TMM4: - case X86::TMM5: - Reg = 
X86::TMM4_TMM5; - break; - case X86::TMM6: - case X86::TMM7: - Reg = X86::TMM6_TMM7; - break; - } - Inst.addOperand(MCOperand::createReg(Reg)); - } - void addMemOperands(MCInst &Inst, unsigned N) const { assert((N == 5) && "Invalid number of operands!"); if (getMemBaseReg()) diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index 4927b453458ef..7d2b5eb900133 100644 --- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -810,10 +810,6 @@ static int readModRM(struct InternalInstruction *insn) { if (index > 7) \ *valid = 0; \ return prefix##_TMM0 + index; \ - case TYPE_TMM_PAIR: \ - if (index > 7) \ - *valid = 0; \ - return prefix##_TMM0_TMM1 + (index / 2); \ case TYPE_VK: \ index &= 0xf; \ if (index > 7) \ @@ -2323,7 +2319,6 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_YMM: case TYPE_ZMM: case TYPE_TMM: - case TYPE_TMM_PAIR: case TYPE_VK_PAIR: case TYPE_VK: case TYPE_DEBUGREG: diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index dc9af2caa77b1..b0aa70be12d83 100644 --- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -535,12 +535,6 @@ namespace X86Disassembler { ENTRY(TMM6) \ ENTRY(TMM7) -#define REGS_TMM_PAIRS \ - ENTRY(TMM0_TMM1) \ - ENTRY(TMM2_TMM3) \ - ENTRY(TMM4_TMM5) \ - ENTRY(TMM6_TMM7) - #define ALL_EA_BASES \ EA_BASES_16BIT \ EA_BASES_32BIT \ @@ -565,7 +559,6 @@ namespace X86Disassembler { REGS_DEBUG \ REGS_CONTROL \ REGS_TMM \ - REGS_TMM_PAIRS \ ENTRY(RIP) /// All possible values of the base field for effective-address diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index 1c5f1663d4f52..759d95e5a18ea 100644 --- 
a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -467,22 +467,3 @@ void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo, } llvm_unreachable("Unknown mask pair register name"); } - -void X86InstPrinterCommon::printTILEPair(const MCInst *MI, unsigned OpNo, - raw_ostream &OS) { - switch (MI->getOperand(OpNo).getReg()) { - case X86::TMM0_TMM1: - printRegName(OS, X86::TMM0); - return; - case X86::TMM2_TMM3: - printRegName(OS, X86::TMM2); - return; - case X86::TMM4_TMM5: - printRegName(OS, X86::TMM4); - return; - case X86::TMM6_TMM7: - printRegName(OS, X86::TMM6); - return; - } - llvm_unreachable("Unknown mask pair register name"); -} diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h index 2c9467ca7c615..cb55f2f0019b5 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h @@ -40,7 +40,6 @@ class X86InstPrinterCommon : public MCInstPrinter { const MCSubtargetInfo &STI); void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS); - void printTILEPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS); }; } // end namespace llvm diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index a1fd366e59444..9e291a6ae431f 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -274,9 +274,6 @@ def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true", def FeatureAMXMOVRS : SubtargetFeature<"amx-movrs", "HasAMXMOVRS", "true", "Support AMX-MOVRS instructions", [FeatureAMXTILE]>; -def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true", - "Support AMX amx-transpose instructions", - [FeatureAMXTILE]>; def FeatureAMXAVX512 : SubtargetFeature<"amx-avx512", "HasAMXAVX512", "true", 
"Support AMX-AVX512 instructions", @@ -1177,8 +1174,7 @@ def ProcessorFeatures { FeatureAMXMOVRS, FeatureAMXAVX512, FeatureAMXFP8, - FeatureAMXTF32, - FeatureAMXTRANSPOSE]; + FeatureAMXTF32]; list DMRFeatures = !listconcat(GNRDFeatures, DMRAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 4a9b824b0db14..e3c44c048f7bf 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -649,149 +649,6 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.setDesc(TII->get(Opc)); return true; } - // TILEPAIRLOAD is just for TILEPair spill, we don't have corresponding - // AMX instruction to support it. So, split it to 2 load instructions: - // "TILEPAIRLOAD TMM0:TMM1, Base, Scale, Index, Offset, Segment" --> - // "TILELOAD TMM0, Base, Scale, Index, Offset, Segment" + - // "TILELOAD TMM1, Base, Scale, Index, Offset + TMM_SIZE, Segment" - case X86::PTILEPAIRLOAD: { - int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm(); - Register TReg = MBBI->getOperand(0).getReg(); - bool DstIsDead = MBBI->getOperand(0).isDead(); - Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0); - Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1); - unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8; - - MachineInstrBuilder MIBLo = - BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD)) - .addReg(TReg0, RegState::Define | getDeadRegState(DstIsDead)); - MachineInstrBuilder MIBHi = - BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD)) - .addReg(TReg1, RegState::Define | getDeadRegState(DstIsDead)); - - for (int i = 0; i < X86::AddrNumOperands; ++i) { - MIBLo.add(MBBI->getOperand(1 + i)); - if (i == X86::AddrDisp) - MIBHi.addImm(Disp + TmmSize); - else - MIBHi.add(MBBI->getOperand(1 + i)); - } - - // Make sure the first stride reg used in first tileload is alive. 
- MachineOperand &Stride = - MIBLo.getInstr()->getOperand(1 + X86::AddrIndexReg); - Stride.setIsKill(false); - - // Split the memory operand, adjusting the offset and size for the halves. - MachineMemOperand *OldMMO = MBBI->memoperands().front(); - MachineFunction *MF = MBB.getParent(); - MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize); - MachineMemOperand *MMOHi = - MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize); - - MIBLo.setMemRefs(MMOLo); - MIBHi.setMemRefs(MMOHi); - - // Delete the pseudo. - MBB.erase(MBBI); - return true; - } - // Similar with TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill, no - // corresponding AMX instruction to support it. So, split it too: - // "TILEPAIRSTORE Base, Scale, Index, Offset, Segment, TMM0:TMM1" --> - // "TILESTORE Base, Scale, Index, Offset, Segment, TMM0" + - // "TILESTORE Base, Scale, Index, Offset + TMM_SIZE, Segment, TMM1" - case X86::PTILEPAIRSTORE: { - int64_t Disp = MBBI->getOperand(X86::AddrDisp).getImm(); - Register TReg = MBBI->getOperand(X86::AddrNumOperands).getReg(); - bool SrcIsKill = MBBI->getOperand(X86::AddrNumOperands).isKill(); - Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0); - Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1); - unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8; - - MachineInstrBuilder MIBLo = - BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED)); - MachineInstrBuilder MIBHi = - BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED)); - - for (int i = 0; i < X86::AddrNumOperands; ++i) { - MIBLo.add(MBBI->getOperand(i)); - if (i == X86::AddrDisp) - MIBHi.addImm(Disp + TmmSize); - else - MIBHi.add(MBBI->getOperand(i)); - } - MIBLo.addReg(TReg0, getKillRegState(SrcIsKill)); - MIBHi.addReg(TReg1, getKillRegState(SrcIsKill)); - - // Make sure the first stride reg used in first tilestore is alive. 
- MachineOperand &Stride = MIBLo.getInstr()->getOperand(X86::AddrIndexReg); - Stride.setIsKill(false); - - // Split the memory operand, adjusting the offset and size for the halves. - MachineMemOperand *OldMMO = MBBI->memoperands().front(); - MachineFunction *MF = MBB.getParent(); - MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize); - MachineMemOperand *MMOHi = - MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize); - - MIBLo.setMemRefs(MMOLo); - MIBHi.setMemRefs(MMOHi); - - // Delete the pseudo. - MBB.erase(MBBI); - return true; - } - case X86::PT2RPNTLVWZ0V: - case X86::PT2RPNTLVWZ0T1V: - case X86::PT2RPNTLVWZ1V: - case X86::PT2RPNTLVWZ1T1V: - case X86::PT2RPNTLVWZ0RSV: - case X86::PT2RPNTLVWZ0RST1V: - case X86::PT2RPNTLVWZ1RSV: - case X86::PT2RPNTLVWZ1RST1V: { - for (unsigned i = 3; i > 0; --i) - MI.removeOperand(i); - unsigned Opc; - switch (Opcode) { - case X86::PT2RPNTLVWZ0V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); - break; - case X86::PT2RPNTLVWZ0T1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); - break; - case X86::PT2RPNTLVWZ1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); - break; - case X86::PT2RPNTLVWZ1T1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); - break; - case X86::PT2RPNTLVWZ0RSV: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); - break; - case X86::PT2RPNTLVWZ0RST1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); - break; - case X86::PT2RPNTLVWZ1RSV: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); - break; - case X86::PT2RPNTLVWZ1RST1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); - break; - default: - llvm_unreachable("Impossible Opcode!"); - } - MI.setDesc(TII->get(Opc)); - return true; - } - case X86::PTTRANSPOSEDV: - case X86::PTCONJTFP16V: { - for (int i = 2; i > 0; --i) - MI.removeOperand(i); - MI.setDesc(TII->get(Opcode == X86::PTTRANSPOSEDV ? 
X86::TTRANSPOSED - : X86::TCONJTFP16)); - return true; - } case X86::PTCMMIMFP16PSV: case X86::PTCMMRLFP16PSV: case X86::PTDPBSSDV: @@ -800,13 +657,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PTDPBUUDV: case X86::PTDPBF16PSV: case X86::PTDPFP16PSV: - case X86::PTTDPBF16PSV: - case X86::PTTDPFP16PSV: - case X86::PTTCMMIMFP16PSV: - case X86::PTTCMMRLFP16PSV: - case X86::PTCONJTCMMIMFP16PSV: case X86::PTMMULTF32PSV: - case X86::PTTMMULTF32PSV: case X86::PTDPBF8PSV: case X86::PTDPBHF8PSV: case X86::PTDPHBF8PSV: @@ -816,6 +667,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.removeOperand(i); unsigned Opc; switch (Opcode) { + // clang-format off case X86::PTCMMIMFP16PSV: Opc = X86::TCMMIMFP16PS; break; case X86::PTCMMRLFP16PSV: Opc = X86::TCMMRLFP16PS; break; case X86::PTDPBSSDV: Opc = X86::TDPBSSD; break; @@ -824,40 +676,12 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PTDPBUUDV: Opc = X86::TDPBUUD; break; case X86::PTDPBF16PSV: Opc = X86::TDPBF16PS; break; case X86::PTDPFP16PSV: Opc = X86::TDPFP16PS; break; - case X86::PTTDPBF16PSV: - Opc = X86::TTDPBF16PS; - break; - case X86::PTTDPFP16PSV: - Opc = X86::TTDPFP16PS; - break; - case X86::PTTCMMIMFP16PSV: - Opc = X86::TTCMMIMFP16PS; - break; - case X86::PTTCMMRLFP16PSV: - Opc = X86::TTCMMRLFP16PS; - break; - case X86::PTCONJTCMMIMFP16PSV: - Opc = X86::TCONJTCMMIMFP16PS; - break; - case X86::PTMMULTF32PSV: - Opc = X86::TMMULTF32PS; - break; - case X86::PTTMMULTF32PSV: - Opc = X86::TTMMULTF32PS; - break; - case X86::PTDPBF8PSV: - Opc = X86::TDPBF8PS; - break; - case X86::PTDPBHF8PSV: - Opc = X86::TDPBHF8PS; - break; - case X86::PTDPHBF8PSV: - Opc = X86::TDPHBF8PS; - break; - case X86::PTDPHF8PSV: - Opc = X86::TDPHF8PS; - break; - + case X86::PTMMULTF32PSV: Opc = X86::TMMULTF32PS; break; + case X86::PTDPBF8PSV: Opc = X86::TDPBF8PS; break; + case X86::PTDPBHF8PSV: Opc = X86::TDPBHF8PS; break; + case X86::PTDPHBF8PSV: Opc = X86::TDPHBF8PS; break; + case 
X86::PTDPHF8PSV: Opc = X86::TDPHF8PS; break; + // clang-format on default: llvm_unreachable("Unexpected Opcode"); } diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp index 787b71d425cb3..06f729a7e0cdc 100644 --- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp @@ -267,24 +267,16 @@ void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI, << printReg(TileReg, TRI) << '\n'); } -static unsigned getTileDefNum(MachineRegisterInfo *MRI, Register Reg) { - if (Reg.isVirtual()) { - unsigned RegClassID = MRI->getRegClass(Reg)->getID(); - if (RegClassID == X86::TILERegClassID) - return 1; - if (RegClassID == X86::TILEPAIRRegClassID) - return 2; - } else { - if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return 1; - if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) - return 2; +static bool isTileRegister(MachineRegisterInfo *MRI, Register Reg) { + if (Reg.isVirtual() && + (MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)) { + return true; } - return 0; -} -static bool isTileRegister(MachineRegisterInfo *MRI, Register VirtReg) { - return getTileDefNum(MRI, VirtReg) > 0; + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return true; + + return false; } static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { @@ -296,7 +288,7 @@ static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { if (!MO.isReg()) return false; - return getTileDefNum(MRI, MO.getReg()) > 0; + return isTileRegister(MRI, MO.getReg()); } static ShapeT getShape(MachineRegisterInfo *MRI, Register TileReg) { @@ -636,19 +628,7 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) { else if (dominates(MBB, LastShapeMI, ColMI)) LastShapeMI = ColMI; } - unsigned TileDefNum = getTileDefNum(MRI, MI.getOperand(0).getReg()); - if (TileDefNum > 1) { - for (unsigned I = 1; I < TileDefNum; I++) { - MachineOperand *ColxMO = &MI.getOperand(2 + I); - MachineInstr *ColxMI = 
MRI->getVRegDef(ColxMO->getReg()); - if (ColxMI->getParent() == &MBB) { - if (!LastShapeMI) - LastShapeMI = ColxMI; - else if (dominates(MBB, LastShapeMI, ColxMI)) - LastShapeMI = ColxMI; - } - } - } + // If there is user live out of the tilecfg, spill it and reload in // before the user. Register TileReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp index 11d331b11737f..d86ae36aa2a67 100644 --- a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -77,14 +77,14 @@ INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE, INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE, "Fast Tile Register Configure", false, false) -static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) { +static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { // There is no phi instruction after register allocation. assert(MI.isPHI() == false); // The instruction must have 3 operands: tile def, row, col. // It should be AMX pseudo instruction that have shape operand. if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 || !MI.isPseudo()) - return 0; + return false; MachineOperand &MO = MI.getOperand(0); if (MO.isReg()) { @@ -93,24 +93,18 @@ static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) { // register is not rewritten yet. 
if (Reg.isVirtual()) { if (MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) - return 1; - if (MRI->getRegClass(Reg)->getID() == X86::TILEPAIRRegClassID) - return 2; + return true; } if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return 1; - if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) - return 2; + return true; } - return 0; + return false; } static unsigned getTMMIndex(Register Reg) { if (Reg >= X86::TMM0 && Reg <= X86::TMM7) return Reg - X86::TMM0; - if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) - return (Reg - X86::TMM0_TMM1) * 2; llvm_unreachable("Invalid Tmm Reg!"); } @@ -120,17 +114,14 @@ bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) { bool Change = false; SmallVector, 6> ShapeInfos; for (MachineInstr &MI : reverse(MBB)) { - unsigned DefNum = getNumDefTiles(MRI, MI); - if (DefNum == 0 && MI.getOpcode() != X86::PLDTILECFGV) + if (!isTileDef(MRI, MI) && MI.getOpcode() != X86::PLDTILECFGV) continue; // AMX instructions that define tile register. if (MI.getOpcode() != X86::PLDTILECFGV) { MachineOperand &Row = MI.getOperand(1); unsigned TMMIdx = getTMMIndex(MI.getOperand(0).getReg()); - for (unsigned I = 0; I < DefNum; I++) { - MachineOperand &Col = MI.getOperand(2 + I); - ShapeInfos.push_back({TMMIdx + I, ShapeT(&Row, &Col)}); - } + MachineOperand &Col = MI.getOperand(2); + ShapeInfos.push_back({TMMIdx, ShapeT(&Row, &Col)}); } else { // PLDTILECFGV // Rewrite the shape information to memory. Stack slot should have // been initialized to zero in pre config. diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 4393f6ecaa033..d4418c8563780 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -337,23 +337,8 @@ namespace { // lowering but before ISEL. bool isAMXSDNode(SDNode *N) const { // Check if N is AMX SDNode: - // 1. check specific opcode since these carry MVT::Untyped instead of - // x86amx_type; - // 2. check result type; - // 3. 
check operand type; - switch (N->getOpcode()) { - default: - break; - case X86::PT2RPNTLVWZ0V: - case X86::PT2RPNTLVWZ0T1V: - case X86::PT2RPNTLVWZ1V: - case X86::PT2RPNTLVWZ1T1V: - case X86::PT2RPNTLVWZ0RSV: - case X86::PT2RPNTLVWZ0RST1V: - case X86::PT2RPNTLVWZ1RSV: - case X86::PT2RPNTLVWZ1RST1V: - return true; - } + // 1. check result type; + // 2. check operand type; for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) { if (N->getValueType(Idx) == MVT::x86amx) return true; @@ -5398,65 +5383,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, CNode); return; } - case Intrinsic::x86_t2rpntlvwz0rs: - case Intrinsic::x86_t2rpntlvwz0rst1: - case Intrinsic::x86_t2rpntlvwz1rs: - case Intrinsic::x86_t2rpntlvwz1rst1: - if (!Subtarget->hasAMXMOVRS()) - break; - [[fallthrough]]; - case Intrinsic::x86_t2rpntlvwz0: - case Intrinsic::x86_t2rpntlvwz0t1: - case Intrinsic::x86_t2rpntlvwz1: - case Intrinsic::x86_t2rpntlvwz1t1: { - if (!Subtarget->hasAMXTRANSPOSE()) - break; - auto *MFI = - CurDAG->getMachineFunction().getInfo(); - MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); - unsigned Opc; - switch (IntNo) { - default: - llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_t2rpntlvwz0: - Opc = X86::PT2RPNTLVWZ0; - break; - case Intrinsic::x86_t2rpntlvwz0t1: - Opc = X86::PT2RPNTLVWZ0T1; - break; - case Intrinsic::x86_t2rpntlvwz1: - Opc = X86::PT2RPNTLVWZ1; - break; - case Intrinsic::x86_t2rpntlvwz1t1: - Opc = X86::PT2RPNTLVWZ1T1; - break; - case Intrinsic::x86_t2rpntlvwz0rs: - Opc = X86::PT2RPNTLVWZ0RS; - break; - case Intrinsic::x86_t2rpntlvwz0rst1: - Opc = X86::PT2RPNTLVWZ0RST1; - break; - case Intrinsic::x86_t2rpntlvwz1rs: - Opc = X86::PT2RPNTLVWZ1RS; - break; - case Intrinsic::x86_t2rpntlvwz1rst1: - Opc = X86::PT2RPNTLVWZ1RST1; - break; - } - // FIXME: Match displacement and scale. 
- unsigned TIndex = Node->getConstantOperandVal(2); - SDValue TReg = getI8Imm(TIndex, dl); - SDValue Base = Node->getOperand(3); - SDValue Scale = getI8Imm(1, dl); - SDValue Index = Node->getOperand(4); - SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); - SDValue Segment = CurDAG->getRegister(0, MVT::i16); - SDValue Chain = Node->getOperand(0); - SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; - MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); - ReplaceNode(Node, CNode); - return; - } } break; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9525e03baa167..1ce419ba00824 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -27946,67 +27946,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } - case Intrinsic::x86_t2rpntlvwz0rs_internal: - case Intrinsic::x86_t2rpntlvwz0rst1_internal: - case Intrinsic::x86_t2rpntlvwz1rs_internal: - case Intrinsic::x86_t2rpntlvwz1rst1_internal: - case Intrinsic::x86_t2rpntlvwz0_internal: - case Intrinsic::x86_t2rpntlvwz0t1_internal: - case Intrinsic::x86_t2rpntlvwz1_internal: - case Intrinsic::x86_t2rpntlvwz1t1_internal: { - auto *X86MFI = DAG.getMachineFunction().getInfo(); - X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); - unsigned IntNo = Op.getConstantOperandVal(1); - unsigned Opc = 0; - switch (IntNo) { - default: - llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_t2rpntlvwz0_internal: - Opc = X86::PT2RPNTLVWZ0V; - break; - case Intrinsic::x86_t2rpntlvwz0t1_internal: - Opc = X86::PT2RPNTLVWZ0T1V; - break; - case Intrinsic::x86_t2rpntlvwz1_internal: - Opc = X86::PT2RPNTLVWZ1V; - break; - case Intrinsic::x86_t2rpntlvwz1t1_internal: - Opc = X86::PT2RPNTLVWZ1T1V; - break; - case Intrinsic::x86_t2rpntlvwz0rs_internal: - Opc = 
X86::PT2RPNTLVWZ0RSV; - break; - case Intrinsic::x86_t2rpntlvwz0rst1_internal: - Opc = X86::PT2RPNTLVWZ0RST1V; - break; - case Intrinsic::x86_t2rpntlvwz1rs_internal: - Opc = X86::PT2RPNTLVWZ1RSV; - break; - case Intrinsic::x86_t2rpntlvwz1rst1_internal: - Opc = X86::PT2RPNTLVWZ1RST1V; - break; - } - - SDLoc DL(Op); - SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other); - - SDValue Ops[] = {Op.getOperand(2), // Row - Op.getOperand(3), // Col0 - Op.getOperand(4), // Col1 - Op.getOperand(5), // Base - DAG.getTargetConstant(1, DL, MVT::i8), // Scale - Op.getOperand(6), // Index - DAG.getTargetConstant(0, DL, MVT::i32), // Disp - DAG.getRegister(0, MVT::i16), // Segment - Op.getOperand(0)}; // Chain - - MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops); - SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx, - SDValue(Res, 0)); - SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx, - SDValue(Res, 0)); - return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL); - } case Intrinsic::x86_atomic_bts_rm: case Intrinsic::x86_atomic_btc_rm: case Intrinsic::x86_atomic_btr_rm: { @@ -37745,10 +37684,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, assert (Imm < 8 && "Illegal tmm index"); return X86::TMM0 + Imm; }; - auto TMMImmToTMMPair = [](unsigned Imm) { - assert(Imm < 8 && "Illegal tmm pair index."); - return X86::TMM0_TMM1 + Imm / 2; - }; switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); @@ -38129,53 +38064,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTDPBHF8PS: case X86::PTDPHBF8PS: case X86::PTDPHF8PS: - case X86::PTTDPBF16PS: - case X86::PTTDPFP16PS: - case X86::PTTCMMIMFP16PS: - case X86::PTTCMMRLFP16PS: - case X86::PTCONJTCMMIMFP16PS: - case X86::PTMMULTF32PS: - case X86::PTTMMULTF32PS: { + case X86::PTMMULTF32PS: { unsigned Opc; switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); + // clang-format off 
case X86::PTDPBSSD: Opc = X86::TDPBSSD; break; case X86::PTDPBSUD: Opc = X86::TDPBSUD; break; case X86::PTDPBUSD: Opc = X86::TDPBUSD; break; case X86::PTDPBUUD: Opc = X86::TDPBUUD; break; case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break; case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break; - case X86::PTCMMIMFP16PS: - Opc = X86::TCMMIMFP16PS; - break; - case X86::PTCMMRLFP16PS: - Opc = X86::TCMMRLFP16PS; - break; + case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break; + case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break; case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break; case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break; case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break; case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break; - case X86::PTTDPBF16PS: - Opc = X86::TTDPBF16PS; - break; - case X86::PTTDPFP16PS: - Opc = X86::TTDPFP16PS; - break; - case X86::PTTCMMIMFP16PS: - Opc = X86::TTCMMIMFP16PS; - break; - case X86::PTTCMMRLFP16PS: - Opc = X86::TTCMMRLFP16PS; - break; - case X86::PTCONJTCMMIMFP16PS: - Opc = X86::TCONJTCMMIMFP16PS; - break; - case X86::PTMMULTF32PS: - Opc = X86::TMMULTF32PS; - break; - case X86::PTTMMULTF32PS: - Opc = X86::TTMMULTF32PS; - break; + case X86::PTMMULTF32PS: Opc = X86::TMMULTF32PS; break; + // clang-format on } MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); @@ -38246,70 +38153,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } - case X86::PT2RPNTLVWZ0: - case X86::PT2RPNTLVWZ0T1: - case X86::PT2RPNTLVWZ1: - case X86::PT2RPNTLVWZ1T1: - case X86::PT2RPNTLVWZ0RS: - case X86::PT2RPNTLVWZ0RST1: - case X86::PT2RPNTLVWZ1RS: - case X86::PT2RPNTLVWZ1RST1: { - const DebugLoc &DL = MI.getDebugLoc(); - unsigned Opc; -#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? 
OPC##_EVEX : OPC) - switch (MI.getOpcode()) { - default: - llvm_unreachable("Unexpected instruction!"); - case X86::PT2RPNTLVWZ0: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); - break; - case X86::PT2RPNTLVWZ0T1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); - break; - case X86::PT2RPNTLVWZ1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); - break; - case X86::PT2RPNTLVWZ1T1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); - break; - case X86::PT2RPNTLVWZ0RS: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); - break; - case X86::PT2RPNTLVWZ0RST1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); - break; - case X86::PT2RPNTLVWZ1RS: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); - break; - case X86::PT2RPNTLVWZ1RST1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); - break; - } -#undef GET_EGPR_IF_ENABLED - MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); - MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define); - - MIB.add(MI.getOperand(1)); // base - MIB.add(MI.getOperand(2)); // scale - MIB.add(MI.getOperand(3)); // index - MIB.add(MI.getOperand(4)); // displacement - MIB.add(MI.getOperand(5)); // segment - MI.eraseFromParent(); // The pseudo is gone now. - return BB; - } - case X86::PTTRANSPOSED: - case X86::PTCONJTFP16: { - const DebugLoc &DL = MI.getDebugLoc(); - unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED - : X86::TCONJTFP16; - - MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); - MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); - MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); - - MI.eraseFromParent(); // The pseudo is gone now. 
- return BB; - } case X86::PTCVTROWPS2BF16Hrri: case X86::PTCVTROWPS2BF16Lrri: case X86::PTCVTROWPS2PHHrri: diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 69a5115201ef2..522782abd710f 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -338,188 +338,6 @@ let Predicates = [HasAMXFP8, In64BitMode] in { } } -let Predicates = [HasAMXTILE, In64BitMode], isPseudo = true, SchedRW = [WriteSystem] in { - let mayStore = 1 in - def PTILEPAIRSTORE : PseudoI<(outs), (ins opaquemem:$src1, TILEPair:$src2), []>; - let mayLoad = 1 in - def PTILEPAIRLOAD : PseudoI<(outs TILEPair:$dst), (ins opaquemem:$src), []>; -} - -multiclass T2RPNTLVW_Base op1, bits<8> op2, string rs, string suffix> { - def Z0#rs#suffix : I, PS; - def Z0#rs#T1#suffix : I, PS; - def Z1#rs#suffix : I, PD; - def Z1#rs#T1#suffix : I, PD; -} - -let Predicates = [HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in - defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "">, T8, VEX; - -let Predicates = [HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in - defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "_EVEX">, T8, EVEX, NoCD8; - -let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in - defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "">, T_MAP5, VEX; - -let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in - defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "_EVEX">, T_MAP5, EVEX, NoCD8; - -let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { - let SchedRW = [WriteSystem] in { - def TTRANSPOSED : I<0x5f, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src), - "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8, XS; - let isPseudo = true in { - def PT2RPNTLVWZ0V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ0T1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, 
GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ1T1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - } - - def PTTRANSPOSEDV : PseudoI<(outs TILE:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src), - [(set TILE: $dst, - (int_x86_ttransposed_internal GR16:$src1, GR16:$src2, - TILE:$src))]>; - - let usesCustomInserter = 1 in { - def PT2RPNTLVWZ0 : PseudoI<(outs), (ins u8imm:$dst, - sibmem:$src1), []>; - def PT2RPNTLVWZ0T1 : PseudoI<(outs), (ins u8imm:$dst, - sibmem:$src1), []>; - def PT2RPNTLVWZ1 : PseudoI<(outs), (ins u8imm:$dst, - sibmem:$src1), []>; - def PT2RPNTLVWZ1T1 : PseudoI<(outs), (ins u8imm:$dst, - sibmem:$src1), []>; - def PTTRANSPOSED : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src), - [(int_x86_ttransposed timm:$dst, timm:$src)]>; - } - } -} // HasAMXTILE, HasAMXTRANSPOSE - -let Predicates = [HasAMXBF16, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - let Constraints = "$src1 = $dst" in - def TTDPBF16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>, VEX, VVVV, T8,XS; - let Constraints = "$src4 = $dst" in - def PTTDPBF16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_ttdpbf16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - let usesCustomInserter = 1 in - def PTTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttdpbf16ps timm:$src1, timm:$src2, timm:$src3)]>; -} - -let Predicates = [HasAMXFP16, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - let Constraints = "$src1 = $dst" in - def TTDPFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), 
- "ttdpfp16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>, VEX, VVVV, T8,XD; - let Constraints = "$src4 = $dst" in - def PTTDPFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_ttdpfp16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - let usesCustomInserter = 1 in - def PTTDPFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttdpfp16ps timm:$src1, timm:$src2, timm:$src3)]>; -} - -let Predicates = [HasAMXCOMPLEX, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - let Constraints = "$src1 = $dst" in { - def TTCMMIMFP16PS : I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX, VVVV, T8,XD; - def TTCMMRLFP16PS: I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttcmmrlfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX, VVVV, T8,XS; - def TCONJTCMMIMFP16PS : I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "tconjtcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX, VVVV, WIG, T8,PS; - } - def TCONJTFP16 : I<0x6b, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src), - "tconjtfp16\t{$src, $dst|$dst, $src}", []>, VEX, T8,PD; - - let Constraints = "$src4 = $dst" in { - def PTTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_ttcmmimfp16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - def PTTCMMRLFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_ttcmmrlfp16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - def 
PTCONJTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_tconjtcmmimfp16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - } - def PTCONJTFP16V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3), - [(set TILE: $dst, (int_x86_tconjtfp16_internal GR16:$src1, GR16:$src2, TILE:$src3))]>; - - let usesCustomInserter = 1 in { - def PTTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttcmmimfp16ps timm:$src1, timm:$src2, timm:$src3)]>; - def PTTCMMRLFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttcmmrlfp16ps timm:$src1, timm:$src2, timm:$src3)]>; - def PTCONJTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_tconjtcmmimfp16ps timm:$src1, timm:$src2, timm:$src3)]>; - def PTCONJTFP16 : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src), - [(int_x86_tconjtfp16 timm:$dst, timm:$src)]>; - } -} - -let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - let isPseudo = true in { - def PT2RPNTLVWZ0RSV : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ0RST1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ1RSV : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ1RST1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - } - let usesCustomInserter = 1 in { - def PT2RPNTLVWZ0RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; - def PT2RPNTLVWZ0RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; - def PT2RPNTLVWZ1RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; - def PT2RPNTLVWZ1RST1 : PseudoI<(outs), (ins u8imm:$dst, 
sibmem:$src1), []>; - } -} // HasAMXMOVRS, HasAMXTRANSPOSE - multiclass TILELOADDRS_Base { def suffix : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src1), "tileloaddrs\t{$src1, $dst|$dst, $src1}", []>, T8, XD; @@ -721,29 +539,3 @@ let Predicates = [HasAMXTF32, In64BitMode] in { } } // SchedRW = [WriteSystem] } // HasAMXTF32 - -let Predicates = [HasAMXTF32, HasAMXTRANSPOSE, In64BitMode] in { - let SchedRW = [WriteSystem] in { - let Constraints = "$src1 = $dst" in { - def TTMMULTF32PS: I<0x48, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttmmultf32ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>, VEX, VVVV, T8, PS; - } - let Constraints = "$src4 = $dst" in { - def PTTMMULTF32PSV : PseudoI<(outs TILE:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, - TILE:$src4, TILE:$src5, TILE:$src6), - [(set TILE:$dst, - (int_x86_ttmmultf32ps_internal GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6))]>; - } - let usesCustomInserter = 1 in { - def PTTMMULTF32PS : PseudoI<(outs), - (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttmmultf32ps timm:$src1, timm:$src2, - timm:$src3)]>; - } - } // SchedRW = [WriteSystem] -} // HasAMXTF32, HasAMXTRANSPOSE diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 5c23f917d0530..6b2a7a4ec3583 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4544,11 +4544,6 @@ static unsigned getLoadStoreRegOpcode(Register Reg, return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD) : GET_EGPR_IF_ENABLED(X86::TILESTORED); #undef GET_EGPR_IF_ENABLED - case 2048: - assert(X86::TILEPAIRRegClass.hasSubClassEq(RC) && - "Unknown 2048-byte regclass"); - assert(STI.hasAMXTILE() && "Using 2048-bit register requires AMX-TILE"); - return Load ? 
X86::PTILEPAIRLOAD : X86::PTILEPAIRSTORE; } } @@ -4743,8 +4738,6 @@ static bool isAMXOpcode(unsigned Opc) { case X86::TILESTORED: case X86::TILELOADD_EVEX: case X86::TILESTORED_EVEX: - case X86::PTILEPAIRLOAD: - case X86::PTILEPAIRSTORE: return true; } } @@ -4757,8 +4750,7 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, default: llvm_unreachable("Unexpected special opcode!"); case X86::TILESTORED: - case X86::TILESTORED_EVEX: - case X86::PTILEPAIRSTORE: { + case X86::TILESTORED_EVEX: { // tilestored %tmm, (%sp, %idx) MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); @@ -4772,8 +4764,7 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, break; } case X86::TILELOADD: - case X86::TILELOADD_EVEX: - case X86::PTILEPAIRLOAD: { + case X86::TILELOADD_EVEX: { // tileloadd (%sp, %idx), %tmm MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); diff --git a/llvm/lib/Target/X86/X86InstrOperands.td b/llvm/lib/Target/X86/X86InstrOperands.td index 5207ecad127a2..6ba07f74d74c5 100644 --- a/llvm/lib/Target/X86/X86InstrOperands.td +++ b/llvm/lib/Target/X86/X86InstrOperands.td @@ -536,10 +536,3 @@ def VK8Pair : RegisterOperand { def VK16Pair : RegisterOperand { let ParserMatchClass = VK16PairAsmOperand; } - -let RenderMethod = "addTILEPairOperands" in - def TILEPairAsmOperand : AsmOperandClass { let Name = "TILEPair"; } - -def TILEPair : RegisterOperand { - let ParserMatchClass = TILEPairAsmOperand; -} diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index c20bb05018b4d..98104a6fad1a9 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -183,7 +183,6 @@ def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">; def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">; def HasAMXFP8 : 
Predicate<"Subtarget->hasAMXFP8()">; def HasAMXMOVRS : Predicate<"Subtarget->hasAMXMOVRS()">; -def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">; def HasAMXAVX512 : Predicate<"Subtarget->hasAMXAVX512()">; def HasAMXTF32 : Predicate<"Subtarget->hasAMXTF32()">; def HasUINTR : Predicate<"Subtarget->hasUINTR()">; diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 8ffd454f4f73e..2fc5d38ef5055 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -74,22 +74,6 @@ static bool isAMXCast(Instruction *II) { match(II, m_Intrinsic(m_Value())); } -// Some instructions may return more than one tiles. -// e.g: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal -static unsigned getNumDefTiles(IntrinsicInst *II) { - Type *Ty = II->getType(); - if (Ty->isX86_AMXTy()) - return 1; - - unsigned Num = 0; - for (unsigned i = 0; i < Ty->getNumContainedTypes(); i++) { - Type *STy = Ty->getContainedType(i); - if (STy->isX86_AMXTy()) - Num++; - } - return Num; -} - static bool isAMXIntrinsic(Value *I) { auto *II = dyn_cast(I); if (!II) @@ -98,7 +82,7 @@ static bool isAMXIntrinsic(Value *I) { return false; // Check if return type or parameter is x86_amx. If it is x86_amx // the intrinsic must be x86 amx intrinsics. - if (getNumDefTiles(II) > 0) + if (II->getType()->isX86_AMXTy()) return true; for (Value *V : II->args()) { if (V->getType()->isX86_AMXTy()) @@ -137,27 +121,7 @@ static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) { llvm_unreachable("No terminator in the entry block!"); } -class ShapeCalculator { -private: - const TargetMachine *TM = nullptr; - - // In AMX intrinsics we let Shape = {Row, Col}, but the - // RealCol = Col / ElementSize. We may use the RealCol - // as a new Row for other new created AMX intrinsics. 
- std::map Col2Row, Row2Col; - -public: - ShapeCalculator(const TargetMachine *TargetM) : TM(TargetM) {} - std::pair getShape(IntrinsicInst *II, unsigned OpNo); - std::pair getShape(PHINode *Phi); - Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity); - Value *getColFromRow(Instruction *II, Value *V, unsigned Granularity); -}; - -Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V, - unsigned Granularity) { - if (auto It = Col2Row.find(V); It != Col2Row.end()) - return It->second; +static Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity) { IRBuilder<> Builder(II); Value *RealRow = nullptr; if (isa(V)) @@ -186,47 +150,16 @@ Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V, getFirstNonAllocaInTheEntryBlock(*II->getFunction())); RealRow = NewBuilder.CreateUDiv(V, NewBuilder.getInt16(Granularity)); } - Col2Row[V] = RealRow; return RealRow; } -Value *ShapeCalculator::getColFromRow(Instruction *II, Value *V, - unsigned Granularity) { - if (auto It = Row2Col.find(V); It != Row2Col.end()) - return It->second; - IRBuilder<> Builder(II); - Value *RealCol = nullptr; - if (isa(V)) - RealCol = - Builder.getInt16((cast(V)->getSExtValue()) * Granularity); - else if (isa(V)) { - Builder.SetInsertPoint(cast(V)); - RealCol = Builder.CreateNUWMul(V, Builder.getInt16(Granularity)); - cast(RealCol)->moveAfter(cast(V)); - } else { - // When it is not a const value and it is a function argument, we create - // Row at the entry bb. - IRBuilder<> NewBuilder( - getFirstNonAllocaInTheEntryBlock(*II->getFunction())); - RealCol = NewBuilder.CreateNUWMul(V, NewBuilder.getInt16(Granularity)); - } - Row2Col[V] = RealCol; - return RealCol; -} - // TODO: Refine the row and col-in-bytes of tile to row and col of matrix. 
-std::pair ShapeCalculator::getShape(IntrinsicInst *II, - unsigned OpNo) { - (void)TM; +std::pair getShape(IntrinsicInst *II, unsigned OpNo) { IRBuilder<> Builder(II); Value *Row = nullptr, *Col = nullptr; switch (II->getIntrinsicID()) { default: llvm_unreachable("Expect amx intrinsics"); - case Intrinsic::x86_t2rpntlvwz0_internal: - case Intrinsic::x86_t2rpntlvwz0t1_internal: - case Intrinsic::x86_t2rpntlvwz1_internal: - case Intrinsic::x86_t2rpntlvwz1t1_internal: case Intrinsic::x86_tileloadd64_internal: case Intrinsic::x86_tileloaddt164_internal: case Intrinsic::x86_tilestored64_internal: @@ -271,13 +204,6 @@ std::pair ShapeCalculator::getShape(IntrinsicInst *II, } break; } - case Intrinsic::x86_ttransposed_internal: - case Intrinsic::x86_tconjtfp16_internal: { - assert((OpNo == 2) && "Illegal Operand Number."); - Row = getRowFromCol(II, II->getArgOperand(1), 4); - Col = getColFromRow(II, II->getArgOperand(0), 4); - break; - } case Intrinsic::x86_tcvtrowd2ps_internal: case Intrinsic::x86_tcvtrowps2bf16h_internal: case Intrinsic::x86_tcvtrowps2bf16l_internal: @@ -289,34 +215,12 @@ std::pair ShapeCalculator::getShape(IntrinsicInst *II, Col = II->getArgOperand(1); break; } - case Intrinsic::x86_ttdpbf16ps_internal: - case Intrinsic::x86_ttdpfp16ps_internal: - case Intrinsic::x86_ttcmmimfp16ps_internal: - case Intrinsic::x86_ttcmmrlfp16ps_internal: - case Intrinsic::x86_tconjtcmmimfp16ps_internal: - case Intrinsic::x86_ttmmultf32ps_internal: { - switch (OpNo) { - case 3: - Row = II->getArgOperand(0); - Col = II->getArgOperand(1); - break; - case 4: - Row = getRowFromCol(II, II->getArgOperand(2), 4); - Col = getColFromRow(II, II->getArgOperand(0), 4); - break; - case 5: - Row = getRowFromCol(II, II->getArgOperand(2), 4); - Col = II->getArgOperand(1); - break; - } - break; - } } return std::make_pair(Row, Col); } -std::pair ShapeCalculator::getShape(PHINode *Phi) { +static std::pair getShape(PHINode *Phi) { Use &U = *(Phi->use_begin()); unsigned OpNo = 
U.getOperandNo(); User *V = U.getUser(); @@ -349,15 +253,14 @@ std::pair ShapeCalculator::getShape(PHINode *Phi) { namespace { class X86LowerAMXType { Function &Func; - ShapeCalculator *SC; // In AMX intrinsics we let Shape = {Row, Col}, but the // RealCol = Col / ElementSize. We may use the RealCol // as a new Row for other new created AMX intrinsics. - std::map Col2Row, Row2Col; + std::map Col2Row; public: - X86LowerAMXType(Function &F, ShapeCalculator *ShapeC) : Func(F), SC(ShapeC) {} + X86LowerAMXType(Function &F) : Func(F) {} bool visit(); void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast); void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST); @@ -374,7 +277,7 @@ void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) { Use &U = *(Bitcast->use_begin()); unsigned OpNo = U.getOperandNo(); auto *II = cast(U.getUser()); - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); IRBuilder<> Builder(Bitcast); // Use the maximun column as stride. Value *Stride = Builder.getInt64(64); @@ -454,7 +357,7 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) { Builder.CreateStore(Src, AllocaAddr); // TODO we can pick an constant operand for the shape. Value *Row = nullptr, *Col = nullptr; - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); std::array Args = {Row, Col, I8Ptr, Stride}; Value *NewInst = Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, Args); @@ -594,18 +497,11 @@ static Value *getAllocaPos(BasicBlock *BB) { static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) { assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!"); - auto *II = dyn_cast(TileDef); - unsigned Idx = 0; - // Extract tile from multiple tiles' def. 
- if (auto *Extr = dyn_cast(TileDef)) { - assert(Extr->hasIndices() && "Tile extract miss index!"); - Idx = Extr->getIndices()[0]; - II = cast(Extr->getOperand(0)); - } + auto *II = cast(TileDef); assert(II && "Not tile intrinsic!"); - Value *Row = II->getOperand(Idx); - Value *Col = II->getOperand(Idx + 1); + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); BasicBlock *BB = TileDef->getParent(); BasicBlock::iterator Iter = TileDef->getIterator(); @@ -624,20 +520,14 @@ static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) { // Get tile shape. IntrinsicInst *II = nullptr; - unsigned Idx = 0; if (IsPHI) { Value *PhiOp = cast(V)->getIncomingValue(0); II = cast(PhiOp); - } else if (auto *Extr = dyn_cast(V)) { - // Extract tile from multiple tiles' def. - assert(Extr->hasIndices() && "Tile extract miss index!"); - Idx = Extr->getIndices()[0]; - II = cast(Extr->getOperand(0)); } else { II = cast(V); } - Value *Row = II->getOperand(Idx); - Value *Col = II->getOperand(Idx + 1); + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); Instruction *UserI = cast(U.getUser()); IRBuilder<> Builder(UserI); @@ -848,12 +738,10 @@ namespace { class X86LowerAMXCast { Function &Func; - ShapeCalculator *SC; std::unique_ptr DT; public: - X86LowerAMXCast(Function &F, ShapeCalculator *ShapeC) - : Func(F), SC(ShapeC), DT(nullptr) {} + X86LowerAMXCast(Function &F) : Func(F), DT(nullptr) {} bool combineCastStore(IntrinsicInst *Cast, StoreInst *ST); bool combineLoadCast(IntrinsicInst *Cast, LoadInst *LD); bool combineTilezero(IntrinsicInst *Cast); @@ -932,7 +820,7 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi( if (!isa(IncValue) && !IncConst->isZeroValue()) return false; Value *Row = nullptr, *Col = nullptr; - std::tie(Row, Col) = SC->getShape(OldPN); + std::tie(Row, Col) = getShape(OldPN); // TODO: If it is not constant the Row and Col must domoniate tilezero // that we are going to create. 
if (!Row || !Col || !isa(Row) || !isa(Col)) @@ -1063,19 +951,6 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi( return true; } -static Value *getShapeFromAMXIntrinsic(Value *Inst, unsigned ShapeIdx, - bool IsRow) { - if (!isAMXIntrinsic(Inst)) - return nullptr; - - auto *II = cast(Inst); - if (IsRow) - return II->getOperand(0); - - assert(ShapeIdx < 2 && "Currently 2 shapes in 1 instruction at most!"); - return II->getOperand(ShapeIdx + 1); -} - // %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %42) // store <256 x i32> %43, <256 x i32>* %p, align 64 // --> @@ -1090,38 +965,13 @@ bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) { if (!Tile->hasOneUse()) return false; - // We don't fetch shape from tilestore, we only get shape from tiledef, - // so we can set the max tile shape to tilestore for special cases. + auto *II = cast(Tile); + // Tile is output from AMX intrinsic. The first operand of the + // intrinsic is row, the second operand of the intrinsic is column. + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); + IRBuilder<> Builder(ST); - Value *Row = nullptr; - Value *Col = nullptr; - - if (isAMXIntrinsic(Tile)) { - auto *II = cast(Tile); - // Tile is output from AMX intrinsic. The first operand of the - // intrinsic is row, the second operand of the intrinsic is column. - Row = II->getOperand(0); - Col = II->getOperand(1); - } else { - // Now we supported multi-tiles value in structure, so we may get tile - // from extracting multi-tiles structure. - // For example: - // %6 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %1, - // i16 %2, i16 %3, i8* %4, i64 %5) - // %7 = extractvalue { x86_amx, x86_amx } %6, 0 - // %8 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %7) - // store <256 x i32> %8, <256 x i32>* %0, align 1024 - // - // TODO: Currently we only handle extractvalue case, enhance me for other - // cases if possible. 
- auto *II = cast(Tile); - assert(II && "We meet unhandle source in fetching tile value!"); - unsigned ShapeIdx = II->getIndices()[0]; - Value *Tiles = II->getOperand(0); - Row = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, true); - Col = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, false); - } - assert(Row && Col && "Shape got failed!"); // Stride should be equal to col(measured by bytes) Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty()); @@ -1146,7 +996,7 @@ bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) { // shape information through def-use chain. if (!isAMXIntrinsic(II)) return false; - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); IRBuilder<> Builder(LD); // Stride should be equal to col(measured by bytes) Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty()); @@ -1189,7 +1039,7 @@ bool X86LowerAMXCast::combineTilezero(IntrinsicInst *Cast) { if (!isAMXIntrinsic(II)) return false; - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); IRBuilder<> Builder(Cast); Value *NewInst = @@ -1384,7 +1234,7 @@ bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) { Builder.CreateStore(Src, AllocaAddr); // TODO we can pick an constant operand for the shape. Value *Row = nullptr, *Col = nullptr; - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); std::array Args = { Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty())}; Value *NewInst = @@ -1445,14 +1295,13 @@ bool lowerAmxType(Function &F, const TargetMachine *TM, return false; bool C = false; - ShapeCalculator SC(TM); - X86LowerAMXCast LAC(F, &SC); + X86LowerAMXCast LAC(F); C |= LAC.combineAMXcast(TLI); // There might be remaining AMXcast after combineAMXcast and they should be // handled elegantly. C |= LAC.transformAllAMXCast(); - X86LowerAMXType LAT(F, &SC); + X86LowerAMXType LAT(F); C |= LAT.visit(); // Prepare for fast register allocation at O0. 
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp index 2a1c49957bf7a..8a1d00d2f6427 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -141,15 +141,10 @@ class X86PreTileConfig : public MachineFunctionPass { if (!MO.isReg() || !MO.getReg().isVirtual()) return false; - unsigned Shapes = 0; - if (MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID) - Shapes = 1; - if (MRI->getRegClass(MO.getReg())->getID() == X86::TILEPAIRRegClassID) - Shapes = 2; - if (!Shapes) + if (MRI->getRegClass(MO.getReg())->getID() != X86::TILERegClassID) return false; - collectShapeInfo(MI, Shapes); + collectShapeInfo(MI); return true; } @@ -165,7 +160,7 @@ class X86PreTileConfig : public MachineFunctionPass { } /// Collect the shape def information for later use. - void collectShapeInfo(MachineInstr &MI, unsigned Shapes); + void collectShapeInfo(MachineInstr &MI); /// Try to hoist shapes definded below AMX instructions. bool hoistShapesInBB(MachineBasicBlock *MBB, SmallVectorImpl &Shapes) { @@ -231,7 +226,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig", "Tile Register Pre-configure", false, false) -void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) { +void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) { auto RecordShape = [&](MachineInstr *MI, MachineBasicBlock *MBB) { MIRef MIR(MI, MBB); auto &Refs = ShapeBBs[MBB]; @@ -240,10 +235,8 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) { Refs.insert(I, MIR); }; - // All shapes have same row in multi-tile operand. 
- SmallVector WorkList; - for (unsigned I = 1; I < Shapes + 2; ++I) - WorkList.push_back(MI.getOperand(I).getReg()); + SmallVector WorkList( + {MI.getOperand(1).getReg(), MI.getOperand(2).getReg()}); while (!WorkList.empty()) { Register R = WorkList.pop_back_val(); MachineInstr *DefMI = MRI->getVRegDef(R); @@ -252,13 +245,6 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) { if (DefMI->isMoveImmediate() || !DefVisited.insert(DefMI).second) continue; - // This happens when column = 0 in multi-tile operand. - if (DefMI->getOpcode() == X86::COPY) { - MachineInstr *MI = MRI->getVRegDef(DefMI->getOperand(1).getReg()); - if (MI && MI->isMoveImmediate()) - continue; - } - if (DefMI->isPHI()) { for (unsigned I = 1; I < DefMI->getNumOperands(); I += 2) if (isLoopBackEdge(DefMBB, DefMI->getOperand(I + 1).getMBB())) diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 76979e37c4618..72f38133e21ff 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -597,10 +597,6 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(*AI); } - // Reserve low half pair registers in case they are used by RA aggressively. - Reserved.set(X86::TMM0_TMM1); - Reserved.set(X86::TMM2_TMM3); - assert(checkAllSuperRegsMarked(Reserved, {X86::SIL, X86::DIL, X86::BPL, X86::SPL, X86::SIH, X86::DIH, X86::BPH, X86::SPH})); @@ -621,7 +617,7 @@ unsigned X86RegisterInfo::getNumSupportedRegs(const MachineFunction &MF) const { // and try to return the minimum number of registers supported by the target. 
static_assert((X86::R15WH + 1 == X86::YMM0) && (X86::YMM15 + 1 == X86::K0) && (X86::K6_K7 + 1 == X86::TMMCFG) && - (X86::TMM6_TMM7 + 1 == X86::R16) && + (X86::TMM7 + 1 == X86::R16) && (X86::R31WH + 1 == X86::NUM_TARGET_REGS), "Register number may be incorrect"); @@ -694,8 +690,7 @@ bool X86RegisterInfo::isFixedRegister(const MachineFunction &MF, } bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const { - return RC->getID() == X86::TILERegClassID || - RC->getID() == X86::TILEPAIRRegClassID; + return RC->getID() == X86::TILERegClassID; } void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { @@ -1062,17 +1057,9 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, case X86::PTDPFP16PSV: case X86::PTCMMIMFP16PSV: case X86::PTCMMRLFP16PSV: - case X86::PTTRANSPOSEDV: - case X86::PTTDPBF16PSV: - case X86::PTTDPFP16PSV: - case X86::PTTCMMIMFP16PSV: - case X86::PTTCMMRLFP16PSV: - case X86::PTCONJTCMMIMFP16PSV: - case X86::PTCONJTFP16V: case X86::PTILELOADDRSV: case X86::PTILELOADDRST1V: case X86::PTMMULTF32PSV: - case X86::PTTMMULTF32PSV: case X86::PTDPBF8PSV: case X86::PTDPBHF8PSV: case X86::PTDPHBF8PSV: @@ -1083,56 +1070,7 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, VRM->assignVirt2Shape(VirtReg, Shape); return Shape; } - case X86::PT2RPNTLVWZ0V: - case X86::PT2RPNTLVWZ0T1V: - case X86::PT2RPNTLVWZ1V: - case X86::PT2RPNTLVWZ1T1V: - case X86::PT2RPNTLVWZ0RSV: - case X86::PT2RPNTLVWZ0RST1V: - case X86::PT2RPNTLVWZ1RSV: - case X86::PT2RPNTLVWZ1RST1V: { - MachineOperand &MO1 = MI->getOperand(1); - MachineOperand &MO2 = MI->getOperand(2); - MachineOperand &MO3 = MI->getOperand(3); - ShapeT Shape({&MO1, &MO2, &MO1, &MO3}, MRI); - VRM->assignVirt2Shape(VirtReg, Shape); - return Shape; - } - } -} - -static bool canHintShape(ShapeT &PhysShape, ShapeT &VirtShape) { - unsigned PhysShapeNum = PhysShape.getShapeNum(); - unsigned VirtShapeNum = VirtShape.getShapeNum(); - - if (PhysShapeNum < VirtShapeNum) - 
return false; - - if (PhysShapeNum == VirtShapeNum) { - if (PhysShapeNum == 1) - return PhysShape == VirtShape; - - for (unsigned I = 0; I < PhysShapeNum; I++) { - ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I)); - ShapeT VShape(VirtShape.getRow(I), VirtShape.getCol(I)); - if (VShape != PShape) - return false; - } - return true; - } - - // Hint subreg of mult-tile reg to single tile reg. - if (VirtShapeNum == 1) { - for (unsigned I = 0; I < PhysShapeNum; I++) { - ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I)); - if (VirtShape == PShape) - return true; - } } - - // Note: Currently we have no requirement for case of - // (VirtShapeNum > 1 and PhysShapeNum > VirtShapeNum) - return false; } bool X86RegisterInfo::getRegAllocationHints(Register VirtReg, @@ -1153,7 +1091,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg, if (!VRM) return BaseImplRetVal; - if (ID != X86::TILERegClassID && ID != X86::TILEPAIRRegClassID) { + if (ID != X86::TILERegClassID) { if (DisableRegAllocNDDHints || !ST.hasNDD() || !TRI.isGeneralPurposeRegisterClass(&RC)) return BaseImplRetVal; @@ -1204,7 +1142,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg, return; } ShapeT PhysShape = getTileShape(VReg, const_cast(VRM), MRI); - if (canHintShape(PhysShape, VirtShape)) + if (PhysShape == VirtShape) Hints.push_back(PhysReg); }; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index 99b7910131dc5..692e42ae5e752 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -30,8 +30,6 @@ let Namespace = "X86" in { def sub_ymm : SubRegIndex<256>; def sub_mask_0 : SubRegIndex<-1>; def sub_mask_1 : SubRegIndex<-1, -1>; - def sub_t0 : SubRegIndex<8192>; - def sub_t1 : SubRegIndex<8192, 8192>; } //===----------------------------------------------------------------------===// @@ -432,10 +430,6 @@ def TMM4: X86Reg<"tmm4", 4>; def TMM5: X86Reg<"tmm5", 5>; def TMM6: X86Reg<"tmm6", 
6>; def TMM7: X86Reg<"tmm7", 7>; -// TMM register pairs -def TPAIRS : RegisterTuples<[sub_t0, sub_t1], - [(add TMM0, TMM2, TMM4, TMM6), - (add TMM1, TMM3, TMM5, TMM7)]>; } // Floating point stack registers. These don't map one-to-one to the FP @@ -862,9 +856,6 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} let CopyCost = -1 in // Don't allow copying of tile registers def TILE : RegisterClass<"X86", [x86amx], 8192, (sequence "TMM%u", 0, 7)> {let Size = 8192;} -// Need check alignment 3rd operand size=1024*2*8 -let isAllocatable = 1 in -def TILEPAIR : RegisterClass<"X86", [untyped], 512, (add TPAIRS)> {let Size = 16384;} //===----------------------------------------------------------------------===// // Register categories. diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp index 17a44dde6480f..09ef8fbc12de9 100644 --- a/llvm/lib/Target/X86/X86TileConfig.cpp +++ b/llvm/lib/Target/X86/X86TileConfig.cpp @@ -74,63 +74,6 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy) INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false, false) -unsigned getAMXRegNum(MachineRegisterInfo *MRI, Register Reg) { - if (Reg.isVirtual()) { - unsigned RegClassID = MRI->getRegClass(Reg)->getID(); - if (RegClassID == X86::TILERegClassID) - return 1; - if (RegClassID == X86::TILEPAIRRegClassID) - return 2; - } else { - if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return 1; - if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) - return 2; - } - return 0; -} - -static void collectVirtRegShapes(MachineRegisterInfo *MRI, VirtRegMap &VRM, - Register VirtReg, - SmallVector &Phys2Shapes) { - unsigned Num = getAMXRegNum(MRI, VirtReg); - MCRegister PhysReg = VRM.getPhys(VirtReg); - if (!PhysReg) - return; - - if (Num == 1) { - unsigned Index = PhysReg - X86::TMM0; - if (!Phys2Shapes[Index].isValid()) { - ShapeT Shape = VRM.getShape(VirtReg); - Phys2Shapes[Index] = std::move(Shape); - return; - } - } - 
// Split tile pair shape info to 2 single tile shape info. e.g: - // Put TMM0_TMM1's Shape to TMM0's shape + TMM1's Shape in Phys2Shapes. - if (Num == 2) { - unsigned Index0 = (PhysReg - X86::TMM0_TMM1) * 2; - unsigned Index1 = (PhysReg - X86::TMM0_TMM1) * 2 + 1; - - ShapeT Shape = VRM.getShape(VirtReg); - assert(Shape.getShapeNum() == 2 && "Unexpected shape number!"); - - if (!Phys2Shapes[Index0].isValid()) { - ShapeT Shape0(Shape.getRow(0), Shape.getCol(0), MRI); - Phys2Shapes[Index0] = std::move(Shape0); - } - - if (!Phys2Shapes[Index1].isValid()) { - ShapeT Shape1(Shape.getRow(1), Shape.getCol(1), MRI); - Phys2Shapes[Index1] = std::move(Shape1); - } - } -} - -static bool isAMXRegClass(MachineRegisterInfo *MRI, Register Reg) { - return getAMXRegNum(MRI, Reg) > 0; -} - bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { X86MachineFunctionInfo *X86FI = MF.getInfo(); // Early exit in the common case of non-AMX code. @@ -138,7 +81,7 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { return false; const X86Subtarget &ST = MF.getSubtarget(); - const TargetRegisterInfo *TRI = ST.getRegisterInfo(); + const X86RegisterInfo *TRI = ST.getRegisterInfo(); const TargetInstrInfo *TII = ST.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); LiveIntervals &LIS = getAnalysis().getLIS(); @@ -176,24 +119,29 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { assert(ConstMI && "Cannot find an insertion point"); unsigned AMXRegNum = TRI->getRegClass(X86::TILERegClassID)->getNumRegs(); - SmallVector Phys2Shapes(AMXRegNum, ShapeT()); + SmallVector Phys2Virt(AMXRegNum, 0); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { Register VirtReg = Register::index2VirtReg(I); if (MRI.reg_nodbg_empty(VirtReg)) continue; - if (!isAMXRegClass(&MRI, VirtReg)) + if (!TRI->isTileRegisterClass(MRI.getRegClass(VirtReg))) + continue; + MCRegister PhysReg = VRM.getPhys(VirtReg); + if (!PhysReg) continue; - collectVirtRegShapes(&MRI, VRM, 
VirtReg, Phys2Shapes); + unsigned Index = PhysReg - X86::TMM0; + if (!Phys2Virt[Index]) + Phys2Virt[Index] = VirtReg; } // Fill in the shape of each tile physical register. for (unsigned I = 0; I < AMXRegNum; ++I) { - ShapeT Shape = Phys2Shapes[I]; - if (!Shape.isValid()) + if (!Phys2Virt[I]) continue; DebugLoc DL; bool IsRow = true; MachineInstr *NewMI = nullptr; + ShapeT Shape = VRM.getShape(Phys2Virt[I]); for (auto &R : {Shape.getRow()->getReg(), Shape.getCol()->getReg()}) { // Here is the data format for the tile config. // 0 palette @@ -222,14 +170,7 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { "Cannot initialize with different shapes"); continue; } - if (DefMI.getOperand(1).isImm()) { - Imm = DefMI.getOperand(1).getImm(); - } else { - assert(DefMI.getOpcode() == X86::MOV32r0 && - "The opcode is assumed to be MOV32r0 if the operand is not " - "immediate."); - Imm = 0; - } + Imm = DefMI.getOperand(1).getImm(); NewMI = addFrameReference( BuildMI(MF.front(), ++ConstMI->getIterator(), DL, diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 0849fc7d55a32..c164762de2966 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -2192,7 +2192,6 @@ StringMap sys::getHostCPUFeatures() { bool HasLeaf1E = MaxLevel >= 0x1e && !getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX); Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave; - Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave; Features["amx-tf32"] = HasLeaf1E && ((EAX >> 6) & 1) && HasAMXSave; Features["amx-avx512"] = HasLeaf1E && ((EAX >> 7) & 1) && HasAMXSave; Features["amx-movrs"] = HasLeaf1E && ((EAX >> 8) & 1) && HasAMXSave; diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index b13c795c1649c..37e8ad986aa55 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -143,7 +143,7 @@ constexpr 
FeatureBitset FeaturesDiamondRapids = FeatureAVXVNNIINT8 | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 | FeatureSM4 | FeatureEGPR | FeatureZU | FeatureCCMP | FeaturePush2Pop2 | FeaturePPX | FeatureNDD | FeatureNF | FeatureMOVRS | FeatureAMX_MOVRS | - FeatureAMX_AVX512 | FeatureAMX_FP8 | FeatureAMX_TF32 | FeatureAMX_TRANSPOSE; + FeatureAMX_AVX512 | FeatureAMX_FP8 | FeatureAMX_TF32; // Intel Atom processors. // Bonnell has feature parity with Core2 and adds MOVBE. @@ -615,7 +615,6 @@ constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE; -constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_MOVRS = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_AVX512 = FeatureAMX_TILE | FeatureAVX10_2; diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt index d3c0da9862245..000c67efb1de7 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt @@ -1439,11 +1439,8 @@ Key: PSUBWrm: [ 0.00 0.00 ] Key: PSUBWrr: [ 0.00 0.00 ] Key: PSWAPDrm: [ 0.00 0.00 ] Key: PSWAPDrr: [ 0.00 0.00 ] -Key: PT: [ 0.00 0.00 ] Key: PTCMMIMFP: [ 0.00 0.00 ] Key: PTCMMRLFP: [ 0.00 0.00 ] -Key: PTCONJTCMMIMFP: [ 0.00 0.00 ] -Key: PTCONJTFP: [ 0.00 0.00 ] Key: PTCVTROWD: [ 0.00 0.00 ] Key: PTCVTROWPS: [ 0.00 0.00 ] Key: PTDPBF: [ 0.00 0.00 ] @@ -1471,20 +1468,11 @@ Key: PTILEMOVROWrre: [ 0.00 0.00 ] Key: PTILEMOVROWrreV: [ 0.00 0.00 ] Key: PTILEMOVROWrri: [ 0.00 0.00 ] Key: PTILEMOVROWrriV: [ 0.00 0.00 ] -Key: PTILEPAIRLOAD: [ 0.00 0.00 ] -Key: PTILEPAIRSTORE: [ 0.00 0.00 ] Key: PTILESTORED: [ 0.00 0.00 ] Key: PTILESTOREDV: [ 0.00 0.00 ] Key: 
PTILEZERO: [ 0.00 0.00 ] Key: PTILEZEROV: [ 0.00 0.00 ] Key: PTMMULTF: [ 0.00 0.00 ] -Key: PTTCMMIMFP: [ 0.00 0.00 ] -Key: PTTCMMRLFP: [ 0.00 0.00 ] -Key: PTTDPBF: [ 0.00 0.00 ] -Key: PTTDPFP: [ 0.00 0.00 ] -Key: PTTMMULTF: [ 0.00 0.00 ] -Key: PTTRANSPOSED: [ 0.00 0.00 ] -Key: PTTRANSPOSEDV: [ 0.00 0.00 ] Key: PTWRITE: [ 0.00 0.00 ] Key: PTWRITEm: [ 0.00 0.00 ] Key: PTWRITEr: [ 0.00 0.00 ] @@ -1717,8 +1705,6 @@ Key: TAILJMPm: [ 0.00 0.00 ] Key: TAILJMPr: [ 0.00 0.00 ] Key: TCMMIMFP: [ 0.00 0.00 ] Key: TCMMRLFP: [ 0.00 0.00 ] -Key: TCONJTCMMIMFP: [ 0.00 0.00 ] -Key: TCONJTFP: [ 0.00 0.00 ] Key: TCRETURN_HIPE: [ 0.00 0.00 ] Key: TCRETURN_WIN: [ 0.00 0.00 ] Key: TCRETURN_WINmi: [ 0.00 0.00 ] @@ -1764,12 +1750,6 @@ Key: TPAUSE: [ 0.00 0.00 ] Key: TRAP: [ 0.00 0.00 ] Key: TST_F: [ 0.00 0.00 ] Key: TST_Fp: [ 0.00 0.00 ] -Key: TTCMMIMFP: [ 0.00 0.00 ] -Key: TTCMMRLFP: [ 0.00 0.00 ] -Key: TTDPBF: [ 0.00 0.00 ] -Key: TTDPFP: [ 0.00 0.00 ] -Key: TTMMULTF: [ 0.00 0.00 ] -Key: TTRANSPOSED: [ 0.00 0.00 ] Key: TZCNT: [ 0.00 0.00 ] Key: TZMSK: [ 0.00 0.00 ] Key: UBSAN_UD: [ 0.00 0.00 ] @@ -7034,7 +7014,6 @@ Key: PhyReg_VR256: [ 0.00 0.00 ] Key: PhyReg_VR512: [ 0.00 0.00 ] Key: PhyReg_VR512_0_15: [ 0.00 0.00 ] Key: PhyReg_TILE: [ 0.00 0.00 ] -Key: PhyReg_TILEPAIR: [ 0.00 0.00 ] Key: VirtReg_GR8: [ 0.00 0.00 ] Key: VirtReg_GRH8: [ 0.00 0.00 ] Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ] @@ -7170,4 +7149,3 @@ Key: VirtReg_VR256: [ 0.00 0.00 ] Key: VirtReg_VR512: [ 0.00 0.00 ] Key: VirtReg_VR512_0_15: [ 0.00 0.00 ] Key: VirtReg_TILE: [ 0.00 0.00 ] -Key: VirtReg_TILEPAIR: [ 0.00 0.00 ] diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt index c6e5508248b9b..bb72886f73bfd 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt @@ -1439,11 +1439,8 @@ Key: PSUBWrm: [ 0.00 0.00 ] Key: 
PSUBWrr: [ 0.00 0.00 ] Key: PSWAPDrm: [ 0.00 0.00 ] Key: PSWAPDrr: [ 0.00 0.00 ] -Key: PT: [ 0.00 0.00 ] Key: PTCMMIMFP: [ 0.00 0.00 ] Key: PTCMMRLFP: [ 0.00 0.00 ] -Key: PTCONJTCMMIMFP: [ 0.00 0.00 ] -Key: PTCONJTFP: [ 0.00 0.00 ] Key: PTCVTROWD: [ 0.00 0.00 ] Key: PTCVTROWPS: [ 0.00 0.00 ] Key: PTDPBF: [ 0.00 0.00 ] @@ -1471,20 +1468,11 @@ Key: PTILEMOVROWrre: [ 0.00 0.00 ] Key: PTILEMOVROWrreV: [ 0.00 0.00 ] Key: PTILEMOVROWrri: [ 0.00 0.00 ] Key: PTILEMOVROWrriV: [ 0.00 0.00 ] -Key: PTILEPAIRLOAD: [ 0.00 0.00 ] -Key: PTILEPAIRSTORE: [ 0.00 0.00 ] Key: PTILESTORED: [ 0.00 0.00 ] Key: PTILESTOREDV: [ 0.00 0.00 ] Key: PTILEZERO: [ 0.00 0.00 ] Key: PTILEZEROV: [ 0.00 0.00 ] Key: PTMMULTF: [ 0.00 0.00 ] -Key: PTTCMMIMFP: [ 0.00 0.00 ] -Key: PTTCMMRLFP: [ 0.00 0.00 ] -Key: PTTDPBF: [ 0.00 0.00 ] -Key: PTTDPFP: [ 0.00 0.00 ] -Key: PTTMMULTF: [ 0.00 0.00 ] -Key: PTTRANSPOSED: [ 0.00 0.00 ] -Key: PTTRANSPOSEDV: [ 0.00 0.00 ] Key: PTWRITE: [ 0.00 0.00 ] Key: PTWRITEm: [ 0.00 0.00 ] Key: PTWRITEr: [ 0.00 0.00 ] @@ -1717,8 +1705,6 @@ Key: TAILJMPm: [ 0.00 0.00 ] Key: TAILJMPr: [ 0.00 0.00 ] Key: TCMMIMFP: [ 0.00 0.00 ] Key: TCMMRLFP: [ 0.00 0.00 ] -Key: TCONJTCMMIMFP: [ 0.00 0.00 ] -Key: TCONJTFP: [ 0.00 0.00 ] Key: TCRETURN_HIPE: [ 0.00 0.00 ] Key: TCRETURN_WIN: [ 0.00 0.00 ] Key: TCRETURN_WINmi: [ 0.00 0.00 ] @@ -1764,12 +1750,6 @@ Key: TPAUSE: [ 0.00 0.00 ] Key: TRAP: [ 0.00 0.00 ] Key: TST_F: [ 0.00 0.00 ] Key: TST_Fp: [ 0.00 0.00 ] -Key: TTCMMIMFP: [ 0.00 0.00 ] -Key: TTCMMRLFP: [ 0.00 0.00 ] -Key: TTDPBF: [ 0.00 0.00 ] -Key: TTDPFP: [ 0.00 0.00 ] -Key: TTMMULTF: [ 0.00 0.00 ] -Key: TTRANSPOSED: [ 0.00 0.00 ] Key: TZCNT: [ 0.00 0.00 ] Key: TZMSK: [ 0.00 0.00 ] Key: UBSAN_UD: [ 0.00 0.00 ] @@ -7034,7 +7014,6 @@ Key: PhyReg_VR256: [ 0.00 0.00 ] Key: PhyReg_VR512: [ 0.00 0.00 ] Key: PhyReg_VR512_0_15: [ 0.00 0.00 ] Key: PhyReg_TILE: [ 0.00 0.00 ] -Key: PhyReg_TILEPAIR: [ 0.00 0.00 ] Key: VirtReg_GR8: [ 0.00 0.00 ] Key: VirtReg_GRH8: [ 0.00 0.00 ] Key: VirtReg_GR8_NOREX2: 
[ 0.00 0.00 ] @@ -7170,4 +7149,3 @@ Key: VirtReg_VR256: [ 0.00 0.00 ] Key: VirtReg_VR512: [ 0.00 0.00 ] Key: VirtReg_VR512_0_15: [ 0.00 0.00 ] Key: VirtReg_TILE: [ 0.00 0.00 ] -Key: VirtReg_TILEPAIR: [ 0.00 0.00 ] diff --git a/llvm/test/CodeGen/X86/amx-tf32-internal.ll b/llvm/test/CodeGen/X86/amx-tf32-internal.ll index 6d0f3c57c08d8..caf7a1cb7bd2d 100644 --- a/llvm/test/CodeGen/X86/amx-tf32-internal.ll +++ b/llvm/test/CodeGen/X86/amx-tf32-internal.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+avx512f, \ -; RUN: -mattr=+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s +; RUN: -mattr=+amx-tf32 -verify-machineinstrs | FileCheck %s define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-LABEL: test_amx: @@ -20,7 +20,6 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-NEXT: tilezero %tmm1 ; CHECK-NEXT: tilezero %tmm2 ; CHECK-NEXT: tmmultf32ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttmmultf32ps %tmm1, %tmm0, %tmm2 ; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx) ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper @@ -31,9 +30,8 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) %c1 = call x86_amx @llvm.x86.tmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) - %c2 = call x86_amx @llvm.x86.ttmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c2) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c1) ret void } @@ -43,4 +41,3 @@ declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) declare x86_amx @llvm.x86.tmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttmmultf32ps.internal(i16, i16, i16, x86_amx, 
x86_amx, x86_amx) diff --git a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll index af1a7ae102975..642c1b7317f81 100644 --- a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32 -verify-machineinstrs | FileCheck %s define void @test_tmmultf32ps() { ; CHECK-LABEL: test_tmmultf32ps: @@ -11,13 +11,3 @@ define void @test_tmmultf32ps() { } declare void @llvm.x86.tmmultf32ps(i8 %A, i8 %B, i8 %C) -define void @test_ttmmultf32ps() { -; CHECK-LABEL: test_ttmmultf32ps: -; CHECK: # %bb.0: -; CHECK-NEXT: ttmmultf32ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: retq - call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3) - ret void -} -declare void @llvm.x86.ttmmultf32ps(i8 %A, i8 %B, i8 %C) - diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll deleted file mode 100755 index 1f5758c804b2b..0000000000000 --- a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll +++ /dev/null @@ -1,122 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0 -; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2 -; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR - -define void @test_amx(i64 %stride, i8* %addr1) #0 { -; CHECK-LABEL: test_amx: -; CHECK: # %bb.0: -; CHECK-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 -; CHECK-NEXT: 
t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 -; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 -; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx: -; EGPR: # %bb.0: -; EGPR-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x04,0x3e] -; EGPR-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x3e] -; EGPR-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x04,0x3e] -; EGPR-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x3e] -; EGPR-NEXT: retq # encoding: [0xc3] - call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1rst1(i8 2, i8* %addr1, i64 %stride) - ret void -} -declare void @llvm.x86.t2rpntlvwz0rs(i8 , i8* , i64 ) -declare void @llvm.x86.t2rpntlvwz0rst1(i8 , i8* , i64 ) -declare void @llvm.x86.t2rpntlvwz1rs(i8 , i8* , i64 ) -declare void @llvm.x86.t2rpntlvwz1rst1(i8 , i8* , i64 ) - -define void @test_amx2(i8* %base, i64 %stride) #0 { -; O0-LABEL: test_amx2: -; O0: # %bb.0: -; O0-NEXT: xorps %xmm0, %xmm0 -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw $8, %ax -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O0-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 -; O0-NEXT: movw $8, %ax -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: 
movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O0-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 -; O0-NEXT: movw $8, %ax -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O0-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 -; O0-NEXT: tilerelease -; O0-NEXT: retq -; -; O2-LABEL: test_amx2: -; O2: # %bb.0: -; O2-NEXT: xorps %xmm0, %xmm0 -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O2-NEXT: movw $8, %ax -; O2-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 -; O2-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 -; O2-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 -; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 -; O2-NEXT: tilerelease -; O2-NEXT: retq -; -; EGPR-LABEL: test_amx2: -; EGPR: # %bb.0: -; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xc0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xd0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xe0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xf0] -; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: 
[0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x08,0x00] -; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] -; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] -; EGPR-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x24,0x37] -; EGPR-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x24,0x37] -; EGPR-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x24,0x37] -; EGPR-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x24,0x37] -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: retq # encoding: [0xc3] - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - ret void -} -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16, i16, i16, i8*, i64) diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll deleted file mode 100644 index 4f41410010302..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll +++ /dev/null @@ -1,136 
+0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -; RUN: -mattr=+amx-transpose -verify-machineinstrs | FileCheck %s - -@buf = dso_local global [2048 x i8] zeroinitializer, align 16 -@buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 - -define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { -; CHECK-LABEL: test_tile_2rpntlvwz0: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; CHECK-NEXT: subq $8192, %rsp # imm = 0x2000 -; CHECK-NEXT: .cfi_offset %rbx, -24 -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx -; CHECK-NEXT: movw %si, %cx -; CHECK-NEXT: movw %di, %ax -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $cl -; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) 
-; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: movl $buf, %esi -; CHECK-NEXT: movl $32, %edi -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdi), %tmm4 -; CHECK-NEXT: movabsq $64, %rbx -; CHECK-NEXT: tilestored %tmm5, (%rsp,%rbx) # 1024-byte Folded Spill -; CHECK-NEXT: tileloadd (%rsp,%rbx), %tmm0 # 1024-byte Folded Reload -; CHECK-NEXT: movabsq $64, %rbx -; CHECK-NEXT: tilestored %tmm4, 1024(%rsp,%rbx) # 1024-byte Folded Spill -; CHECK-NEXT: tileloadd 1024(%rsp,%rbx), %tmm1 # 1024-byte Folded Reload -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm1, (%rsi,%rdi) -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) -; CHECK-NEXT: tilezero %tmm0 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm1 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm2 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; CHECK-NEXT: movl $buf2, %edx -; CHECK-NEXT: movl $32, %esi -; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) -; CHECK-NEXT: leaq -8(%rbp), %rsp -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -entry: - %0 = tail call { x86_amx, x86_amx } 
@llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #3 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #3 - %3 = extractvalue { x86_amx, x86_amx } %0, 1 - %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #3 - %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #3 - %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #3 - %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #3 - %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #3 - %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #3 - %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #3 - %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #3 - %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #3 - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #3 - ret void -} - -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1 - -declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 - -declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 - -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 - -declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 - -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4 - -attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose" } -attributes #1 = { argmemonly nofree nounwind readonly } -attributes #2 = { nofree nosync nounwind readnone } -attributes #3 = { nounwind } -attributes #4 = { argmemonly nounwind writeonly } - -!llvm.module.flags = !{!0, !1, !2} - -!0 = !{i32 1, 
!"wchar_size", i32 4} -!1 = !{i32 7, !"uwtable", i32 2} -!2 = !{i32 7, !"frame-pointer", i32 2} diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir deleted file mode 100644 index ab12ab3a4f13d..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir +++ /dev/null @@ -1,165 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=fasttileconfig -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: [] -liveins: - - { reg: '$edi', virtual-reg: '' } - - { reg: '$esi', virtual-reg: '' } - - { reg: '$edx', virtual-reg: '' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1024 - adjustsStack: false - hasCalls: true - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - 
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 2, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 3, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 4, name: '', type: default, offset: 0, size: 64, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 5, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 6, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 7, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $rdi, $rsi, $rdx, $rax - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $rdi, $rsi, $rdx, $rax - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $zmm0 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, 
align 4) - ; CHECK-NEXT: renamable $rcx = MOV32ri64 64 - ; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7) - ; CHECK-NEXT: renamable $cx = MOV16ri 64 - ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5) - ; CHECK-NEXT: renamable $cx = MOV16ri 16 - ; CHECK-NEXT: renamable $r8w = MOV16ri 16 - ; CHECK-NEXT: MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 50, $noreg, $al :: (store (s512) into %stack.4 + 50, align 2, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 20, $noreg, $cx :: (store (s512) into %stack.4 + 20, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 49, $noreg, $al :: (store (s512) into %stack.4 + 49, align 1, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 18, $noreg, $di :: (store (s512) into %stack.4 + 18, align 2, basealign 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 52, $noreg, $al :: (store (s512) into %stack.4 + 52, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 24, $noreg, $cx :: (store (s512) into %stack.4 + 24, align 4) - ; 
CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 53, $noreg, $al :: (store (s512) into %stack.4 + 53, align 1, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 26, $noreg, $di :: (store (s512) into %stack.4 + 26, align 2, basealign 4) - ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4) - ; CHECK-NEXT: renamable $r9 = COPY $rsi - ; CHECK-NEXT: $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) - ; CHECK-NEXT: renamable $r8 = COPY $rdi - ; CHECK-NEXT: $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6) - ; CHECK-NEXT: renamable $r10 = COPY $rax - ; CHECK-NEXT: $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5) - ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg - ; CHECK-NEXT: renamable $tmm0 = COPY renamable $tmm5 - ; CHECK-NEXT: renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 - ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1 - ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - ; CHECK-NEXT: renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx - ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg - ; CHECK-NEXT: renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg - ; CHECK-NEXT: renamable $tmm2 
= PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg - ; CHECK-NEXT: renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 - ; CHECK-NEXT: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0 - renamable $zmm0 = AVX512_512_SET0 - VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4) - MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) - renamable $rcx = MOV32ri64 64 - MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7) - renamable $cx = MOV16ri 64 - MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5) - renamable $cx = MOV16ri 16 - renamable $r8w = MOV16ri 16 - MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6) - PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4) - renamable $r9 = COPY $rsi - $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) - renamable $r8 = COPY $rdi - $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6) - renamable $r10 = COPY $rax - $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5) - renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg - renamable $tmm0 = COPY renamable $tmm5 - renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 - PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1 - PTILESTOREDV renamable $ax, renamable $di, 
renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx - PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg - renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg - renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg - renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 - PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0 -... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir deleted file mode 100644 index c7d241f8a98b6..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir +++ /dev/null @@ -1,153 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=greedy,tileconfig -o - %s | FileCheck %s - ---- | - @buf = dso_local global [2048 x i8] zeroinitializer, align 16 - @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 - - define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { - entry: - %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf, i64 0, i64 0), i64 32) #5 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = extractvalue { x86_amx, x86_amx } %0, 1 - %3 = tail call x86_amx 
@llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5 - %4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %3, x86_amx %1, x86_amx %2) #5 - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf2, i64 0, i64 0), i64 32, x86_amx %4) #5 - ret void - } - - declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) #1 - - declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 - - declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 - - declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 - - declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 - - declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #4 - - attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } - attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #5 = { nounwind } - -... 
---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: gr32, preferred-register: '' } - - { id: 1, class: gr32, preferred-register: '' } - - { id: 2, class: gr32, preferred-register: '' } - - { id: 3, class: gr16, preferred-register: '' } - - { id: 4, class: gr16, preferred-register: '' } - - { id: 5, class: gr16, preferred-register: '' } - - { id: 6, class: gr64, preferred-register: '' } - - { id: 7, class: gr64_nosp, preferred-register: '' } - - { id: 8, class: tilepair, preferred-register: '' } - - { id: 9, class: tile, preferred-register: '' } - - { id: 10, class: tile, preferred-register: '' } - - { id: 11, class: tile, preferred-register: '' } - - { id: 12, class: tile, preferred-register: '' } - - { id: 13, class: gr64, preferred-register: '' } - - { id: 14, class: vr512, preferred-register: '' } -liveins: - - { reg: '$edi', virtual-reg: '%0' } - - { reg: '$esi', virtual-reg: '%1' } - - { reg: '$edx', virtual-reg: '%2' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 4 - adjustsStack: false - hasCalls: false - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', 
debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $edi, $esi, $edx - - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $edi, $esi, $edx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi - ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 26, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 26, align 2, basealign 4) - ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 53, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 53, align 1, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 24, $noreg, [[COPY1]].sub_16bit :: (store (s512) into %stack.0 + 24, align 4) - ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 52, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 52, align 4) - ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 16, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 16, align 4) - ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 48, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 48, align 4) - ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64 = MOV32ri64 @buf - ; CHECK-NEXT: [[MOV32ri64_1:%[0-9]+]]:gr64_nosp = MOV32ri64 32 - ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V 
[[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[COPY]].sub_16bit, [[MOV32ri64_]], 1, [[MOV32ri64_1]], 0, $noreg - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY2]].sub_16bit, [[COPY]].sub_16bit, [[COPY1]].sub_16bit, [[PTILEZEROV]], [[PT2RPNTLVWZ0V]].sub_t0, [[PT2RPNTLVWZ0V]].sub_t1 - ; CHECK-NEXT: [[MOV32ri64_2:%[0-9]+]]:gr64 = MOV32ri64 @buf2 - ; CHECK-NEXT: PTILESTOREDV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[MOV32ri64_2]], 1, [[MOV32ri64_1]], 0, $noreg, [[PTILEZEROV]] - ; CHECK-NEXT: RET 0 - %2:gr32 = COPY $edx - %1:gr32 = COPY $esi - %0:gr32 = COPY $edi - %14:vr512 = AVX512_512_SET0 - VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, %14 :: (store (s512) into %stack.0, align 4) - MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) - PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) - %6:gr64 = MOV32ri64 @buf - %7:gr64_nosp = MOV32ri64 32 - %8:tilepair = PT2RPNTLVWZ0V %0.sub_16bit, %1.sub_16bit, %2.sub_16bit, %6, 1, %7, 0, $noreg - %12:tile = PTILEZEROV %0.sub_16bit, %1.sub_16bit - %12:tile = PTDPBSSDV %0.sub_16bit, %2.sub_16bit, %1.sub_16bit, %12, %8.sub_t0, %8.sub_t1 - %13:gr64 = MOV32ri64 @buf2 - PTILESTOREDV %0.sub_16bit, %1.sub_16bit, %13, 1, %7, 0, $noreg, %12 - RET 0 - -... 
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir b/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir deleted file mode 100644 index 66b15aa5b3cde..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir +++ /dev/null @@ -1,97 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=lowertilecopy -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: [] -liveins: - - { reg: '$edi', virtual-reg: '' } - - { reg: '$esi', virtual-reg: '' } - - { reg: '$edx', virtual-reg: '' } - - { reg: '$cx', virtual-reg: '' } - - { reg: '$r9', virtual-reg: '' } - - { reg: '$r10', virtual-reg: '' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1024 - adjustsStack: false - hasCalls: true - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 43, name: '', type: default, offset: 0, size: 64, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 68, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - 
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9 - - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) - ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg - ; CHECK-NEXT: $rax = MOV64ri 64 - ; CHECK-NEXT: TILESTORED %stack.3, 1, $rax, 0, $noreg, $tmm5 :: (store (s8192) into %stack.3) - ; CHECK-NEXT: $tmm0 = TILELOADD %stack.3, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.3) - ; CHECK-NEXT: $rax = MOV64ri 64 - ; CHECK-NEXT: TILESTORED %stack.2, 1, $rax, 0, $noreg, $tmm4 :: (store (s8192) into %stack.2) - ; CHECK-NEXT: $tmm1 = TILELOADD %stack.2, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.2) - ; CHECK-NEXT: renamable $r8 = MOV32ri64 64 - ; CHECK-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.1) - ; CHECK-NEXT: renamable $di = MOV16ri 64 - ; CHECK-NEXT: renamable $cx = MOV16ri 16 - ; CHECK-NEXT: PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1 - ; CHECK-NEXT: PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 - PLDTILECFGV %stack.43, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, 
implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.43, align 4) - renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg - renamable $tmm0 = COPY renamable $tmm5 - renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 - renamable $r8 = MOV32ri64 64 - MOV64mr %stack.68, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.68) - renamable $di = MOV16ri 64 - renamable $cx = MOV16ri 16 - PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1 - PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 - -... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll deleted file mode 100644 index 3549875e858a9..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll +++ /dev/null @@ -1,87 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py - ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s - ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s - - @buf = dso_local global [2048 x i8] zeroinitializer, align 16 - - ; Function Attrs: noinline nounwind optnone uwtable - define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1, ptr %m) #0 { -; CHECK-LABEL: @test_tile_2rpntlvwz0( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = udiv i16 [[COL1:%.*]], 4 -; CHECK-NEXT: [[TMP1:%.*]] = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]] -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { 
x86_amx, x86_amx } [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M:%.*]], i64 [[TMP3]], x86_amx [[TMP2]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[COL1]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP6]], x86_amx [[TMP5]]) -; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP9:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP9]], x86_amx [[TMP8]]) -; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP11]]) -; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[COL1]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP14]]) -; CHECK-NEXT: [[TMP17:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP0]], i16 [[COL0]], ptr [[M]], i64 [[TMP17]]) -; CHECK-NEXT: [[TMP20:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL0]], i16 [[COL1]], x86_amx [[TMP13]], x86_amx [[TMP16]], x86_amx [[TMP19]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP21:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP21]], x86_amx [[TMP20]]) -; CHECK-NEXT: ret void -; - entry: - - %0 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr getelementptr inbounds ([2048 x i8], ptr @buf, i64 0, i64 0), i64 32) #7 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = call <256 x i32> 
@llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #7 - store <256 x i32> %2, ptr %m, align 1024 - - %3 = extractvalue { x86_amx, x86_amx } %0, 1 - %4 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #7 - store <256 x i32> %4, ptr %m, align 1024 - - %5 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #7 - %6 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #7 - store <256 x i32> %6, ptr %m, align 64 - - %7 = load <256 x i32>, ptr %m, align 64 - %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %7) #7 - %9 = load <256 x i32>, ptr %m, align 64 - %10 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %9) #7 - %11 = load <256 x i32>, ptr %m, align 64 - %12 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #7 - - %13 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col0, i16 %col1, x86_amx %8, x86_amx %10, x86_amx %12) #7 - %14 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %13) #7 - store <256 x i32> %14, ptr %m, align 64 - - ret void - } - - ; Function Attrs: argmemonly nounwind readonly - declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #2 - - ; Function Attrs: nounwind readnone - declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #3 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #4 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #4 - - ; Function Attrs: nounwind readnone - declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #3 - - ; Function Attrs: argmemonly nounwind writeonly - declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #5 - - attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" 
"target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } - attributes #1 = { argmemonly nofree nounwind willreturn writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #2 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #3 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #4 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #5 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #6 = { argmemonly nofree nounwind willreturn "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #7 = { nounwind } diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll deleted file mode 100644 index 96966264e0515..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll +++ /dev/null @@ -1,61 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s -; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s - - @buf = dso_local global [2048 x i8] zeroinitializer, align 16 - @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 - - ; Function Attrs: nounwind uwtable - define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { -; CHECK-LABEL: @test_tile_2rpntlvwz0( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 
[[COL0:%.*]], i16 [[COL1:%.*]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL1]], i16 [[COL0]], x86_amx [[TMP3]], x86_amx [[TMP1]], x86_amx [[TMP2]]) #[[ATTR3]] -; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr @buf2, i64 32, x86_amx [[TMP4]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; - entry: - %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #5 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #5 - %3 = extractvalue { x86_amx, x86_amx } %0, 1 - %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #5 - %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5 - %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #5 - %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #5 - %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #5 - %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #5 - %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #5 - %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #5 - %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #5 - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #5 - ret void - } - - ; Function Attrs: argmemonly nounwind readonly - declare { x86_amx, x86_amx } 
@llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1 - - ; Function Attrs: nounwind readnone - declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 - - ; Function Attrs: nounwind readnone - declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 - - ; Function Attrs: argmemonly nounwind writeonly - declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4 - - attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } - attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #5 = { nounwind } diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir deleted file mode 100644 index 1e3b242bca96c..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir +++ /dev/null @@ -1,134 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose 
-run-pass=fastpretileconfig -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: gr64_nosp, preferred-register: '' } - - { id: 1, class: gr16, preferred-register: '' } - - { id: 2, class: gr16, preferred-register: '' } - - { id: 3, class: gr16, preferred-register: '' } - - { id: 4, class: gr64, preferred-register: '' } - - { id: 5, class: gr64, preferred-register: '' } - - { id: 6, class: gr64, preferred-register: '' } - - { id: 7, class: gr64_nosp, preferred-register: '' } - - { id: 8, class: tilepair, preferred-register: '' } - - { id: 9, class: tile, preferred-register: '' } - - { id: 10, class: tile, preferred-register: '' } - - { id: 11, class: tile, preferred-register: '' } - - { id: 181, class: tile, preferred-register: '' } - - { id: 183, class: tile, preferred-register: '' } - - { id: 185, class: tile, preferred-register: '' } - - { id: 186, class: tile, preferred-register: '' } -liveins: - - { reg: '$edi', virtual-reg: '%0' } - - { reg: '$esi', virtual-reg: '%1' } - - { reg: '$edx', virtual-reg: '%2' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1024 - adjustsStack: false - hasCalls: true - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 18, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, 
callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 19, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 20, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 21, name: '', type: default, offset: 0, size: 8, - alignment: 8, stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $rdi, $rsi, $rdx, $rax - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $rdi, $rsi, $rdx, $rax - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 64 - ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 64 - ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 16 - ; CHECK-NEXT: [[MOV16ri2:%[0-9]+]]:gr16 = MOV16ri 16 - ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.4, align 4) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi - ; CHECK-NEXT: 
[[COPY2:%[0-9]+]]:gr64 = COPY $rdx - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY $rax - ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[COPY2]], 1, killed [[COPY3]], 0, $noreg - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0 - ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY5]] - ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY4]] - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV16ri]], [[MOV16ri1]] - ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTILEZEROV]] - ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[PTILELOADDV]], killed [[PTILELOADDV1]], killed [[PTILELOADDV2]] - ; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri]], killed [[MOV16ri1]], killed [[COPY2]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] - %0:gr64_nosp = MOV32ri64 64 - %1:gr16 = MOV16ri 64 - %2:gr16 = MOV16ri 16 - %3:gr16 = MOV16ri 16 - %4:gr64 = COPY $rsi - %5:gr64 = COPY $rdi - %6:gr64 = COPY $rdx - %7:gr64_nosp = COPY $rax - %8:tilepair = PT2RPNTLVWZ0V %1, %2, %3, %6, 1, killed %7, 0, $noreg - %9:tile = COPY %8.sub_t1 - %10:tile = COPY %8.sub_t0 - PTILESTOREDV %1, %2, %4, 1, %0, 0, $noreg, killed %10 - PTILESTOREDV %1, %3, %5, 1, %0, 0, $noreg, killed %9 - %11:tile = PTILEZEROV %1, %2 - PTILESTOREDV %1, %2, %6, 1, %0, 0, 
$noreg, killed %11 - %181:tile = PTILELOADDV %1, %2, %4, 1, %0, 0, $noreg - %183:tile = PTILELOADDV %1, %3, %5, 1, %0, 0, $noreg - %185:tile = PTILELOADDV %1, %2, %6, 1, %0, 0, $noreg - %186:tile = PTDPBSSDV %1, %2, %3, %181, killed %183, killed %185 - PTILESTOREDV killed %1, killed %2, killed %6, 1, killed %0, 0, $noreg, killed %186 -... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir deleted file mode 100644 index ac2cdb4a50568..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir +++ /dev/null @@ -1,113 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=tilepreconfig -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: gr32, preferred-register: '' } - - { id: 1, class: gr32, preferred-register: '' } - - { id: 2, class: gr32, preferred-register: '' } - - { id: 3, class: gr16, preferred-register: '' } - - { id: 4, class: gr16, preferred-register: '' } - - { id: 5, class: gr16, preferred-register: '' } - - { id: 6, class: gr64, preferred-register: '' } - - { id: 7, class: gr64_nosp, preferred-register: '' } - - { id: 8, class: tilepair, preferred-register: '' } - - { id: 9, class: tile, preferred-register: '' } - - { id: 10, class: tile, preferred-register: '' } - - { id: 11, class: tile, preferred-register: '' } - - { id: 12, class: tile, preferred-register: '' } - - { id: 13, class: gr64, preferred-register: '' } -liveins: - - { reg: '$edi', 
virtual-reg: '%0' } - - { reg: '$esi', virtual-reg: '%1' } - - { reg: '$edx', virtual-reg: '%2' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1 - adjustsStack: false - hasCalls: false - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: [] -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $edi, $esi, $edx, $rax, $rbx - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $edi, $esi, $edx, $rax, $rbx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr16 = COPY [[COPY1]].sub_16bit - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr16 = COPY [[COPY2]].sub_16bit - ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.0, align 4) - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY $rax - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32 - ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY5]], [[COPY4]], 
[[COPY3]], killed [[COPY6]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0 - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY5]], [[COPY4]] - ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY5]], [[COPY3]], [[COPY4]], [[PTILEZEROV]], killed [[COPY8]], killed [[COPY7]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY $rbx - ; CHECK-NEXT: PTILESTOREDV [[COPY5]], [[COPY4]], killed [[COPY9]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] - ; CHECK-NEXT: RET 0 - %2:gr32 = COPY $edx - %1:gr32 = COPY $esi - %0:gr32 = COPY $edi - %3:gr16 = COPY %2.sub_16bit - %4:gr16 = COPY %1.sub_16bit - %5:gr16 = COPY %0.sub_16bit - %6:gr64 = COPY $rax - %7:gr64_nosp = MOV32ri64 32 - %8:tilepair = PT2RPNTLVWZ0V %5, %4, %3, killed %6, 1, %7, 0, $noreg - %9:tile = COPY %8.sub_t1 - %10:tile = COPY %8.sub_t0 - %11:tile = PTILEZEROV %5, %4 - %12:tile = PTDPBSSDV %5, %3, %4, %11, killed %10, killed %9 - %13:gr64 = COPY $rbx - PTILESTOREDV %5, %4, killed %13, 1, %7, 0, $noreg, killed %12 - RET 0 - -... 
diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll deleted file mode 100644 index 4cfd97afe721b..0000000000000 --- a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll +++ /dev/null @@ -1,371 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR - -define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 { -; CHECK-LABEL: test_amx: -; CHECK: # %bb.0: -; CHECK-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 -; CHECK-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 -; CHECK-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 -; CHECK-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 -; CHECK-NEXT: ttransposed %tmm3, %tmm1 -; CHECK-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4 -; CHECK-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: tconjtfp16 %tmm2, %tmm1 -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx: -; EGPR: # %bb.0: -; EGPR-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x04,0x31] -; EGPR-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x31] -; EGPR-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x04,0x31] -; EGPR-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x31] -; EGPR-NEXT: ttransposed %tmm3, %tmm1 # encoding: [0xc4,0xe2,0x7a,0x5f,0xcb] -; EGPR-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6c,0xca] -; EGPR-NEXT: 
ttdpfp16ps %tmm6, %tmm5, %tmm4 # encoding: [0xc4,0xe2,0x4b,0x6c,0xe5] -; EGPR-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x63,0x6b,0xca] -; EGPR-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6b,0xca] -; EGPR-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6b,0xca] -; EGPR-NEXT: tconjtfp16 %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x79,0x6b,0xca] -; EGPR-NEXT: retq # encoding: [0xc3] - call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1t1(i8 2, i8* %addr1, i64 %stride) - call void @llvm.x86.ttransposed(i8 1, i8 3) - call void @llvm.x86.ttdpbf16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.ttdpfp16ps(i8 4, i8 5, i8 6) - call void @llvm.x86.ttcmmimfp16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.ttcmmrlfp16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.tconjtcmmimfp16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.tconjtfp16(i8 1, i8 2) - ret void -} - -declare void @llvm.x86.t2rpntlvwz0(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.t2rpntlvwz0t1(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.t2rpntlvwz1(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.t2rpntlvwz1t1(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.ttransposed(i8 %tile0, i8 %tile1) -declare void @llvm.x86.ttdpbf16ps(i8 %tile0, i8 %tile1, i8 %tile2) -declare void @llvm.x86.ttdpfp16ps(i8 %tile0, i8 %tile1, i8 %tile2) -declare void @llvm.x86.ttcmmimfp16ps(i8 %A, i8 %B, i8 %C) -declare void @llvm.x86.ttcmmrlfp16ps(i8 %A, i8 %B, i8 %C) -declare void @llvm.x86.tconjtcmmimfp16ps(i8 %A, i8 %B, i8 %C) -declare void @llvm.x86.tconjtfp16(i8 %A, i8 %B) - -define void @test_amx2(i8* %pointer, i8* %base, i64 %stride) #0 { -; CHECK-LABEL: test_amx2: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: subq $2928, %rsp # imm = 0xB70 -; 
CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, %ax -; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0 -; CHECK-NEXT: tilezero %tmm1 -; CHECK-NEXT: tilezero %tmm2 -; CHECK-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: movabsq $64, %rbp -; CHECK-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill -; CHECK-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload -; CHECK-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 -; CHECK-NEXT: tconjtfp16 %tmm3, %tmm0 -; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx) -; CHECK-NEXT: addq $2928, %rsp # imm = 0xB70 -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx2: -; EGPR: # %bb.0: -; EGPR-NEXT: pushq %rbp # encoding: [0x55] -; EGPR-NEXT: subq $2928, %rsp # encoding: [0x48,0x81,0xec,0x70,0x0b,0x00,0x00] -; EGPR-NEXT: # imm = 0xB70 -; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0x0d] -; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x40,0x03,0x00,0x00,0x01] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x70,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x50,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: 
[0xc6,0x84,0x24,0x71,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x52,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x72,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x54,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x73,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x56,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0x40,0x03,0x00,0x00] -; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] -; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] -; EGPR-NEXT: tilezero %tmm1 # encoding: [0xc4,0xe2,0x7b,0x49,0xc8] -; EGPR-NEXT: tilezero %tmm2 # encoding: [0xc4,0xe2,0x7b,0x49,0xd0] -; EGPR-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6c,0xd0] -; EGPR-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6c,0xd0] -; EGPR-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6b,0xd0] -; EGPR-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6b,0xd0] -; EGPR-NEXT: movabsq $64, %rbp # encoding: [0x48,0xbd,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; EGPR-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill -; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x94,0x2c,0x80,0x03,0x00,0x00] -; EGPR-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload -; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x2c,0x80,0x03,0x00,0x00] -; EGPR-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 # encoding: [0xc4,0xe2,0x70,0x6b,0xd8] -; EGPR-NEXT: tconjtfp16 %tmm3, %tmm0 # encoding: [0xc4,0xe2,0x79,0x6b,0xc3] -; EGPR-NEXT: tilestored %tmm2, (%rdi,%rdx) # EVEX TO VEX 
Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x14,0x17] -; EGPR-NEXT: addq $2928, %rsp # encoding: [0x48,0x81,0xc4,0x70,0x0b,0x00,0x00] -; EGPR-NEXT: # imm = 0xB70 -; EGPR-NEXT: popq %rbp # encoding: [0x5d] -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; EGPR-NEXT: retq # encoding: [0xc3] - - %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) - %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) - %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) - %c1 = call x86_amx @llvm.x86.ttdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) - %c2 = call x86_amx @llvm.x86.ttdpfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b) - %c3 = call x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c2, x86_amx %a, x86_amx %b) - %c4 = call x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c3, x86_amx %a, x86_amx %b) - %c5 = call x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c4, x86_amx %a, x86_amx %b) - %c6 = call x86_amx @llvm.x86.tconjtfp16.internal(i16 8, i16 8, x86_amx %c5) - - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c4) - ret void -} - -define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 { -; CHECK-LABEL: test_amx3: -; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movw $8, %cx -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: 
t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: ttransposed %tmm4, %tmm0 -; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx) -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx3: -; EGPR: # %bb.0: -; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xff] -; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf0,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd0,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] -; EGPR-NEXT: movb $0, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x00] -; EGPR-NEXT: movw $0, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x00,0x00] -; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] -; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; EGPR-NEXT: movw $8, %cx # encoding: [0x66,0xb9,0x08,0x00] -; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x24,0x16] -; EGPR-NEXT: ttransposed %tmm4, %tmm0 # encoding: [0xc4,0xe2,0x7a,0x5f,0xc4] -; EGPR-NEXT: tilestored %tmm0, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x17] -; EGPR-NEXT: tilerelease 
# encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; EGPR-NEXT: retq # encoding: [0xc3] - %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %5 = extractvalue { x86_amx, x86_amx } %4, 0 - %6 = call x86_amx @llvm.x86.ttransposed.internal(i16 8, i16 8, x86_amx %5) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %6) - ret void -} - -define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 { -; CHECK-LABEL: test_amx_spill: -; CHECK: # %bb.0: -; CHECK-NEXT: subq $6088, %rsp # imm = 0x17C8 -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, %ax -; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0 -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: movabsq $64, %rcx -; CHECK-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm6, 
1984(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; CHECK-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; CHECK-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; CHECK-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm6, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm7, (%rsi,%rdx) -; CHECK-NEXT: addq $6088, %rsp # imm = 0x17C8 -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx_spill: -; EGPR: # %bb.0: -; EGPR-NEXT: subq $6088, %rsp # encoding: [0x48,0x81,0xec,0xc8,0x17,0x00,0x00] -; EGPR-NEXT: # imm = 0x17C8 -; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xfe] -; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x80,0x01] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb0,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x90,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # 
encoding: [0xc6,0x44,0x24,0xb4,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x98,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb5,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9a,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb6,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9c,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb7,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9e,0x08,0x00] -; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x80] -; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] -; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] -; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x34,0x16] -; EGPR-NEXT: movabsq $64, %rcx # encoding: [0x48,0xb9,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; EGPR-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x0f,0x00,0x00] -; EGPR-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x13,0x00,0x00] -; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x34,0x16] -; EGPR-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x07,0x00,0x00] -; EGPR-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x0b,0x00,0x00] -; EGPR-NEXT: t2rpntlvwz1t1 
(%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x34,0x16] -; EGPR-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0x74,0x0c,0xc0] -; EGPR-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x03,0x00,0x00] -; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x34,0x16] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x0f,0x00,0x00] -; EGPR-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x13,0x00,0x00] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x07,0x00,0x00] -; EGPR-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x0b,0x00,0x00] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0x64,0x0c,0xc0] -; EGPR-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: 
[0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x03,0x00,0x00] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tilestored %tmm6, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x34,0x16] -; EGPR-NEXT: tilestored %tmm7, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x3c,0x16] -; EGPR-NEXT: addq $6088, %rsp # encoding: [0x48,0x81,0xc4,0xc8,0x17,0x00,0x00] -; EGPR-NEXT: # imm = 0x17C8 -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; EGPR-NEXT: retq # encoding: [0xc3] - %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) - %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b5 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %e11 = extractvalue { x86_amx, x86_amx } %b1, 0 - %e12 = extractvalue { x86_amx, x86_amx } %b1, 1 - %e21 = extractvalue { x86_amx, x86_amx } %b2, 0 - %e22 = extractvalue { x86_amx, x86_amx } %b2, 1 - %e31 = extractvalue { x86_amx, x86_amx } %b3, 0 - %e32 = extractvalue { x86_amx, x86_amx } %b3, 1 - %e41 = extractvalue { x86_amx, x86_amx } %b4, 0 - %e42 = extractvalue { x86_amx, x86_amx } %b4, 1 - %e51 = extractvalue { x86_amx, x86_amx } %b5, 0 - %e52 = extractvalue { x86_amx, x86_amx } %b5, 1 - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx 
%e11) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e12) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e21) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e22) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e31) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e32) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e41) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e42) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e51) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e52) - ret void -} - -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) -declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16, i16, i16, i8*, i64) -declare x86_amx @llvm.x86.ttransposed.internal(i16, i16, x86_amx) -declare x86_amx @llvm.x86.ttdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttdpfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tconjtfp16.internal(i16, i16, x86_amx) - -attributes #0 = { nounwind } 
diff --git a/llvm/test/CodeGen/X86/ipra-reg-usage.ll b/llvm/test/CodeGen/X86/ipra-reg-usage.ll index e73ff791dc423..f270f8fc741aa 100644 --- a/llvm/test/CodeGen/X86/ipra-reg-usage.ll +++ b/llvm/test/CodeGen/X86/ipra-reg-usage.ll @@ -7,7 +7,7 @@ target triple = "x86_64-unknown-unknown" declare void @bar1() define preserve_allcc void @foo()#0 { -; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w 
$r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh +; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh call void @bar1() call void @bar2() ret void @@ -15,7 +15,7 @@ define 
preserve_allcc void @foo()#0 { declare void @bar2() define preserve_nonecc void @foo2()#0 { -; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d 
$r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh +; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh 
$r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh call void @bar1() call void @bar2() ret void diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt index 57e3153da401b..5c2927afbda4c 100755 --- a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt +++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt @@ -1,70 +1,6 @@ # RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s -check-prefix=ATT # RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s -check-prefix=INTEL -# ATT: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6 -# INTEL: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456] -0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10 - -# ATT: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2 -# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291] -0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00 - -# ATT: t2rpntlvwz0rs 64(%rbx), %tmm6 -# INTEL: t2rpntlvwz0rs tmm6, [rbx + 64] -0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40 - -# ATT: t2rpntlvwz0rs -32(,%rbp,2), %tmm2 -# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32] -0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff - -# ATT: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6 -# INTEL: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456] -0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10 - -# ATT: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2 -# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291] -0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00 - -# ATT: t2rpntlvwz0rst1 64(%rbx), %tmm6 -# INTEL: t2rpntlvwz0rst1 tmm6, [rbx + 64] -0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40 - -# ATT: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 -# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32] 
-0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff - -# ATT: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6 -# INTEL: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456] -0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10 - -# ATT: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2 -# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291] -0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00 - -# ATT: t2rpntlvwz1rs 64(%rbx), %tmm6 -# INTEL: t2rpntlvwz1rs tmm6, [rbx + 64] -0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40 - -# ATT: t2rpntlvwz1rs -32(,%rbp,2), %tmm2 -# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32] -0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff - -# ATT: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6 -# INTEL: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456] -0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10 - -# ATT: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2 -# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291] -0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00 - -# ATT: t2rpntlvwz1rst1 64(%rbx), %tmm6 -# INTEL: t2rpntlvwz1rst1 tmm6, [rbx + 64] -0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40 - -# ATT: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 -# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32] -0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff - # ATT: tileloaddrs 268435456(%rbp,%r14,8), %tmm6 # INTEL: tileloaddrs tmm6, [rbp + 8*r14 + 268435456] 0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10 @@ -97,70 +33,6 @@ # INTEL: tileloaddrst1 tmm3, [2*rbp - 32] 0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff -# ATT: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 -# INTEL: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] -0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10 - -# ATT: t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 -# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] -0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00 - -# ATT: t2rpntlvwz0rs 64(%r18), %tmm6 -# INTEL: t2rpntlvwz0rs tmm6, [r18 + 64] -0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40 - -# ATT: t2rpntlvwz0rs -32(,%rbp,2), %tmm2 -# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32] 
-0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff - -# ATT: t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 -# INTEL: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] -0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10 - -# ATT: t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 -# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] -0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00 - -# ATT: t2rpntlvwz0rst1 64(%r18), %tmm6 -# INTEL: t2rpntlvwz0rst1 tmm6, [r18 + 64] -0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40 - -# ATT: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 -# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32] -0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff - -# ATT: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 -# INTEL: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] -0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10 - -# ATT: t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 -# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] -0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00 - -# ATT: t2rpntlvwz1rs 64(%r18), %tmm6 -# INTEL: t2rpntlvwz1rs tmm6, [r18 + 64] -0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40 - -# ATT: t2rpntlvwz1rs -32(,%rbp,2), %tmm2 -# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32] -0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff - -# ATT: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 -# INTEL: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] -0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10 - -# ATT: t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 -# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] -0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00 - -# ATT: t2rpntlvwz1rst1 64(%r18), %tmm6 -# INTEL: t2rpntlvwz1rst1 tmm6, [r18 + 64] -0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40 - -# ATT: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 -# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32] -0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff - # ATT: tileloaddrs 268435456(%r16,%r14,8), %tmm6 # INTEL: tileloaddrs tmm6, [r16 + 8*r14 + 268435456] 0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10 
diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt index f372c42982b1b..347e61cdfc4b8 100644 --- a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt +++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt @@ -9,11 +9,3 @@ # INTEL: tmmultf32ps tmm3, tmm2, tmm1 0xc4,0xe2,0x71,0x48,0xda -# ATT: ttmmultf32ps %tmm4, %tmm5, %tmm6 -# INTEL: ttmmultf32ps tmm6, tmm5, tmm4 -0xc4,0xe2,0x58,0x48,0xf5 - -# ATT: ttmmultf32ps %tmm1, %tmm2, %tmm3 -# INTEL: ttmmultf32ps tmm3, tmm2, tmm1 -0xc4,0xe2,0x70,0x48,0xda - diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt deleted file mode 100644 index d768630ac1475..0000000000000 --- a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt +++ /dev/null @@ -1,154 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT -# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL - -# ATT: t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4 -# INTEL: t2rpntlvwz0 tmm4, [rbp + 8*r14 + 268435456] -0xc4,0xa2,0x78,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10 - -# ATT: t2rpntlvwz0 291(%r8,%rax,4), %tmm2 -# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*rax + 291] -0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00 - -# ATT: t2rpntlvwz0 -32(,%rbp,2), %tmm2 -# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32] -0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff - -# ATT: t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm4 -# INTEL: t2rpntlvwz0t1 tmm4, [rbp + 8*r14 + 268435456] -0xc4,0xa2,0x78,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10 - -# ATT: t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2 -# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291] -0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00 - -# ATT: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 -# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32] -0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff - -# ATT: t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm4 
-# INTEL: t2rpntlvwz1 tmm4, [rbp + 8*r14 + 268435456] -0xc4,0xa2,0x79,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10 - -# ATT: t2rpntlvwz1 291(%r8,%rax,4), %tmm2 -# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*rax + 291] -0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00 - -# ATT: t2rpntlvwz1 -32(,%rbp,2), %tmm2 -# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32] -0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff - -# ATT: t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm4 -# INTEL: t2rpntlvwz1t1 tmm4, [rbp + 8*r14 + 268435456] -0xc4,0xa2,0x79,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10 - -# ATT: t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2 -# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291] -0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00 - -# ATT: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 -# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32] -0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff - -# ATT: t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 -# INTEL: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] -0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10 - -# ATT: t2rpntlvwz0 291(%r8,%r17,4), %tmm2 -# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] -0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00 - -# ATT: t2rpntlvwz0 -32(,%rbp,2), %tmm2 -# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32] -0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff - -# ATT: t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 -# INTEL: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] -0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10 - -# ATT: t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 -# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] -0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00 - -# ATT: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 -# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32] -0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff - -# ATT: t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 -# INTEL: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] -0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10 - -# ATT: t2rpntlvwz1 291(%r8,%r17,4), %tmm2 -# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] 
-0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00 - -# ATT: t2rpntlvwz1 -32(,%rbp,2), %tmm2 -# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32] -0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff - -# ATT: t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 -# INTEL: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] -0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10 - -# ATT: t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 -# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] -0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00 - -# ATT: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 -# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32] -0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff - -# ATT: ttransposed %tmm1, %tmm2 -# INTEL: ttransposed tmm2, tmm1 -0xc4,0xe2,0x7a,0x5f,0xd1 - -# ATT: ttransposed %tmm2, %tmm3 -# INTEL: ttransposed tmm3, tmm2 -0xc4,0xe2,0x7a,0x5f,0xda - -# ATT: ttdpbf16ps %tmm7, %tmm6, %tmm5 -# INTEL: ttdpbf16ps tmm5, tmm6, tmm7 -0xc4,0xe2,0x42,0x6c,0xee - -# ATT: ttdpbf16ps %tmm1, %tmm2, %tmm3 -# INTEL: ttdpbf16ps tmm3, tmm2, tmm1 -0xc4,0xe2,0x72,0x6c,0xda - -# ATT: ttdpfp16ps %tmm7, %tmm6, %tmm5 -# INTEL: ttdpfp16ps tmm5, tmm6, tmm7 -0xc4,0xe2,0x43,0x6c,0xee - -# ATT: ttdpfp16ps %tmm1, %tmm2, %tmm3 -# INTEL: ttdpfp16ps tmm3, tmm2, tmm1 -0xc4,0xe2,0x73,0x6c,0xda - -# ATT: ttcmmimfp16ps %tmm4, %tmm5, %tmm6 -# INTEL: ttcmmimfp16ps tmm6, tmm5, tmm4 -0xc4,0xe2,0x5b,0x6b,0xf5 - -# ATT: ttcmmimfp16ps %tmm1, %tmm2, %tmm3 -# INTEL: ttcmmimfp16ps tmm3, tmm2, tmm1 -0xc4,0xe2,0x73,0x6b,0xda - -# ATT: ttcmmrlfp16ps %tmm4, %tmm5, %tmm6 -# INTEL: ttcmmrlfp16ps tmm6, tmm5, tmm4 -0xc4,0xe2,0x5a,0x6b,0xf5 - -# ATT: ttcmmrlfp16ps %tmm1, %tmm2, %tmm3 -# INTEL: ttcmmrlfp16ps tmm3, tmm2, tmm1 -0xc4,0xe2,0x72,0x6b,0xda - -# ATT: tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6 -# INTEL: tconjtcmmimfp16ps tmm6, tmm5, tmm4 -0xc4,0xe2,0x58,0x6b,0xf5 - -# ATT: tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3 -# INTEL: tconjtcmmimfp16ps tmm3, tmm2, tmm1 -0xc4,0xe2,0x70,0x6b,0xda - -# ATT: tconjtfp16 %tmm5, %tmm6 -# INTEL: 
tconjtfp16 tmm6, tmm5 -0xc4,0xe2,0x79,0x6b,0xf5 - -# ATT: tconjtfp16 %tmm2, %tmm3 -# INTEL: tconjtfp16 tmm3, tmm2 -0xc4,0xe2,0x79,0x6b,0xda diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s index 92db672e1c82d..497a1c6b7bad5 100755 --- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s @@ -1,69 +1,5 @@ // RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s -// CHECK: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6 -// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz0rs 64(%rbx), %tmm6 -// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40] - t2rpntlvwz0rs 64(%rbx), %tmm6 - -// CHECK: t2rpntlvwz0rs -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0rs -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6 -// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz0rst1 64(%rbx), %tmm6 -// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40] - t2rpntlvwz0rst1 64(%rbx), %tmm6 - -// CHECK: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6 -// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: 
[0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz1rs 64(%rbx), %tmm6 -// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40] - t2rpntlvwz1rs 64(%rbx), %tmm6 - -// CHECK: t2rpntlvwz1rs -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1rs -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6 -// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz1rst1 64(%rbx), %tmm6 -// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40] - t2rpntlvwz1rst1 64(%rbx), %tmm6 - -// CHECK: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 - // CHECK: tileloaddrs 268435456(%rbp,%r14,8), %tmm6 // CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] tileloaddrs 268435456(%rbp,%r14,8), %tmm6 @@ -88,70 +24,6 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] tileloaddrst1 -32(,%rbp,2), %tmm3 -// CHECK: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 -// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 - -// CHECK: t2rpntlvwz0rs 64(%r18), %tmm6 -// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40] - t2rpntlvwz0rs 64(%r18), %tmm6 - -// CHECK: {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2 - -// CHECK: 
t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 -// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 - -// CHECK: t2rpntlvwz0rst1 64(%r18), %tmm6 -// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40] - t2rpntlvwz0rst1 64(%r18), %tmm6 - -// CHECK: {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 -// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 - -// CHECK: t2rpntlvwz1rs 64(%r18), %tmm6 -// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40] - t2rpntlvwz1rs 64(%r18), %tmm6 - -// CHECK: {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 -// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 - -// CHECK: t2rpntlvwz1rst1 64(%r18), %tmm6 -// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40] - t2rpntlvwz1rst1 64(%r18), %tmm6 - -// CHECK: {evex} t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1rst1 -32(,%rbp,2), 
%tmm2 - // CHECK: tileloaddrs 291(%r16,%rax,4), %tmm3 // CHECK: encoding: [0x62,0xfa,0x7f,0x08,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] tileloaddrs 291(%r16,%rax,4), %tmm3 diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s index 140d1aa6b198e..0e030ca415a16 100755 --- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s @@ -1,69 +1,5 @@ // RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s -// CHECK: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz0rs tmm6, [rbx + 64] -// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40] - t2rpntlvwz0rs tmm6, [rbx + 64] - -// CHECK: t2rpntlvwz0rs tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0rs tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz0rst1 tmm6, [rbx + 64] -// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40] - t2rpntlvwz0rst1 tmm6, [rbx + 64] - -// CHECK: t2rpntlvwz0rst1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0rst1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1rs tmm6, 
[rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz1rs tmm6, [rbx + 64] -// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40] - t2rpntlvwz1rs tmm6, [rbx + 64] - -// CHECK: t2rpntlvwz1rs tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1rs tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz1rst1 tmm6, [rbx + 64] -// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40] - t2rpntlvwz1rst1 tmm6, [rbx + 64] - -// CHECK: t2rpntlvwz1rst1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1rst1 tmm2, [2*rbp - 32] - // CHECK: tileloaddrs tmm6, [rbp + 8*r14 + 268435456] // CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] tileloaddrs tmm6, [rbp + 8*r14 + 268435456] @@ -96,70 +32,6 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] tileloaddrst1 tmm3, [2*rbp - 32] -// CHECK: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] - -// CHECK: t2rpntlvwz0rs tmm6, [r18 + 64] -// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40] - t2rpntlvwz0rs tmm6, [r18 + 64] - -// CHECK: {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32] -// CHECK: encoding: 
[0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] - -// CHECK: t2rpntlvwz0rst1 tmm6, [r18 + 64] -// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40] - t2rpntlvwz0rst1 tmm6, [r18 + 64] - -// CHECK: {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] - -// CHECK: t2rpntlvwz1rs tmm6, [r18 + 64] -// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40] - t2rpntlvwz1rs tmm6, [r18 + 64] - -// CHECK: {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] - -// CHECK: t2rpntlvwz1rst1 tmm6, [r18 + 64] -// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40] - t2rpntlvwz1rst1 tmm6, [r18 + 64] - -// CHECK: {evex} t2rpntlvwz1rst1 
tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32] - // CHECK: tileloaddrs tmm6, [r16 + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10] tileloaddrs tmm6, [r16 + 8*r14 + 268435456] diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s index b413597cd9da7..d1d0997b7eec0 100644 --- a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s @@ -8,10 +8,3 @@ // CHECK: encoding: [0xc4,0xe2,0x71,0x48,0xda] tmmultf32ps %tmm1, %tmm2, %tmm3 -// CHECK: ttmmultf32ps %tmm4, %tmm5, %tmm6 -// CHECK: encoding: [0xc4,0xe2,0x58,0x48,0xf5] - ttmmultf32ps %tmm4, %tmm5, %tmm6 - -// CHECK: ttmmultf32ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x70,0x48,0xda] - ttmmultf32ps %tmm1, %tmm2, %tmm3 diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s index 98f55275716eb..b6c0947ee750c 100644 --- a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s @@ -8,10 +8,3 @@ // CHECK: encoding: [0xc4,0xe2,0x71,0x48,0xda] tmmultf32ps tmm3, tmm2, tmm1 -// CHECK: ttmmultf32ps tmm6, tmm5, tmm4 -// CHECK: encoding: [0xc4,0xe2,0x58,0x48,0xf5] - ttmmultf32ps tmm6, tmm5, tmm4 - -// CHECK: ttmmultf32ps tmm3, tmm2, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x70,0x48,0xda] - ttmmultf32ps tmm3, tmm2, tmm1 diff --git a/llvm/test/MC/X86/amx-transpose-att.s b/llvm/test/MC/X86/amx-transpose-att.s deleted file mode 100644 index 5158470f8c905..0000000000000 --- a/llvm/test/MC/X86/amx-transpose-att.s +++ /dev/null @@ -1,153 +0,0 @@ -// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s - -// CHECK: t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4 -// CHECK: encoding: [0xc4,0xa2,0x78,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4 - -// CHECK: t2rpntlvwz0 
291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz0 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm4 -// CHECK: encoding: [0xc4,0xa2,0x78,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm5 - -// CHECK: t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm4 -// CHECK: encoding: [0xc4,0xa2,0x79,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm5 - -// CHECK: t2rpntlvwz1 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm2 -// CHECK: encoding: [0xc4,0xa2,0x79,0x6f,0x94,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm3 - -// CHECK: t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 -// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 - -// CHECK: t2rpntlvwz0 291(%r8,%r17,4), %tmm2 -// CHECK: 
encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0 291(%r8,%r17,4), %tmm2 - -// CHECK: {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 -// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 - -// CHECK: t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 - -// CHECK: {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 -// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 - -// CHECK: t2rpntlvwz1 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1 291(%r8,%r17,4), %tmm2 - -// CHECK: {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 -// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 - -// CHECK: t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 - -// CHECK: {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 - -// CHECK: ttransposed %tmm1, %tmm5 -// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] - ttransposed %tmm1, %tmm5 - -// CHECK: ttransposed %tmm2, %tmm3 -// CHECK: 
encoding: [0xc4,0xe2,0x7a,0x5f,0xda] - ttransposed %tmm2, %tmm3 - -// CHECK: ttdpbf16ps %tmm1, %tmm2, %tmm5 -// CHECK: encoding: [0xc4,0xe2,0x72,0x6c,0xea] - ttdpbf16ps %tmm1, %tmm2, %tmm5 - -// CHECK: ttdpbf16ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x72,0x6c,0xda] - ttdpbf16ps %tmm1, %tmm2, %tmm3 - -// CHECK: ttdpfp16ps %tmm3, %tmm4, %tmm5 -// CHECK: encoding: [0xc4,0xe2,0x63,0x6c,0xec] - ttdpfp16ps %tmm3, %tmm4, %tmm5 - -// CHECK: ttdpfp16ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x73,0x6c,0xda] - ttdpfp16ps %tmm1, %tmm2, %tmm3 - -// CHECK: ttcmmimfp16ps %tmm4, %tmm5, %tmm6 -// CHECK: encoding: [0xc4,0xe2,0x5b,0x6b,0xf5] - ttcmmimfp16ps %tmm4, %tmm5, %tmm6 - -// CHECK: ttcmmimfp16ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x73,0x6b,0xda] - ttcmmimfp16ps %tmm1, %tmm2, %tmm3 - -// CHECK: ttcmmrlfp16ps %tmm4, %tmm5, %tmm6 -// CHECK: encoding: [0xc4,0xe2,0x5a,0x6b,0xf5] - ttcmmrlfp16ps %tmm4, %tmm5, %tmm6 - -// CHECK: ttcmmrlfp16ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x72,0x6b,0xda] - ttcmmrlfp16ps %tmm1, %tmm2, %tmm3 - -// CHECK: tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6 -// CHECK: encoding: [0xc4,0xe2,0x58,0x6b,0xf5] - tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6 - -// CHECK: tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x70,0x6b,0xda] - tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3 - -// CHECK: tconjtfp16 %tmm5, %tmm6 -// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xf5] - tconjtfp16 %tmm5, %tmm6 - -// CHECK: tconjtfp16 %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xda] - tconjtfp16 %tmm2, %tmm3 diff --git a/llvm/test/MC/X86/amx-transpose-intel.s b/llvm/test/MC/X86/amx-transpose-intel.s deleted file mode 100644 index 0d2c22f67a173..0000000000000 --- a/llvm/test/MC/X86/amx-transpose-intel.s +++ /dev/null @@ -1,153 +0,0 @@ -// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -// CHECK: t2rpntlvwz0 tmm6, [rbp + 8*r14 + 
268435456] -// CHECK: encoding: [0xc4,0xa2,0x78,0x6e,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0 tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz0 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz0t1 tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa2,0x78,0x6f,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0t1 tmm7, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz0t1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0t1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1 tmm0, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa2,0x79,0x6e,0x84,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1 tmm1, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1t1 tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa2,0x79,0x6f,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1t1 tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz1t1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1t1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] -// CHECK: 
encoding: [0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] - -// CHECK: {evex} t2rpntlvwz0 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] - -// CHECK: {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] - -// CHECK: {evex} t2rpntlvwz1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] - -// CHECK: {evex} t2rpntlvwz1t1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} 
t2rpntlvwz1t1 tmm2, [2*rbp - 32] - -// CHECK: ttransposed tmm5, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] - ttransposed tmm5, tmm1 - -// CHECK: ttransposed tmm3, tmm2 -// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda] - ttransposed tmm3, tmm2 - -// CHECK: ttdpbf16ps tmm5, tmm0, tmm4 -// CHECK: encoding: [0xc4,0xe2,0x5a,0x6c,0xe8] - ttdpbf16ps tmm5, tmm0, tmm4 - -// CHECK: ttdpbf16ps tmm3, tmm2, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x72,0x6c,0xda] - ttdpbf16ps tmm3, tmm2, tmm1 - -// CHECK: ttdpfp16ps tmm1, tmm0, tmm4 -// CHECK: encoding: [0xc4,0xe2,0x5b,0x6c,0xc8] - ttdpfp16ps tmm1, tmm0, tmm4 - -// CHECK: ttdpfp16ps tmm3, tmm2, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x73,0x6c,0xda] - ttdpfp16ps tmm3, tmm2, tmm1 - -// CHECK: ttcmmimfp16ps tmm6, tmm5, tmm4 -// CHECK: encoding: [0xc4,0xe2,0x5b,0x6b,0xf5] - ttcmmimfp16ps tmm6, tmm5, tmm4 - -// CHECK: ttcmmimfp16ps tmm3, tmm2, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x73,0x6b,0xda] - ttcmmimfp16ps tmm3, tmm2, tmm1 - -// CHECK: ttcmmrlfp16ps tmm6, tmm5, tmm4 -// CHECK: encoding: [0xc4,0xe2,0x5a,0x6b,0xf5] - ttcmmrlfp16ps tmm6, tmm5, tmm4 - -// CHECK: ttcmmrlfp16ps tmm3, tmm2, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x72,0x6b,0xda] - ttcmmrlfp16ps tmm3, tmm2, tmm1 - -// CHECK: tconjtcmmimfp16ps tmm6, tmm5, tmm4 -// CHECK: encoding: [0xc4,0xe2,0x58,0x6b,0xf5] - tconjtcmmimfp16ps tmm6, tmm5, tmm4 - -// CHECK: tconjtcmmimfp16ps tmm3, tmm2, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x70,0x6b,0xda] - tconjtcmmimfp16ps tmm3, tmm2, tmm1 - -// CHECK: tconjtfp16 tmm6, tmm5 -// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xf5] - tconjtfp16 tmm6, tmm5 - -// CHECK: tconjtfp16 tmm3, tmm2 -// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xda] - tconjtfp16 tmm3, tmm2 diff --git a/llvm/test/TableGen/x86-instr-mapping.inc b/llvm/test/TableGen/x86-instr-mapping.inc index f621979b2af95..6d2873ed4e749 100644 --- a/llvm/test/TableGen/x86-instr-mapping.inc +++ b/llvm/test/TableGen/x86-instr-mapping.inc @@ -167,14 +167,6 @@ static const X86TableEntry 
X86CompressEVEXTable[] = { { X86::SHRX64rm_EVEX, X86::SHRX64rm }, { X86::SHRX64rr_EVEX, X86::SHRX64rr }, { X86::STTILECFG_EVEX, X86::STTILECFG }, - { X86::T2RPNTLVWZ0RST1_EVEX, X86::T2RPNTLVWZ0RST1 }, - { X86::T2RPNTLVWZ0RS_EVEX, X86::T2RPNTLVWZ0RS }, - { X86::T2RPNTLVWZ0T1_EVEX, X86::T2RPNTLVWZ0T1 }, - { X86::T2RPNTLVWZ0_EVEX, X86::T2RPNTLVWZ0 }, - { X86::T2RPNTLVWZ1RST1_EVEX, X86::T2RPNTLVWZ1RST1 }, - { X86::T2RPNTLVWZ1RS_EVEX, X86::T2RPNTLVWZ1RS }, - { X86::T2RPNTLVWZ1T1_EVEX, X86::T2RPNTLVWZ1T1 }, - { X86::T2RPNTLVWZ1_EVEX, X86::T2RPNTLVWZ1 }, { X86::TILELOADDRST1_EVEX, X86::TILELOADDRST1 }, { X86::TILELOADDRS_EVEX, X86::TILELOADDRS }, { X86::TILELOADDT1_EVEX, X86::TILELOADDT1 }, diff --git a/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt b/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt index dfbac4ce0c4d3..141a56ad10903 100644 --- a/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt +++ b/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt @@ -1,33 +1,33 @@ MAX_RELATION=4 -187 7072 1 -187 6968 2 +187 7051 1 +187 6948 2 187 187 0 -187 7072 1 -187 6969 2 +187 7051 1 +187 6949 2 187 10 0 -10 7072 1 -10 7072 2 -10 7072 3 -10 6961 4 +10 7051 1 +10 7051 2 +10 7051 3 +10 6941 4 10 187 0 -187 6952 1 -187 7072 2 -187 1555 0 -1555 6882 1 -1555 6952 2 -187 7072 1 -187 6968 2 +187 6932 1 +187 7051 2 +187 1543 0 +1543 6862 1 +1543 6932 2 +187 7051 1 +187 6948 2 187 187 0 -187 7072 1 -187 6969 2 +187 7051 1 +187 6949 2 187 601 0 -601 7072 1 -601 7072 2 -601 7072 3 -601 6961 4 +601 7051 1 +601 7051 2 +601 7051 3 +601 6941 4 601 187 0 -187 6952 1 -187 7072 2 -187 1555 0 -1555 6882 1 -1555 6952 2 +187 6932 1 +187 7051 2 +187 1543 0 +1543 6862 1 +1543 6932 2 diff --git a/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt b/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt index dc436d123fd35..dbbbbc746a769 100644 --- a/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt +++ 
b/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt @@ -1,4 +1,4 @@ -7173 +7151 AAA 0 AAD 1 AADD 2 @@ -1440,5735 +1440,5713 @@ PSUBWrm 1437 PSUBWrr 1438 PSWAPDrm 1439 PSWAPDrr 1440 -PT 1441 -PTCMMIMFP 1442 -PTCMMRLFP 1443 -PTCONJTCMMIMFP 1444 -PTCONJTFP 1445 -PTCVTROWD 1446 -PTCVTROWPS 1447 -PTDPBF 1448 -PTDPBHF 1449 -PTDPBSSD 1450 -PTDPBSSDV 1451 -PTDPBSUD 1452 -PTDPBSUDV 1453 -PTDPBUSD 1454 -PTDPBUSDV 1455 -PTDPBUUD 1456 -PTDPBUUDV 1457 -PTDPFP 1458 -PTDPHBF 1459 -PTDPHF 1460 -PTESTrm 1461 -PTESTrr 1462 -PTILELOADD 1463 -PTILELOADDRS 1464 -PTILELOADDRST 1465 -PTILELOADDRSV 1466 -PTILELOADDT 1467 -PTILELOADDV 1468 -PTILEMOVROWrre 1469 -PTILEMOVROWrreV 1470 -PTILEMOVROWrri 1471 -PTILEMOVROWrriV 1472 -PTILEPAIRLOAD 1473 -PTILEPAIRSTORE 1474 -PTILESTORED 1475 -PTILESTOREDV 1476 -PTILEZERO 1477 -PTILEZEROV 1478 -PTMMULTF 1479 -PTTCMMIMFP 1480 -PTTCMMRLFP 1481 -PTTDPBF 1482 -PTTDPFP 1483 -PTTMMULTF 1484 -PTTRANSPOSED 1485 -PTTRANSPOSEDV 1486 -PTWRITE 1487 -PTWRITEm 1488 -PTWRITEr 1489 -PUNPCKHBWrm 1490 -PUNPCKHBWrr 1491 -PUNPCKHDQrm 1492 -PUNPCKHDQrr 1493 -PUNPCKHQDQrm 1494 -PUNPCKHQDQrr 1495 -PUNPCKHWDrm 1496 -PUNPCKHWDrr 1497 -PUNPCKLBWrm 1498 -PUNPCKLBWrr 1499 -PUNPCKLDQrm 1500 -PUNPCKLDQrr 1501 -PUNPCKLQDQrm 1502 -PUNPCKLQDQrr 1503 -PUNPCKLWDrm 1504 -PUNPCKLWDrr 1505 -PUSH 1506 -PUSHA 1507 -PUSHCS 1508 -PUSHDS 1509 -PUSHES 1510 -PUSHF 1511 -PUSHFS 1512 -PUSHGS 1513 -PUSHP 1514 -PUSHSS 1515 -PVALIDATE 1516 -PXORrm 1517 -PXORrr 1518 -RCL 1519 -RCPPSm 1520 -RCPPSr 1521 -RCPSSm 1522 -RCPSSm_Int 1523 -RCPSSr 1524 -RCPSSr_Int 1525 -RCR 1526 -RDFLAGS 1527 -RDFSBASE 1528 -RDGSBASE 1529 -RDMSR 1530 -RDMSRLIST 1531 -RDMSRri 1532 -RDMSRri_EVEX 1533 -RDPID 1534 -RDPKRUr 1535 -RDPMC 1536 -RDPRU 1537 -RDRAND 1538 -RDSEED 1539 -RDSSPD 1540 -RDSSPQ 1541 -RDTSC 1542 -RDTSCP 1543 -REG_SEQUENCE 1544 -REPNE_PREFIX 1545 -REP_MOVSB 1546 -REP_MOVSD 1547 -REP_MOVSQ 1548 -REP_MOVSW 1549 -REP_PREFIX 1550 -REP_STOSB 1551 -REP_STOSD 1552 -REP_STOSQ 1553 -REP_STOSW 1554 -RET 1555 
-RETI 1556 -REX 1557 -RMPADJUST 1558 -RMPQUERY 1559 -RMPUPDATE 1560 -ROL 1561 -ROR 1562 -RORX 1563 -ROUNDPDmi 1564 -ROUNDPDri 1565 -ROUNDPSmi 1566 -ROUNDPSri 1567 -ROUNDSDmi 1568 -ROUNDSDmi_Int 1569 -ROUNDSDri 1570 -ROUNDSDri_Int 1571 -ROUNDSSmi 1572 -ROUNDSSmi_Int 1573 -ROUNDSSri 1574 -ROUNDSSri_Int 1575 -RSM 1576 -RSQRTPSm 1577 -RSQRTPSr 1578 -RSQRTSSm 1579 -RSQRTSSm_Int 1580 -RSQRTSSr 1581 -RSQRTSSr_Int 1582 -RSTORSSP 1583 -SAHF 1584 -SALC 1585 -SAR 1586 -SARX 1587 -SAVEPREVSSP 1588 -SBB 1589 -SCASB 1590 -SCASL 1591 -SCASQ 1592 -SCASW 1593 -SEAMCALL 1594 -SEAMOPS 1595 -SEAMRET 1596 -SEG_ALLOCA 1597 -SEH_BeginEpilogue 1598 -SEH_EndEpilogue 1599 -SEH_EndPrologue 1600 -SEH_PushFrame 1601 -SEH_PushReg 1602 -SEH_SaveReg 1603 -SEH_SaveXMM 1604 -SEH_SetFrame 1605 -SEH_StackAlign 1606 -SEH_StackAlloc 1607 -SEH_UnwindV 1608 -SEH_UnwindVersion 1609 -SENDUIPI 1610 -SERIALIZE 1611 -SETB_C 1612 -SETCCm 1613 -SETCCm_EVEX 1614 -SETCCr 1615 -SETCCr_EVEX 1616 -SETSSBSY 1617 -SETZUCCm 1618 -SETZUCCr 1619 -SFENCE 1620 -SGDT 1621 -SHA 1622 -SHL 1623 -SHLD 1624 -SHLDROT 1625 -SHLX 1626 -SHR 1627 -SHRD 1628 -SHRDROT 1629 -SHRX 1630 -SHUFPDrmi 1631 -SHUFPDrri 1632 -SHUFPSrmi 1633 -SHUFPSrri 1634 -SIDT 1635 -SKINIT 1636 -SLDT 1637 -SLWPCB 1638 -SMSW 1639 -SQRTPDm 1640 -SQRTPDr 1641 -SQRTPSm 1642 -SQRTPSr 1643 -SQRTSDm 1644 -SQRTSDm_Int 1645 -SQRTSDr 1646 -SQRTSDr_Int 1647 -SQRTSSm 1648 -SQRTSSm_Int 1649 -SQRTSSr 1650 -SQRTSSr_Int 1651 -SQRT_F 1652 -SQRT_Fp 1653 -SS_PREFIX 1654 -STAC 1655 -STACKALLOC_W_PROBING 1656 -STACKMAP 1657 -STATEPOINT 1658 -STC 1659 -STD 1660 -STGI 1661 -STI 1662 -STMXCSR 1663 -STOSB 1664 -STOSL 1665 -STOSQ 1666 -STOSW 1667 -STR 1668 -STRm 1669 -STTILECFG 1670 -STTILECFG_EVEX 1671 -STUI 1672 -ST_F 1673 -ST_FP 1674 -ST_FPrr 1675 -ST_Fp 1676 -ST_FpP 1677 -ST_Frr 1678 -SUB 1679 -SUBPDrm 1680 -SUBPDrr 1681 -SUBPSrm 1682 -SUBPSrr 1683 -SUBREG_TO_REG 1684 -SUBR_F 1685 -SUBR_FI 1686 -SUBR_FPrST 1687 -SUBR_FST 1688 -SUBR_Fp 1689 -SUBR_FpI 1690 -SUBR_FrST 1691 -SUBSDrm 
1692 -SUBSDrm_Int 1693 -SUBSDrr 1694 -SUBSDrr_Int 1695 -SUBSSrm 1696 -SUBSSrm_Int 1697 -SUBSSrr 1698 -SUBSSrr_Int 1699 -SUB_F 1700 -SUB_FI 1701 -SUB_FPrST 1702 -SUB_FST 1703 -SUB_Fp 1704 -SUB_FpI 1705 -SUB_FrST 1706 -SWAPGS 1707 -SYSCALL 1708 -SYSENTER 1709 -SYSEXIT 1710 -SYSRET 1711 -T 1712 -TAILJMPd 1713 -TAILJMPd_CC 1714 -TAILJMPm 1715 -TAILJMPr 1716 -TCMMIMFP 1717 -TCMMRLFP 1718 -TCONJTCMMIMFP 1719 -TCONJTFP 1720 -TCRETURN_HIPE 1721 -TCRETURN_WIN 1722 -TCRETURN_WINmi 1723 -TCRETURNdi 1724 -TCRETURNdicc 1725 -TCRETURNmi 1726 -TCRETURNri 1727 -TCVTROWD 1728 -TCVTROWPS 1729 -TDCALL 1730 -TDPBF 1731 -TDPBHF 1732 -TDPBSSD 1733 -TDPBSUD 1734 -TDPBUSD 1735 -TDPBUUD 1736 -TDPFP 1737 -TDPHBF 1738 -TDPHF 1739 -TEST 1740 -TESTUI 1741 -TILELOADD 1742 -TILELOADDRS 1743 -TILELOADDRST 1744 -TILELOADDRS_EVEX 1745 -TILELOADDT 1746 -TILELOADD_EVEX 1747 -TILEMOVROWrre 1748 -TILEMOVROWrri 1749 -TILERELEASE 1750 -TILESTORED 1751 -TILESTORED_EVEX 1752 -TILEZERO 1753 -TLBSYNC 1754 -TLSCall 1755 -TLS_addr 1756 -TLS_addrX 1757 -TLS_base_addr 1758 -TLS_base_addrX 1759 -TLS_desc 1760 -TMMULTF 1761 -TPAUSE 1762 -TRAP 1763 -TST_F 1764 -TST_Fp 1765 -TTCMMIMFP 1766 -TTCMMRLFP 1767 -TTDPBF 1768 -TTDPFP 1769 -TTMMULTF 1770 -TTRANSPOSED 1771 -TZCNT 1772 -TZMSK 1773 -UBSAN_UD 1774 -UCOMISDrm 1775 -UCOMISDrm_Int 1776 -UCOMISDrr 1777 -UCOMISDrr_Int 1778 -UCOMISSrm 1779 -UCOMISSrm_Int 1780 -UCOMISSrr 1781 -UCOMISSrr_Int 1782 -UCOM_FIPr 1783 -UCOM_FIr 1784 -UCOM_FPPr 1785 -UCOM_FPr 1786 -UCOM_FpIr 1787 -UCOM_Fpr 1788 -UCOM_Fr 1789 -UD 1790 -UIRET 1791 -UMONITOR 1792 -UMWAIT 1793 -UNPCKHPDrm 1794 -UNPCKHPDrr 1795 -UNPCKHPSrm 1796 -UNPCKHPSrr 1797 -UNPCKLPDrm 1798 -UNPCKLPDrr 1799 -UNPCKLPSrm 1800 -UNPCKLPSrr 1801 -URDMSRri 1802 -URDMSRri_EVEX 1803 -URDMSRrr 1804 -URDMSRrr_EVEX 1805 -UWRMSRir 1806 -UWRMSRir_EVEX 1807 -UWRMSRrr 1808 -UWRMSRrr_EVEX 1809 -V 1810 -VAARG 1811 -VAARG_X 1812 -VADDBF 1813 -VADDPDYrm 1814 -VADDPDYrr 1815 -VADDPDZ 1816 -VADDPDZrm 1817 -VADDPDZrmb 1818 -VADDPDZrmbk 1819 
-VADDPDZrmbkz 1820 -VADDPDZrmk 1821 -VADDPDZrmkz 1822 -VADDPDZrr 1823 -VADDPDZrrb 1824 -VADDPDZrrbk 1825 -VADDPDZrrbkz 1826 -VADDPDZrrk 1827 -VADDPDZrrkz 1828 -VADDPDrm 1829 -VADDPDrr 1830 -VADDPHZ 1831 -VADDPHZrm 1832 -VADDPHZrmb 1833 -VADDPHZrmbk 1834 -VADDPHZrmbkz 1835 -VADDPHZrmk 1836 -VADDPHZrmkz 1837 -VADDPHZrr 1838 -VADDPHZrrb 1839 -VADDPHZrrbk 1840 -VADDPHZrrbkz 1841 -VADDPHZrrk 1842 -VADDPHZrrkz 1843 -VADDPSYrm 1844 -VADDPSYrr 1845 -VADDPSZ 1846 -VADDPSZrm 1847 -VADDPSZrmb 1848 -VADDPSZrmbk 1849 -VADDPSZrmbkz 1850 -VADDPSZrmk 1851 -VADDPSZrmkz 1852 -VADDPSZrr 1853 -VADDPSZrrb 1854 -VADDPSZrrbk 1855 -VADDPSZrrbkz 1856 -VADDPSZrrk 1857 -VADDPSZrrkz 1858 -VADDPSrm 1859 -VADDPSrr 1860 -VADDSDZrm 1861 -VADDSDZrm_Int 1862 -VADDSDZrmk_Int 1863 -VADDSDZrmkz_Int 1864 -VADDSDZrr 1865 -VADDSDZrr_Int 1866 -VADDSDZrrb_Int 1867 -VADDSDZrrbk_Int 1868 -VADDSDZrrbkz_Int 1869 -VADDSDZrrk_Int 1870 -VADDSDZrrkz_Int 1871 -VADDSDrm 1872 -VADDSDrm_Int 1873 -VADDSDrr 1874 -VADDSDrr_Int 1875 -VADDSHZrm 1876 -VADDSHZrm_Int 1877 -VADDSHZrmk_Int 1878 -VADDSHZrmkz_Int 1879 -VADDSHZrr 1880 -VADDSHZrr_Int 1881 -VADDSHZrrb_Int 1882 -VADDSHZrrbk_Int 1883 -VADDSHZrrbkz_Int 1884 -VADDSHZrrk_Int 1885 -VADDSHZrrkz_Int 1886 -VADDSSZrm 1887 -VADDSSZrm_Int 1888 -VADDSSZrmk_Int 1889 -VADDSSZrmkz_Int 1890 -VADDSSZrr 1891 -VADDSSZrr_Int 1892 -VADDSSZrrb_Int 1893 -VADDSSZrrbk_Int 1894 -VADDSSZrrbkz_Int 1895 -VADDSSZrrk_Int 1896 -VADDSSZrrkz_Int 1897 -VADDSSrm 1898 -VADDSSrm_Int 1899 -VADDSSrr 1900 -VADDSSrr_Int 1901 -VADDSUBPDYrm 1902 -VADDSUBPDYrr 1903 -VADDSUBPDrm 1904 -VADDSUBPDrr 1905 -VADDSUBPSYrm 1906 -VADDSUBPSYrr 1907 -VADDSUBPSrm 1908 -VADDSUBPSrr 1909 -VAESDECLASTYrm 1910 -VAESDECLASTYrr 1911 -VAESDECLASTZ 1912 -VAESDECLASTZrm 1913 -VAESDECLASTZrr 1914 -VAESDECLASTrm 1915 -VAESDECLASTrr 1916 -VAESDECYrm 1917 -VAESDECYrr 1918 -VAESDECZ 1919 -VAESDECZrm 1920 -VAESDECZrr 1921 -VAESDECrm 1922 -VAESDECrr 1923 -VAESENCLASTYrm 1924 -VAESENCLASTYrr 1925 -VAESENCLASTZ 1926 -VAESENCLASTZrm 1927 
-VAESENCLASTZrr 1928 -VAESENCLASTrm 1929 -VAESENCLASTrr 1930 -VAESENCYrm 1931 -VAESENCYrr 1932 -VAESENCZ 1933 -VAESENCZrm 1934 -VAESENCZrr 1935 -VAESENCrm 1936 -VAESENCrr 1937 -VAESIMCrm 1938 -VAESIMCrr 1939 -VAESKEYGENASSISTrmi 1940 -VAESKEYGENASSISTrri 1941 -VALIGNDZ 1942 -VALIGNDZrmbi 1943 -VALIGNDZrmbik 1944 -VALIGNDZrmbikz 1945 -VALIGNDZrmi 1946 -VALIGNDZrmik 1947 -VALIGNDZrmikz 1948 -VALIGNDZrri 1949 -VALIGNDZrrik 1950 -VALIGNDZrrikz 1951 -VALIGNQZ 1952 -VALIGNQZrmbi 1953 -VALIGNQZrmbik 1954 -VALIGNQZrmbikz 1955 -VALIGNQZrmi 1956 -VALIGNQZrmik 1957 -VALIGNQZrmikz 1958 -VALIGNQZrri 1959 -VALIGNQZrrik 1960 -VALIGNQZrrikz 1961 -VANDNPDYrm 1962 -VANDNPDYrr 1963 -VANDNPDZ 1964 -VANDNPDZrm 1965 -VANDNPDZrmb 1966 -VANDNPDZrmbk 1967 -VANDNPDZrmbkz 1968 -VANDNPDZrmk 1969 -VANDNPDZrmkz 1970 -VANDNPDZrr 1971 -VANDNPDZrrk 1972 -VANDNPDZrrkz 1973 -VANDNPDrm 1974 -VANDNPDrr 1975 -VANDNPSYrm 1976 -VANDNPSYrr 1977 -VANDNPSZ 1978 -VANDNPSZrm 1979 -VANDNPSZrmb 1980 -VANDNPSZrmbk 1981 -VANDNPSZrmbkz 1982 -VANDNPSZrmk 1983 -VANDNPSZrmkz 1984 -VANDNPSZrr 1985 -VANDNPSZrrk 1986 -VANDNPSZrrkz 1987 -VANDNPSrm 1988 -VANDNPSrr 1989 -VANDPDYrm 1990 -VANDPDYrr 1991 -VANDPDZ 1992 -VANDPDZrm 1993 -VANDPDZrmb 1994 -VANDPDZrmbk 1995 -VANDPDZrmbkz 1996 -VANDPDZrmk 1997 -VANDPDZrmkz 1998 -VANDPDZrr 1999 -VANDPDZrrk 2000 -VANDPDZrrkz 2001 -VANDPDrm 2002 -VANDPDrr 2003 -VANDPSYrm 2004 -VANDPSYrr 2005 -VANDPSZ 2006 -VANDPSZrm 2007 -VANDPSZrmb 2008 -VANDPSZrmbk 2009 -VANDPSZrmbkz 2010 -VANDPSZrmk 2011 -VANDPSZrmkz 2012 -VANDPSZrr 2013 -VANDPSZrrk 2014 -VANDPSZrrkz 2015 -VANDPSrm 2016 -VANDPSrr 2017 -VASTART_SAVE_XMM_REGS 2018 -VBCSTNEBF 2019 -VBCSTNESH 2020 -VBLENDMPDZ 2021 -VBLENDMPDZrm 2022 -VBLENDMPDZrmb 2023 -VBLENDMPDZrmbk 2024 -VBLENDMPDZrmbkz 2025 -VBLENDMPDZrmk 2026 -VBLENDMPDZrmkz 2027 -VBLENDMPDZrr 2028 -VBLENDMPDZrrk 2029 -VBLENDMPDZrrkz 2030 -VBLENDMPSZ 2031 -VBLENDMPSZrm 2032 -VBLENDMPSZrmb 2033 -VBLENDMPSZrmbk 2034 -VBLENDMPSZrmbkz 2035 -VBLENDMPSZrmk 2036 -VBLENDMPSZrmkz 2037 
-VBLENDMPSZrr 2038 -VBLENDMPSZrrk 2039 -VBLENDMPSZrrkz 2040 -VBLENDPDYrmi 2041 -VBLENDPDYrri 2042 -VBLENDPDrmi 2043 -VBLENDPDrri 2044 -VBLENDPSYrmi 2045 -VBLENDPSYrri 2046 -VBLENDPSrmi 2047 -VBLENDPSrri 2048 -VBLENDVPDYrmr 2049 -VBLENDVPDYrrr 2050 -VBLENDVPDrmr 2051 -VBLENDVPDrrr 2052 -VBLENDVPSYrmr 2053 -VBLENDVPSYrrr 2054 -VBLENDVPSrmr 2055 -VBLENDVPSrrr 2056 -VBROADCASTF 2057 -VBROADCASTI 2058 -VBROADCASTSDYrm 2059 -VBROADCASTSDYrr 2060 -VBROADCASTSDZ 2061 -VBROADCASTSDZrm 2062 -VBROADCASTSDZrmk 2063 -VBROADCASTSDZrmkz 2064 -VBROADCASTSDZrr 2065 -VBROADCASTSDZrrk 2066 -VBROADCASTSDZrrkz 2067 -VBROADCASTSSYrm 2068 -VBROADCASTSSYrr 2069 -VBROADCASTSSZ 2070 -VBROADCASTSSZrm 2071 -VBROADCASTSSZrmk 2072 -VBROADCASTSSZrmkz 2073 -VBROADCASTSSZrr 2074 -VBROADCASTSSZrrk 2075 -VBROADCASTSSZrrkz 2076 -VBROADCASTSSrm 2077 -VBROADCASTSSrr 2078 -VCMPBF 2079 -VCMPPDYrmi 2080 -VCMPPDYrri 2081 -VCMPPDZ 2082 -VCMPPDZrmbi 2083 -VCMPPDZrmbik 2084 -VCMPPDZrmi 2085 -VCMPPDZrmik 2086 -VCMPPDZrri 2087 -VCMPPDZrrib 2088 -VCMPPDZrribk 2089 -VCMPPDZrrik 2090 -VCMPPDrmi 2091 -VCMPPDrri 2092 -VCMPPHZ 2093 -VCMPPHZrmbi 2094 -VCMPPHZrmbik 2095 -VCMPPHZrmi 2096 -VCMPPHZrmik 2097 -VCMPPHZrri 2098 -VCMPPHZrrib 2099 -VCMPPHZrribk 2100 -VCMPPHZrrik 2101 -VCMPPSYrmi 2102 -VCMPPSYrri 2103 -VCMPPSZ 2104 -VCMPPSZrmbi 2105 -VCMPPSZrmbik 2106 -VCMPPSZrmi 2107 -VCMPPSZrmik 2108 -VCMPPSZrri 2109 -VCMPPSZrrib 2110 -VCMPPSZrribk 2111 -VCMPPSZrrik 2112 -VCMPPSrmi 2113 -VCMPPSrri 2114 -VCMPSDZrmi 2115 -VCMPSDZrmi_Int 2116 -VCMPSDZrmik_Int 2117 -VCMPSDZrri 2118 -VCMPSDZrri_Int 2119 -VCMPSDZrrib_Int 2120 -VCMPSDZrribk_Int 2121 -VCMPSDZrrik_Int 2122 -VCMPSDrmi 2123 -VCMPSDrmi_Int 2124 -VCMPSDrri 2125 -VCMPSDrri_Int 2126 -VCMPSHZrmi 2127 -VCMPSHZrmi_Int 2128 -VCMPSHZrmik_Int 2129 -VCMPSHZrri 2130 -VCMPSHZrri_Int 2131 -VCMPSHZrrib_Int 2132 -VCMPSHZrribk_Int 2133 -VCMPSHZrrik_Int 2134 -VCMPSSZrmi 2135 -VCMPSSZrmi_Int 2136 -VCMPSSZrmik_Int 2137 -VCMPSSZrri 2138 -VCMPSSZrri_Int 2139 -VCMPSSZrrib_Int 2140 
-VCMPSSZrribk_Int 2141 -VCMPSSZrrik_Int 2142 -VCMPSSrmi 2143 -VCMPSSrmi_Int 2144 -VCMPSSrri 2145 -VCMPSSrri_Int 2146 -VCOMISBF 2147 -VCOMISDZrm 2148 -VCOMISDZrm_Int 2149 -VCOMISDZrr 2150 -VCOMISDZrr_Int 2151 -VCOMISDZrrb 2152 -VCOMISDrm 2153 -VCOMISDrm_Int 2154 -VCOMISDrr 2155 -VCOMISDrr_Int 2156 -VCOMISHZrm 2157 -VCOMISHZrm_Int 2158 -VCOMISHZrr 2159 -VCOMISHZrr_Int 2160 -VCOMISHZrrb 2161 -VCOMISSZrm 2162 -VCOMISSZrm_Int 2163 -VCOMISSZrr 2164 -VCOMISSZrr_Int 2165 -VCOMISSZrrb 2166 -VCOMISSrm 2167 -VCOMISSrm_Int 2168 -VCOMISSrr 2169 -VCOMISSrr_Int 2170 -VCOMPRESSPDZ 2171 -VCOMPRESSPDZmr 2172 -VCOMPRESSPDZmrk 2173 -VCOMPRESSPDZrr 2174 -VCOMPRESSPDZrrk 2175 -VCOMPRESSPDZrrkz 2176 -VCOMPRESSPSZ 2177 -VCOMPRESSPSZmr 2178 -VCOMPRESSPSZmrk 2179 -VCOMPRESSPSZrr 2180 -VCOMPRESSPSZrrk 2181 -VCOMPRESSPSZrrkz 2182 -VCOMXSDZrm_Int 2183 -VCOMXSDZrr_Int 2184 -VCOMXSDZrrb_Int 2185 -VCOMXSHZrm_Int 2186 -VCOMXSHZrr_Int 2187 -VCOMXSHZrrb_Int 2188 -VCOMXSSZrm_Int 2189 -VCOMXSSZrr_Int 2190 -VCOMXSSZrrb_Int 2191 -VCVT 2192 -VCVTBF 2193 -VCVTBIASPH 2194 -VCVTDQ 2195 -VCVTHF 2196 -VCVTNE 2197 -VCVTNEEBF 2198 -VCVTNEEPH 2199 -VCVTNEOBF 2200 -VCVTNEOPH 2201 -VCVTNEPS 2202 -VCVTPD 2203 -VCVTPH 2204 -VCVTPS 2205 -VCVTQQ 2206 -VCVTSD 2207 -VCVTSH 2208 -VCVTSI 2209 -VCVTSS 2210 -VCVTTBF 2211 -VCVTTPD 2212 -VCVTTPH 2213 -VCVTTPS 2214 -VCVTTSD 2215 -VCVTTSH 2216 -VCVTTSS 2217 -VCVTUDQ 2218 -VCVTUQQ 2219 -VCVTUSI 2220 -VCVTUW 2221 -VCVTW 2222 -VDBPSADBWZ 2223 -VDBPSADBWZrmi 2224 -VDBPSADBWZrmik 2225 -VDBPSADBWZrmikz 2226 -VDBPSADBWZrri 2227 -VDBPSADBWZrrik 2228 -VDBPSADBWZrrikz 2229 -VDIVBF 2230 -VDIVPDYrm 2231 -VDIVPDYrr 2232 -VDIVPDZ 2233 -VDIVPDZrm 2234 -VDIVPDZrmb 2235 -VDIVPDZrmbk 2236 -VDIVPDZrmbkz 2237 -VDIVPDZrmk 2238 -VDIVPDZrmkz 2239 -VDIVPDZrr 2240 -VDIVPDZrrb 2241 -VDIVPDZrrbk 2242 -VDIVPDZrrbkz 2243 -VDIVPDZrrk 2244 -VDIVPDZrrkz 2245 -VDIVPDrm 2246 -VDIVPDrr 2247 -VDIVPHZ 2248 -VDIVPHZrm 2249 -VDIVPHZrmb 2250 -VDIVPHZrmbk 2251 -VDIVPHZrmbkz 2252 -VDIVPHZrmk 2253 -VDIVPHZrmkz 2254 
-VDIVPHZrr 2255 -VDIVPHZrrb 2256 -VDIVPHZrrbk 2257 -VDIVPHZrrbkz 2258 -VDIVPHZrrk 2259 -VDIVPHZrrkz 2260 -VDIVPSYrm 2261 -VDIVPSYrr 2262 -VDIVPSZ 2263 -VDIVPSZrm 2264 -VDIVPSZrmb 2265 -VDIVPSZrmbk 2266 -VDIVPSZrmbkz 2267 -VDIVPSZrmk 2268 -VDIVPSZrmkz 2269 -VDIVPSZrr 2270 -VDIVPSZrrb 2271 -VDIVPSZrrbk 2272 -VDIVPSZrrbkz 2273 -VDIVPSZrrk 2274 -VDIVPSZrrkz 2275 -VDIVPSrm 2276 -VDIVPSrr 2277 -VDIVSDZrm 2278 -VDIVSDZrm_Int 2279 -VDIVSDZrmk_Int 2280 -VDIVSDZrmkz_Int 2281 -VDIVSDZrr 2282 -VDIVSDZrr_Int 2283 -VDIVSDZrrb_Int 2284 -VDIVSDZrrbk_Int 2285 -VDIVSDZrrbkz_Int 2286 -VDIVSDZrrk_Int 2287 -VDIVSDZrrkz_Int 2288 -VDIVSDrm 2289 -VDIVSDrm_Int 2290 -VDIVSDrr 2291 -VDIVSDrr_Int 2292 -VDIVSHZrm 2293 -VDIVSHZrm_Int 2294 -VDIVSHZrmk_Int 2295 -VDIVSHZrmkz_Int 2296 -VDIVSHZrr 2297 -VDIVSHZrr_Int 2298 -VDIVSHZrrb_Int 2299 -VDIVSHZrrbk_Int 2300 -VDIVSHZrrbkz_Int 2301 -VDIVSHZrrk_Int 2302 -VDIVSHZrrkz_Int 2303 -VDIVSSZrm 2304 -VDIVSSZrm_Int 2305 -VDIVSSZrmk_Int 2306 -VDIVSSZrmkz_Int 2307 -VDIVSSZrr 2308 -VDIVSSZrr_Int 2309 -VDIVSSZrrb_Int 2310 -VDIVSSZrrbk_Int 2311 -VDIVSSZrrbkz_Int 2312 -VDIVSSZrrk_Int 2313 -VDIVSSZrrkz_Int 2314 -VDIVSSrm 2315 -VDIVSSrm_Int 2316 -VDIVSSrr 2317 -VDIVSSrr_Int 2318 -VDPBF 2319 -VDPPDrmi 2320 -VDPPDrri 2321 -VDPPHPSZ 2322 -VDPPHPSZm 2323 -VDPPHPSZmb 2324 -VDPPHPSZmbk 2325 -VDPPHPSZmbkz 2326 -VDPPHPSZmk 2327 -VDPPHPSZmkz 2328 -VDPPHPSZr 2329 -VDPPHPSZrk 2330 -VDPPHPSZrkz 2331 -VDPPSYrmi 2332 -VDPPSYrri 2333 -VDPPSrmi 2334 -VDPPSrri 2335 -VERRm 2336 -VERRr 2337 -VERWm 2338 -VERWr 2339 -VEXP 2340 -VEXPANDPDZ 2341 -VEXPANDPDZrm 2342 -VEXPANDPDZrmk 2343 -VEXPANDPDZrmkz 2344 -VEXPANDPDZrr 2345 -VEXPANDPDZrrk 2346 -VEXPANDPDZrrkz 2347 -VEXPANDPSZ 2348 -VEXPANDPSZrm 2349 -VEXPANDPSZrmk 2350 -VEXPANDPSZrmkz 2351 -VEXPANDPSZrr 2352 -VEXPANDPSZrrk 2353 -VEXPANDPSZrrkz 2354 -VEXTRACTF 2355 -VEXTRACTI 2356 -VEXTRACTPSZmri 2357 -VEXTRACTPSZrri 2358 -VEXTRACTPSmri 2359 -VEXTRACTPSrri 2360 -VFCMADDCPHZ 2361 -VFCMADDCPHZm 2362 -VFCMADDCPHZmb 2363 -VFCMADDCPHZmbk 2364 
-VFCMADDCPHZmbkz 2365 -VFCMADDCPHZmk 2366 -VFCMADDCPHZmkz 2367 -VFCMADDCPHZr 2368 -VFCMADDCPHZrb 2369 -VFCMADDCPHZrbk 2370 -VFCMADDCPHZrbkz 2371 -VFCMADDCPHZrk 2372 -VFCMADDCPHZrkz 2373 -VFCMADDCSHZm 2374 -VFCMADDCSHZmk 2375 -VFCMADDCSHZmkz 2376 -VFCMADDCSHZr 2377 -VFCMADDCSHZrb 2378 -VFCMADDCSHZrbk 2379 -VFCMADDCSHZrbkz 2380 -VFCMADDCSHZrk 2381 -VFCMADDCSHZrkz 2382 -VFCMULCPHZ 2383 -VFCMULCPHZrm 2384 -VFCMULCPHZrmb 2385 -VFCMULCPHZrmbk 2386 -VFCMULCPHZrmbkz 2387 -VFCMULCPHZrmk 2388 -VFCMULCPHZrmkz 2389 -VFCMULCPHZrr 2390 -VFCMULCPHZrrb 2391 -VFCMULCPHZrrbk 2392 -VFCMULCPHZrrbkz 2393 -VFCMULCPHZrrk 2394 -VFCMULCPHZrrkz 2395 -VFCMULCSHZrm 2396 -VFCMULCSHZrmk 2397 -VFCMULCSHZrmkz 2398 -VFCMULCSHZrr 2399 -VFCMULCSHZrrb 2400 -VFCMULCSHZrrbk 2401 -VFCMULCSHZrrbkz 2402 -VFCMULCSHZrrk 2403 -VFCMULCSHZrrkz 2404 -VFIXUPIMMPDZ 2405 -VFIXUPIMMPDZrmbi 2406 -VFIXUPIMMPDZrmbik 2407 -VFIXUPIMMPDZrmbikz 2408 -VFIXUPIMMPDZrmi 2409 -VFIXUPIMMPDZrmik 2410 -VFIXUPIMMPDZrmikz 2411 -VFIXUPIMMPDZrri 2412 -VFIXUPIMMPDZrrib 2413 -VFIXUPIMMPDZrribk 2414 -VFIXUPIMMPDZrribkz 2415 -VFIXUPIMMPDZrrik 2416 -VFIXUPIMMPDZrrikz 2417 -VFIXUPIMMPSZ 2418 -VFIXUPIMMPSZrmbi 2419 -VFIXUPIMMPSZrmbik 2420 -VFIXUPIMMPSZrmbikz 2421 -VFIXUPIMMPSZrmi 2422 -VFIXUPIMMPSZrmik 2423 -VFIXUPIMMPSZrmikz 2424 -VFIXUPIMMPSZrri 2425 -VFIXUPIMMPSZrrib 2426 -VFIXUPIMMPSZrribk 2427 -VFIXUPIMMPSZrribkz 2428 -VFIXUPIMMPSZrrik 2429 -VFIXUPIMMPSZrrikz 2430 -VFIXUPIMMSDZrmi 2431 -VFIXUPIMMSDZrmik 2432 -VFIXUPIMMSDZrmikz 2433 -VFIXUPIMMSDZrri 2434 -VFIXUPIMMSDZrrib 2435 -VFIXUPIMMSDZrribk 2436 -VFIXUPIMMSDZrribkz 2437 -VFIXUPIMMSDZrrik 2438 -VFIXUPIMMSDZrrikz 2439 -VFIXUPIMMSSZrmi 2440 -VFIXUPIMMSSZrmik 2441 -VFIXUPIMMSSZrmikz 2442 -VFIXUPIMMSSZrri 2443 -VFIXUPIMMSSZrrib 2444 -VFIXUPIMMSSZrribk 2445 -VFIXUPIMMSSZrribkz 2446 -VFIXUPIMMSSZrrik 2447 -VFIXUPIMMSSZrrikz 2448 -VFMADD 2449 -VFMADDCPHZ 2450 -VFMADDCPHZm 2451 -VFMADDCPHZmb 2452 -VFMADDCPHZmbk 2453 -VFMADDCPHZmbkz 2454 -VFMADDCPHZmk 2455 -VFMADDCPHZmkz 2456 -VFMADDCPHZr 
2457 -VFMADDCPHZrb 2458 -VFMADDCPHZrbk 2459 -VFMADDCPHZrbkz 2460 -VFMADDCPHZrk 2461 -VFMADDCPHZrkz 2462 -VFMADDCSHZm 2463 -VFMADDCSHZmk 2464 -VFMADDCSHZmkz 2465 -VFMADDCSHZr 2466 -VFMADDCSHZrb 2467 -VFMADDCSHZrbk 2468 -VFMADDCSHZrbkz 2469 -VFMADDCSHZrk 2470 -VFMADDCSHZrkz 2471 -VFMADDPD 2472 -VFMADDPS 2473 -VFMADDSD 2474 -VFMADDSS 2475 -VFMADDSUB 2476 -VFMADDSUBPD 2477 -VFMADDSUBPS 2478 -VFMSUB 2479 -VFMSUBADD 2480 -VFMSUBADDPD 2481 -VFMSUBADDPS 2482 -VFMSUBPD 2483 -VFMSUBPS 2484 -VFMSUBSD 2485 -VFMSUBSS 2486 -VFMULCPHZ 2487 -VFMULCPHZrm 2488 -VFMULCPHZrmb 2489 -VFMULCPHZrmbk 2490 -VFMULCPHZrmbkz 2491 -VFMULCPHZrmk 2492 -VFMULCPHZrmkz 2493 -VFMULCPHZrr 2494 -VFMULCPHZrrb 2495 -VFMULCPHZrrbk 2496 -VFMULCPHZrrbkz 2497 -VFMULCPHZrrk 2498 -VFMULCPHZrrkz 2499 -VFMULCSHZrm 2500 -VFMULCSHZrmk 2501 -VFMULCSHZrmkz 2502 -VFMULCSHZrr 2503 -VFMULCSHZrrb 2504 -VFMULCSHZrrbk 2505 -VFMULCSHZrrbkz 2506 -VFMULCSHZrrk 2507 -VFMULCSHZrrkz 2508 -VFNMADD 2509 -VFNMADDPD 2510 -VFNMADDPS 2511 -VFNMADDSD 2512 -VFNMADDSS 2513 -VFNMSUB 2514 -VFNMSUBPD 2515 -VFNMSUBPS 2516 -VFNMSUBSD 2517 -VFNMSUBSS 2518 -VFPCLASSBF 2519 -VFPCLASSPDZ 2520 -VFPCLASSPDZmbi 2521 -VFPCLASSPDZmbik 2522 -VFPCLASSPDZmi 2523 -VFPCLASSPDZmik 2524 -VFPCLASSPDZri 2525 -VFPCLASSPDZrik 2526 -VFPCLASSPHZ 2527 -VFPCLASSPHZmbi 2528 -VFPCLASSPHZmbik 2529 -VFPCLASSPHZmi 2530 -VFPCLASSPHZmik 2531 -VFPCLASSPHZri 2532 -VFPCLASSPHZrik 2533 -VFPCLASSPSZ 2534 -VFPCLASSPSZmbi 2535 -VFPCLASSPSZmbik 2536 -VFPCLASSPSZmi 2537 -VFPCLASSPSZmik 2538 -VFPCLASSPSZri 2539 -VFPCLASSPSZrik 2540 -VFPCLASSSDZmi 2541 -VFPCLASSSDZmik 2542 -VFPCLASSSDZri 2543 -VFPCLASSSDZrik 2544 -VFPCLASSSHZmi 2545 -VFPCLASSSHZmik 2546 -VFPCLASSSHZri 2547 -VFPCLASSSHZrik 2548 -VFPCLASSSSZmi 2549 -VFPCLASSSSZmik 2550 -VFPCLASSSSZri 2551 -VFPCLASSSSZrik 2552 -VFRCZPDYrm 2553 -VFRCZPDYrr 2554 -VFRCZPDrm 2555 -VFRCZPDrr 2556 -VFRCZPSYrm 2557 -VFRCZPSYrr 2558 -VFRCZPSrm 2559 -VFRCZPSrr 2560 -VFRCZSDrm 2561 -VFRCZSDrr 2562 -VFRCZSSrm 2563 -VFRCZSSrr 2564 -VGATHERDPDYrm 
2565 -VGATHERDPDZ 2566 -VGATHERDPDZrm 2567 -VGATHERDPDrm 2568 -VGATHERDPSYrm 2569 -VGATHERDPSZ 2570 -VGATHERDPSZrm 2571 -VGATHERDPSrm 2572 -VGATHERPF 2573 -VGATHERQPDYrm 2574 -VGATHERQPDZ 2575 -VGATHERQPDZrm 2576 -VGATHERQPDrm 2577 -VGATHERQPSYrm 2578 -VGATHERQPSZ 2579 -VGATHERQPSZrm 2580 -VGATHERQPSrm 2581 -VGETEXPBF 2582 -VGETEXPPDZ 2583 -VGETEXPPDZm 2584 -VGETEXPPDZmb 2585 -VGETEXPPDZmbk 2586 -VGETEXPPDZmbkz 2587 -VGETEXPPDZmk 2588 -VGETEXPPDZmkz 2589 -VGETEXPPDZr 2590 -VGETEXPPDZrb 2591 -VGETEXPPDZrbk 2592 -VGETEXPPDZrbkz 2593 -VGETEXPPDZrk 2594 -VGETEXPPDZrkz 2595 -VGETEXPPHZ 2596 -VGETEXPPHZm 2597 -VGETEXPPHZmb 2598 -VGETEXPPHZmbk 2599 -VGETEXPPHZmbkz 2600 -VGETEXPPHZmk 2601 -VGETEXPPHZmkz 2602 -VGETEXPPHZr 2603 -VGETEXPPHZrb 2604 -VGETEXPPHZrbk 2605 -VGETEXPPHZrbkz 2606 -VGETEXPPHZrk 2607 -VGETEXPPHZrkz 2608 -VGETEXPPSZ 2609 -VGETEXPPSZm 2610 -VGETEXPPSZmb 2611 -VGETEXPPSZmbk 2612 -VGETEXPPSZmbkz 2613 -VGETEXPPSZmk 2614 -VGETEXPPSZmkz 2615 -VGETEXPPSZr 2616 -VGETEXPPSZrb 2617 -VGETEXPPSZrbk 2618 -VGETEXPPSZrbkz 2619 -VGETEXPPSZrk 2620 -VGETEXPPSZrkz 2621 -VGETEXPSDZm 2622 -VGETEXPSDZmk 2623 -VGETEXPSDZmkz 2624 -VGETEXPSDZr 2625 -VGETEXPSDZrb 2626 -VGETEXPSDZrbk 2627 -VGETEXPSDZrbkz 2628 -VGETEXPSDZrk 2629 -VGETEXPSDZrkz 2630 -VGETEXPSHZm 2631 -VGETEXPSHZmk 2632 -VGETEXPSHZmkz 2633 -VGETEXPSHZr 2634 -VGETEXPSHZrb 2635 -VGETEXPSHZrbk 2636 -VGETEXPSHZrbkz 2637 -VGETEXPSHZrk 2638 -VGETEXPSHZrkz 2639 -VGETEXPSSZm 2640 -VGETEXPSSZmk 2641 -VGETEXPSSZmkz 2642 -VGETEXPSSZr 2643 -VGETEXPSSZrb 2644 -VGETEXPSSZrbk 2645 -VGETEXPSSZrbkz 2646 -VGETEXPSSZrk 2647 -VGETEXPSSZrkz 2648 -VGETMANTBF 2649 -VGETMANTPDZ 2650 -VGETMANTPDZrmbi 2651 -VGETMANTPDZrmbik 2652 -VGETMANTPDZrmbikz 2653 -VGETMANTPDZrmi 2654 -VGETMANTPDZrmik 2655 -VGETMANTPDZrmikz 2656 -VGETMANTPDZrri 2657 -VGETMANTPDZrrib 2658 -VGETMANTPDZrribk 2659 -VGETMANTPDZrribkz 2660 -VGETMANTPDZrrik 2661 -VGETMANTPDZrrikz 2662 -VGETMANTPHZ 2663 -VGETMANTPHZrmbi 2664 -VGETMANTPHZrmbik 2665 -VGETMANTPHZrmbikz 2666 
-VGETMANTPHZrmi 2667 -VGETMANTPHZrmik 2668 -VGETMANTPHZrmikz 2669 -VGETMANTPHZrri 2670 -VGETMANTPHZrrib 2671 -VGETMANTPHZrribk 2672 -VGETMANTPHZrribkz 2673 -VGETMANTPHZrrik 2674 -VGETMANTPHZrrikz 2675 -VGETMANTPSZ 2676 -VGETMANTPSZrmbi 2677 -VGETMANTPSZrmbik 2678 -VGETMANTPSZrmbikz 2679 -VGETMANTPSZrmi 2680 -VGETMANTPSZrmik 2681 -VGETMANTPSZrmikz 2682 -VGETMANTPSZrri 2683 -VGETMANTPSZrrib 2684 -VGETMANTPSZrribk 2685 -VGETMANTPSZrribkz 2686 -VGETMANTPSZrrik 2687 -VGETMANTPSZrrikz 2688 -VGETMANTSDZrmi 2689 -VGETMANTSDZrmik 2690 -VGETMANTSDZrmikz 2691 -VGETMANTSDZrri 2692 -VGETMANTSDZrrib 2693 -VGETMANTSDZrribk 2694 -VGETMANTSDZrribkz 2695 -VGETMANTSDZrrik 2696 -VGETMANTSDZrrikz 2697 -VGETMANTSHZrmi 2698 -VGETMANTSHZrmik 2699 -VGETMANTSHZrmikz 2700 -VGETMANTSHZrri 2701 -VGETMANTSHZrrib 2702 -VGETMANTSHZrribk 2703 -VGETMANTSHZrribkz 2704 -VGETMANTSHZrrik 2705 -VGETMANTSHZrrikz 2706 -VGETMANTSSZrmi 2707 -VGETMANTSSZrmik 2708 -VGETMANTSSZrmikz 2709 -VGETMANTSSZrri 2710 -VGETMANTSSZrrib 2711 -VGETMANTSSZrribk 2712 -VGETMANTSSZrribkz 2713 -VGETMANTSSZrrik 2714 -VGETMANTSSZrrikz 2715 -VGF 2716 -VHADDPDYrm 2717 -VHADDPDYrr 2718 -VHADDPDrm 2719 -VHADDPDrr 2720 -VHADDPSYrm 2721 -VHADDPSYrr 2722 -VHADDPSrm 2723 -VHADDPSrr 2724 -VHSUBPDYrm 2725 -VHSUBPDYrr 2726 -VHSUBPDrm 2727 -VHSUBPDrr 2728 -VHSUBPSYrm 2729 -VHSUBPSYrr 2730 -VHSUBPSrm 2731 -VHSUBPSrr 2732 -VINSERTF 2733 -VINSERTI 2734 -VINSERTPSZrmi 2735 -VINSERTPSZrri 2736 -VINSERTPSrmi 2737 -VINSERTPSrri 2738 -VLDDQUYrm 2739 -VLDDQUrm 2740 -VLDMXCSR 2741 -VMASKMOVDQU 2742 -VMASKMOVPDYmr 2743 -VMASKMOVPDYrm 2744 -VMASKMOVPDmr 2745 -VMASKMOVPDrm 2746 -VMASKMOVPSYmr 2747 -VMASKMOVPSYrm 2748 -VMASKMOVPSmr 2749 -VMASKMOVPSrm 2750 -VMAXBF 2751 -VMAXCPDYrm 2752 -VMAXCPDYrr 2753 -VMAXCPDZ 2754 -VMAXCPDZrm 2755 -VMAXCPDZrmb 2756 -VMAXCPDZrmbk 2757 -VMAXCPDZrmbkz 2758 -VMAXCPDZrmk 2759 -VMAXCPDZrmkz 2760 -VMAXCPDZrr 2761 -VMAXCPDZrrk 2762 -VMAXCPDZrrkz 2763 -VMAXCPDrm 2764 -VMAXCPDrr 2765 -VMAXCPHZ 2766 -VMAXCPHZrm 2767 -VMAXCPHZrmb 
2768 -VMAXCPHZrmbk 2769 -VMAXCPHZrmbkz 2770 -VMAXCPHZrmk 2771 -VMAXCPHZrmkz 2772 -VMAXCPHZrr 2773 -VMAXCPHZrrk 2774 -VMAXCPHZrrkz 2775 -VMAXCPSYrm 2776 -VMAXCPSYrr 2777 -VMAXCPSZ 2778 -VMAXCPSZrm 2779 -VMAXCPSZrmb 2780 -VMAXCPSZrmbk 2781 -VMAXCPSZrmbkz 2782 -VMAXCPSZrmk 2783 -VMAXCPSZrmkz 2784 -VMAXCPSZrr 2785 -VMAXCPSZrrk 2786 -VMAXCPSZrrkz 2787 -VMAXCPSrm 2788 -VMAXCPSrr 2789 -VMAXCSDZrm 2790 -VMAXCSDZrr 2791 -VMAXCSDrm 2792 -VMAXCSDrr 2793 -VMAXCSHZrm 2794 -VMAXCSHZrr 2795 -VMAXCSSZrm 2796 -VMAXCSSZrr 2797 -VMAXCSSrm 2798 -VMAXCSSrr 2799 -VMAXPDYrm 2800 -VMAXPDYrr 2801 -VMAXPDZ 2802 -VMAXPDZrm 2803 -VMAXPDZrmb 2804 -VMAXPDZrmbk 2805 -VMAXPDZrmbkz 2806 -VMAXPDZrmk 2807 -VMAXPDZrmkz 2808 -VMAXPDZrr 2809 -VMAXPDZrrb 2810 -VMAXPDZrrbk 2811 -VMAXPDZrrbkz 2812 -VMAXPDZrrk 2813 -VMAXPDZrrkz 2814 -VMAXPDrm 2815 -VMAXPDrr 2816 -VMAXPHZ 2817 -VMAXPHZrm 2818 -VMAXPHZrmb 2819 -VMAXPHZrmbk 2820 -VMAXPHZrmbkz 2821 -VMAXPHZrmk 2822 -VMAXPHZrmkz 2823 -VMAXPHZrr 2824 -VMAXPHZrrb 2825 -VMAXPHZrrbk 2826 -VMAXPHZrrbkz 2827 -VMAXPHZrrk 2828 -VMAXPHZrrkz 2829 -VMAXPSYrm 2830 -VMAXPSYrr 2831 -VMAXPSZ 2832 -VMAXPSZrm 2833 -VMAXPSZrmb 2834 -VMAXPSZrmbk 2835 -VMAXPSZrmbkz 2836 -VMAXPSZrmk 2837 -VMAXPSZrmkz 2838 -VMAXPSZrr 2839 -VMAXPSZrrb 2840 -VMAXPSZrrbk 2841 -VMAXPSZrrbkz 2842 -VMAXPSZrrk 2843 -VMAXPSZrrkz 2844 -VMAXPSrm 2845 -VMAXPSrr 2846 -VMAXSDZrm 2847 -VMAXSDZrm_Int 2848 -VMAXSDZrmk_Int 2849 -VMAXSDZrmkz_Int 2850 -VMAXSDZrr 2851 -VMAXSDZrr_Int 2852 -VMAXSDZrrb_Int 2853 -VMAXSDZrrbk_Int 2854 -VMAXSDZrrbkz_Int 2855 -VMAXSDZrrk_Int 2856 -VMAXSDZrrkz_Int 2857 -VMAXSDrm 2858 -VMAXSDrm_Int 2859 -VMAXSDrr 2860 -VMAXSDrr_Int 2861 -VMAXSHZrm 2862 -VMAXSHZrm_Int 2863 -VMAXSHZrmk_Int 2864 -VMAXSHZrmkz_Int 2865 -VMAXSHZrr 2866 -VMAXSHZrr_Int 2867 -VMAXSHZrrb_Int 2868 -VMAXSHZrrbk_Int 2869 -VMAXSHZrrbkz_Int 2870 -VMAXSHZrrk_Int 2871 -VMAXSHZrrkz_Int 2872 -VMAXSSZrm 2873 -VMAXSSZrm_Int 2874 -VMAXSSZrmk_Int 2875 -VMAXSSZrmkz_Int 2876 -VMAXSSZrr 2877 -VMAXSSZrr_Int 2878 -VMAXSSZrrb_Int 2879 
-VMAXSSZrrbk_Int 2880 -VMAXSSZrrbkz_Int 2881 -VMAXSSZrrk_Int 2882 -VMAXSSZrrkz_Int 2883 -VMAXSSrm 2884 -VMAXSSrm_Int 2885 -VMAXSSrr 2886 -VMAXSSrr_Int 2887 -VMCALL 2888 -VMCLEARm 2889 -VMFUNC 2890 -VMINBF 2891 -VMINCPDYrm 2892 -VMINCPDYrr 2893 -VMINCPDZ 2894 -VMINCPDZrm 2895 -VMINCPDZrmb 2896 -VMINCPDZrmbk 2897 -VMINCPDZrmbkz 2898 -VMINCPDZrmk 2899 -VMINCPDZrmkz 2900 -VMINCPDZrr 2901 -VMINCPDZrrk 2902 -VMINCPDZrrkz 2903 -VMINCPDrm 2904 -VMINCPDrr 2905 -VMINCPHZ 2906 -VMINCPHZrm 2907 -VMINCPHZrmb 2908 -VMINCPHZrmbk 2909 -VMINCPHZrmbkz 2910 -VMINCPHZrmk 2911 -VMINCPHZrmkz 2912 -VMINCPHZrr 2913 -VMINCPHZrrk 2914 -VMINCPHZrrkz 2915 -VMINCPSYrm 2916 -VMINCPSYrr 2917 -VMINCPSZ 2918 -VMINCPSZrm 2919 -VMINCPSZrmb 2920 -VMINCPSZrmbk 2921 -VMINCPSZrmbkz 2922 -VMINCPSZrmk 2923 -VMINCPSZrmkz 2924 -VMINCPSZrr 2925 -VMINCPSZrrk 2926 -VMINCPSZrrkz 2927 -VMINCPSrm 2928 -VMINCPSrr 2929 -VMINCSDZrm 2930 -VMINCSDZrr 2931 -VMINCSDrm 2932 -VMINCSDrr 2933 -VMINCSHZrm 2934 -VMINCSHZrr 2935 -VMINCSSZrm 2936 -VMINCSSZrr 2937 -VMINCSSrm 2938 -VMINCSSrr 2939 -VMINMAXBF 2940 -VMINMAXPDZ 2941 -VMINMAXPDZrmbi 2942 -VMINMAXPDZrmbik 2943 -VMINMAXPDZrmbikz 2944 -VMINMAXPDZrmi 2945 -VMINMAXPDZrmik 2946 -VMINMAXPDZrmikz 2947 -VMINMAXPDZrri 2948 -VMINMAXPDZrrib 2949 -VMINMAXPDZrribk 2950 -VMINMAXPDZrribkz 2951 -VMINMAXPDZrrik 2952 -VMINMAXPDZrrikz 2953 -VMINMAXPHZ 2954 -VMINMAXPHZrmbi 2955 -VMINMAXPHZrmbik 2956 -VMINMAXPHZrmbikz 2957 -VMINMAXPHZrmi 2958 -VMINMAXPHZrmik 2959 -VMINMAXPHZrmikz 2960 -VMINMAXPHZrri 2961 -VMINMAXPHZrrib 2962 -VMINMAXPHZrribk 2963 -VMINMAXPHZrribkz 2964 -VMINMAXPHZrrik 2965 -VMINMAXPHZrrikz 2966 -VMINMAXPSZ 2967 -VMINMAXPSZrmbi 2968 -VMINMAXPSZrmbik 2969 -VMINMAXPSZrmbikz 2970 -VMINMAXPSZrmi 2971 -VMINMAXPSZrmik 2972 -VMINMAXPSZrmikz 2973 -VMINMAXPSZrri 2974 -VMINMAXPSZrrib 2975 -VMINMAXPSZrribk 2976 -VMINMAXPSZrribkz 2977 -VMINMAXPSZrrik 2978 -VMINMAXPSZrrikz 2979 -VMINMAXSDrmi 2980 -VMINMAXSDrmi_Int 2981 -VMINMAXSDrmik_Int 2982 -VMINMAXSDrmikz_Int 2983 -VMINMAXSDrri 2984 
-VMINMAXSDrri_Int 2985 -VMINMAXSDrrib_Int 2986 -VMINMAXSDrribk_Int 2987 -VMINMAXSDrribkz_Int 2988 -VMINMAXSDrrik_Int 2989 -VMINMAXSDrrikz_Int 2990 -VMINMAXSHrmi 2991 -VMINMAXSHrmi_Int 2992 -VMINMAXSHrmik_Int 2993 -VMINMAXSHrmikz_Int 2994 -VMINMAXSHrri 2995 -VMINMAXSHrri_Int 2996 -VMINMAXSHrrib_Int 2997 -VMINMAXSHrribk_Int 2998 -VMINMAXSHrribkz_Int 2999 -VMINMAXSHrrik_Int 3000 -VMINMAXSHrrikz_Int 3001 -VMINMAXSSrmi 3002 -VMINMAXSSrmi_Int 3003 -VMINMAXSSrmik_Int 3004 -VMINMAXSSrmikz_Int 3005 -VMINMAXSSrri 3006 -VMINMAXSSrri_Int 3007 -VMINMAXSSrrib_Int 3008 -VMINMAXSSrribk_Int 3009 -VMINMAXSSrribkz_Int 3010 -VMINMAXSSrrik_Int 3011 -VMINMAXSSrrikz_Int 3012 -VMINPDYrm 3013 -VMINPDYrr 3014 -VMINPDZ 3015 -VMINPDZrm 3016 -VMINPDZrmb 3017 -VMINPDZrmbk 3018 -VMINPDZrmbkz 3019 -VMINPDZrmk 3020 -VMINPDZrmkz 3021 -VMINPDZrr 3022 -VMINPDZrrb 3023 -VMINPDZrrbk 3024 -VMINPDZrrbkz 3025 -VMINPDZrrk 3026 -VMINPDZrrkz 3027 -VMINPDrm 3028 -VMINPDrr 3029 -VMINPHZ 3030 -VMINPHZrm 3031 -VMINPHZrmb 3032 -VMINPHZrmbk 3033 -VMINPHZrmbkz 3034 -VMINPHZrmk 3035 -VMINPHZrmkz 3036 -VMINPHZrr 3037 -VMINPHZrrb 3038 -VMINPHZrrbk 3039 -VMINPHZrrbkz 3040 -VMINPHZrrk 3041 -VMINPHZrrkz 3042 -VMINPSYrm 3043 -VMINPSYrr 3044 -VMINPSZ 3045 -VMINPSZrm 3046 -VMINPSZrmb 3047 -VMINPSZrmbk 3048 -VMINPSZrmbkz 3049 -VMINPSZrmk 3050 -VMINPSZrmkz 3051 -VMINPSZrr 3052 -VMINPSZrrb 3053 -VMINPSZrrbk 3054 -VMINPSZrrbkz 3055 -VMINPSZrrk 3056 -VMINPSZrrkz 3057 -VMINPSrm 3058 -VMINPSrr 3059 -VMINSDZrm 3060 -VMINSDZrm_Int 3061 -VMINSDZrmk_Int 3062 -VMINSDZrmkz_Int 3063 -VMINSDZrr 3064 -VMINSDZrr_Int 3065 -VMINSDZrrb_Int 3066 -VMINSDZrrbk_Int 3067 -VMINSDZrrbkz_Int 3068 -VMINSDZrrk_Int 3069 -VMINSDZrrkz_Int 3070 -VMINSDrm 3071 -VMINSDrm_Int 3072 -VMINSDrr 3073 -VMINSDrr_Int 3074 -VMINSHZrm 3075 -VMINSHZrm_Int 3076 -VMINSHZrmk_Int 3077 -VMINSHZrmkz_Int 3078 -VMINSHZrr 3079 -VMINSHZrr_Int 3080 -VMINSHZrrb_Int 3081 -VMINSHZrrbk_Int 3082 -VMINSHZrrbkz_Int 3083 -VMINSHZrrk_Int 3084 -VMINSHZrrkz_Int 3085 -VMINSSZrm 3086 
-VMINSSZrm_Int 3087 -VMINSSZrmk_Int 3088 -VMINSSZrmkz_Int 3089 -VMINSSZrr 3090 -VMINSSZrr_Int 3091 -VMINSSZrrb_Int 3092 -VMINSSZrrbk_Int 3093 -VMINSSZrrbkz_Int 3094 -VMINSSZrrk_Int 3095 -VMINSSZrrkz_Int 3096 -VMINSSrm 3097 -VMINSSrm_Int 3098 -VMINSSrr 3099 -VMINSSrr_Int 3100 -VMLAUNCH 3101 -VMLOAD 3102 -VMMCALL 3103 -VMOV 3104 -VMOVAPDYmr 3105 -VMOVAPDYrm 3106 -VMOVAPDYrr 3107 -VMOVAPDYrr_REV 3108 -VMOVAPDZ 3109 -VMOVAPDZmr 3110 -VMOVAPDZmrk 3111 -VMOVAPDZrm 3112 -VMOVAPDZrmk 3113 -VMOVAPDZrmkz 3114 -VMOVAPDZrr 3115 -VMOVAPDZrr_REV 3116 -VMOVAPDZrrk 3117 -VMOVAPDZrrk_REV 3118 -VMOVAPDZrrkz 3119 -VMOVAPDZrrkz_REV 3120 -VMOVAPDmr 3121 -VMOVAPDrm 3122 -VMOVAPDrr 3123 -VMOVAPDrr_REV 3124 -VMOVAPSYmr 3125 -VMOVAPSYrm 3126 -VMOVAPSYrr 3127 -VMOVAPSYrr_REV 3128 -VMOVAPSZ 3129 -VMOVAPSZmr 3130 -VMOVAPSZmrk 3131 -VMOVAPSZrm 3132 -VMOVAPSZrmk 3133 -VMOVAPSZrmkz 3134 -VMOVAPSZrr 3135 -VMOVAPSZrr_REV 3136 -VMOVAPSZrrk 3137 -VMOVAPSZrrk_REV 3138 -VMOVAPSZrrkz 3139 -VMOVAPSZrrkz_REV 3140 -VMOVAPSmr 3141 -VMOVAPSrm 3142 -VMOVAPSrr 3143 -VMOVAPSrr_REV 3144 -VMOVDDUPYrm 3145 -VMOVDDUPYrr 3146 -VMOVDDUPZ 3147 -VMOVDDUPZrm 3148 -VMOVDDUPZrmk 3149 -VMOVDDUPZrmkz 3150 -VMOVDDUPZrr 3151 -VMOVDDUPZrrk 3152 -VMOVDDUPZrrkz 3153 -VMOVDDUPrm 3154 -VMOVDDUPrr 3155 -VMOVDI 3156 -VMOVDQA 3157 -VMOVDQAYmr 3158 -VMOVDQAYrm 3159 -VMOVDQAYrr 3160 -VMOVDQAYrr_REV 3161 -VMOVDQAmr 3162 -VMOVDQArm 3163 -VMOVDQArr 3164 -VMOVDQArr_REV 3165 -VMOVDQU 3166 -VMOVDQUYmr 3167 -VMOVDQUYrm 3168 -VMOVDQUYrr 3169 -VMOVDQUYrr_REV 3170 -VMOVDQUmr 3171 -VMOVDQUrm 3172 -VMOVDQUrr 3173 -VMOVDQUrr_REV 3174 -VMOVHLPSZrr 3175 -VMOVHLPSrr 3176 -VMOVHPDZ 3177 -VMOVHPDmr 3178 -VMOVHPDrm 3179 -VMOVHPSZ 3180 -VMOVHPSmr 3181 -VMOVHPSrm 3182 -VMOVLHPSZrr 3183 -VMOVLHPSrr 3184 -VMOVLPDZ 3185 -VMOVLPDmr 3186 -VMOVLPDrm 3187 -VMOVLPSZ 3188 -VMOVLPSmr 3189 -VMOVLPSrm 3190 -VMOVMSKPDYrr 3191 -VMOVMSKPDrr 3192 -VMOVMSKPSYrr 3193 -VMOVMSKPSrr 3194 -VMOVNTDQAYrm 3195 -VMOVNTDQAZ 3196 -VMOVNTDQAZrm 3197 -VMOVNTDQArm 3198 -VMOVNTDQYmr 
3199 -VMOVNTDQZ 3200 -VMOVNTDQZmr 3201 -VMOVNTDQmr 3202 -VMOVNTPDYmr 3203 -VMOVNTPDZ 3204 -VMOVNTPDZmr 3205 -VMOVNTPDmr 3206 -VMOVNTPSYmr 3207 -VMOVNTPSZ 3208 -VMOVNTPSZmr 3209 -VMOVNTPSmr 3210 -VMOVPDI 3211 -VMOVPQI 3212 -VMOVPQIto 3213 -VMOVQI 3214 -VMOVRSBZ 3215 -VMOVRSBZm 3216 -VMOVRSBZmk 3217 -VMOVRSBZmkz 3218 -VMOVRSDZ 3219 -VMOVRSDZm 3220 -VMOVRSDZmk 3221 -VMOVRSDZmkz 3222 -VMOVRSQZ 3223 -VMOVRSQZm 3224 -VMOVRSQZmk 3225 -VMOVRSQZmkz 3226 -VMOVRSWZ 3227 -VMOVRSWZm 3228 -VMOVRSWZmk 3229 -VMOVRSWZmkz 3230 -VMOVSDZmr 3231 -VMOVSDZmrk 3232 -VMOVSDZrm 3233 -VMOVSDZrm_alt 3234 -VMOVSDZrmk 3235 -VMOVSDZrmkz 3236 -VMOVSDZrr 3237 -VMOVSDZrr_REV 3238 -VMOVSDZrrk 3239 -VMOVSDZrrk_REV 3240 -VMOVSDZrrkz 3241 -VMOVSDZrrkz_REV 3242 -VMOVSDmr 3243 -VMOVSDrm 3244 -VMOVSDrm_alt 3245 -VMOVSDrr 3246 -VMOVSDrr_REV 3247 -VMOVSDto 3248 -VMOVSH 3249 -VMOVSHDUPYrm 3250 -VMOVSHDUPYrr 3251 -VMOVSHDUPZ 3252 -VMOVSHDUPZrm 3253 -VMOVSHDUPZrmk 3254 -VMOVSHDUPZrmkz 3255 -VMOVSHDUPZrr 3256 -VMOVSHDUPZrrk 3257 -VMOVSHDUPZrrkz 3258 -VMOVSHDUPrm 3259 -VMOVSHDUPrr 3260 -VMOVSHZmr 3261 -VMOVSHZmrk 3262 -VMOVSHZrm 3263 -VMOVSHZrm_alt 3264 -VMOVSHZrmk 3265 -VMOVSHZrmkz 3266 -VMOVSHZrr 3267 -VMOVSHZrr_REV 3268 -VMOVSHZrrk 3269 -VMOVSHZrrk_REV 3270 -VMOVSHZrrkz 3271 -VMOVSHZrrkz_REV 3272 -VMOVSHtoW 3273 -VMOVSLDUPYrm 3274 -VMOVSLDUPYrr 3275 -VMOVSLDUPZ 3276 -VMOVSLDUPZrm 3277 -VMOVSLDUPZrmk 3278 -VMOVSLDUPZrmkz 3279 -VMOVSLDUPZrr 3280 -VMOVSLDUPZrrk 3281 -VMOVSLDUPZrrkz 3282 -VMOVSLDUPrm 3283 -VMOVSLDUPrr 3284 -VMOVSS 3285 -VMOVSSZmr 3286 -VMOVSSZmrk 3287 -VMOVSSZrm 3288 -VMOVSSZrm_alt 3289 -VMOVSSZrmk 3290 -VMOVSSZrmkz 3291 -VMOVSSZrr 3292 -VMOVSSZrr_REV 3293 -VMOVSSZrrk 3294 -VMOVSSZrrk_REV 3295 -VMOVSSZrrkz 3296 -VMOVSSZrrkz_REV 3297 -VMOVSSmr 3298 -VMOVSSrm 3299 -VMOVSSrm_alt 3300 -VMOVSSrr 3301 -VMOVSSrr_REV 3302 -VMOVUPDYmr 3303 -VMOVUPDYrm 3304 -VMOVUPDYrr 3305 -VMOVUPDYrr_REV 3306 -VMOVUPDZ 3307 -VMOVUPDZmr 3308 -VMOVUPDZmrk 3309 -VMOVUPDZrm 3310 -VMOVUPDZrmk 3311 -VMOVUPDZrmkz 3312 
-VMOVUPDZrr 3313 -VMOVUPDZrr_REV 3314 -VMOVUPDZrrk 3315 -VMOVUPDZrrk_REV 3316 -VMOVUPDZrrkz 3317 -VMOVUPDZrrkz_REV 3318 -VMOVUPDmr 3319 -VMOVUPDrm 3320 -VMOVUPDrr 3321 -VMOVUPDrr_REV 3322 -VMOVUPSYmr 3323 -VMOVUPSYrm 3324 -VMOVUPSYrr 3325 -VMOVUPSYrr_REV 3326 -VMOVUPSZ 3327 -VMOVUPSZmr 3328 -VMOVUPSZmrk 3329 -VMOVUPSZrm 3330 -VMOVUPSZrmk 3331 -VMOVUPSZrmkz 3332 -VMOVUPSZrr 3333 -VMOVUPSZrr_REV 3334 -VMOVUPSZrrk 3335 -VMOVUPSZrrk_REV 3336 -VMOVUPSZrrkz 3337 -VMOVUPSZrrkz_REV 3338 -VMOVUPSmr 3339 -VMOVUPSrm 3340 -VMOVUPSrr 3341 -VMOVUPSrr_REV 3342 -VMOVW 3343 -VMOVWmr 3344 -VMOVWrm 3345 -VMOVZPDILo 3346 -VMOVZPQILo 3347 -VMOVZPWILo 3348 -VMPSADBWYrmi 3349 -VMPSADBWYrri 3350 -VMPSADBWZ 3351 -VMPSADBWZrmi 3352 -VMPSADBWZrmik 3353 -VMPSADBWZrmikz 3354 -VMPSADBWZrri 3355 -VMPSADBWZrrik 3356 -VMPSADBWZrrikz 3357 -VMPSADBWrmi 3358 -VMPSADBWrri 3359 -VMPTRLDm 3360 -VMPTRSTm 3361 -VMREAD 3362 -VMRESUME 3363 -VMRUN 3364 -VMSAVE 3365 -VMULBF 3366 -VMULPDYrm 3367 -VMULPDYrr 3368 -VMULPDZ 3369 -VMULPDZrm 3370 -VMULPDZrmb 3371 -VMULPDZrmbk 3372 -VMULPDZrmbkz 3373 -VMULPDZrmk 3374 -VMULPDZrmkz 3375 -VMULPDZrr 3376 -VMULPDZrrb 3377 -VMULPDZrrbk 3378 -VMULPDZrrbkz 3379 -VMULPDZrrk 3380 -VMULPDZrrkz 3381 -VMULPDrm 3382 -VMULPDrr 3383 -VMULPHZ 3384 -VMULPHZrm 3385 -VMULPHZrmb 3386 -VMULPHZrmbk 3387 -VMULPHZrmbkz 3388 -VMULPHZrmk 3389 -VMULPHZrmkz 3390 -VMULPHZrr 3391 -VMULPHZrrb 3392 -VMULPHZrrbk 3393 -VMULPHZrrbkz 3394 -VMULPHZrrk 3395 -VMULPHZrrkz 3396 -VMULPSYrm 3397 -VMULPSYrr 3398 -VMULPSZ 3399 -VMULPSZrm 3400 -VMULPSZrmb 3401 -VMULPSZrmbk 3402 -VMULPSZrmbkz 3403 -VMULPSZrmk 3404 -VMULPSZrmkz 3405 -VMULPSZrr 3406 -VMULPSZrrb 3407 -VMULPSZrrbk 3408 -VMULPSZrrbkz 3409 -VMULPSZrrk 3410 -VMULPSZrrkz 3411 -VMULPSrm 3412 -VMULPSrr 3413 -VMULSDZrm 3414 -VMULSDZrm_Int 3415 -VMULSDZrmk_Int 3416 -VMULSDZrmkz_Int 3417 -VMULSDZrr 3418 -VMULSDZrr_Int 3419 -VMULSDZrrb_Int 3420 -VMULSDZrrbk_Int 3421 -VMULSDZrrbkz_Int 3422 -VMULSDZrrk_Int 3423 -VMULSDZrrkz_Int 3424 -VMULSDrm 3425 -VMULSDrm_Int 
3426 -VMULSDrr 3427 -VMULSDrr_Int 3428 -VMULSHZrm 3429 -VMULSHZrm_Int 3430 -VMULSHZrmk_Int 3431 -VMULSHZrmkz_Int 3432 -VMULSHZrr 3433 -VMULSHZrr_Int 3434 -VMULSHZrrb_Int 3435 -VMULSHZrrbk_Int 3436 -VMULSHZrrbkz_Int 3437 -VMULSHZrrk_Int 3438 -VMULSHZrrkz_Int 3439 -VMULSSZrm 3440 -VMULSSZrm_Int 3441 -VMULSSZrmk_Int 3442 -VMULSSZrmkz_Int 3443 -VMULSSZrr 3444 -VMULSSZrr_Int 3445 -VMULSSZrrb_Int 3446 -VMULSSZrrbk_Int 3447 -VMULSSZrrbkz_Int 3448 -VMULSSZrrk_Int 3449 -VMULSSZrrkz_Int 3450 -VMULSSrm 3451 -VMULSSrm_Int 3452 -VMULSSrr 3453 -VMULSSrr_Int 3454 -VMWRITE 3455 -VMXOFF 3456 -VMXON 3457 -VORPDYrm 3458 -VORPDYrr 3459 -VORPDZ 3460 -VORPDZrm 3461 -VORPDZrmb 3462 -VORPDZrmbk 3463 -VORPDZrmbkz 3464 -VORPDZrmk 3465 -VORPDZrmkz 3466 -VORPDZrr 3467 -VORPDZrrk 3468 -VORPDZrrkz 3469 -VORPDrm 3470 -VORPDrr 3471 -VORPSYrm 3472 -VORPSYrr 3473 -VORPSZ 3474 -VORPSZrm 3475 -VORPSZrmb 3476 -VORPSZrmbk 3477 -VORPSZrmbkz 3478 -VORPSZrmk 3479 -VORPSZrmkz 3480 -VORPSZrr 3481 -VORPSZrrk 3482 -VORPSZrrkz 3483 -VORPSrm 3484 -VORPSrr 3485 -VP 3486 -VPABSBYrm 3487 -VPABSBYrr 3488 -VPABSBZ 3489 -VPABSBZrm 3490 -VPABSBZrmk 3491 -VPABSBZrmkz 3492 -VPABSBZrr 3493 -VPABSBZrrk 3494 -VPABSBZrrkz 3495 -VPABSBrm 3496 -VPABSBrr 3497 -VPABSDYrm 3498 -VPABSDYrr 3499 -VPABSDZ 3500 -VPABSDZrm 3501 -VPABSDZrmb 3502 -VPABSDZrmbk 3503 -VPABSDZrmbkz 3504 -VPABSDZrmk 3505 -VPABSDZrmkz 3506 -VPABSDZrr 3507 -VPABSDZrrk 3508 -VPABSDZrrkz 3509 -VPABSDrm 3510 -VPABSDrr 3511 -VPABSQZ 3512 -VPABSQZrm 3513 -VPABSQZrmb 3514 -VPABSQZrmbk 3515 -VPABSQZrmbkz 3516 -VPABSQZrmk 3517 -VPABSQZrmkz 3518 -VPABSQZrr 3519 -VPABSQZrrk 3520 -VPABSQZrrkz 3521 -VPABSWYrm 3522 -VPABSWYrr 3523 -VPABSWZ 3524 -VPABSWZrm 3525 -VPABSWZrmk 3526 -VPABSWZrmkz 3527 -VPABSWZrr 3528 -VPABSWZrrk 3529 -VPABSWZrrkz 3530 -VPABSWrm 3531 -VPABSWrr 3532 -VPACKSSDWYrm 3533 -VPACKSSDWYrr 3534 -VPACKSSDWZ 3535 -VPACKSSDWZrm 3536 -VPACKSSDWZrmb 3537 -VPACKSSDWZrmbk 3538 -VPACKSSDWZrmbkz 3539 -VPACKSSDWZrmk 3540 -VPACKSSDWZrmkz 3541 -VPACKSSDWZrr 3542 
-VPACKSSDWZrrk 3543 -VPACKSSDWZrrkz 3544 -VPACKSSDWrm 3545 -VPACKSSDWrr 3546 -VPACKSSWBYrm 3547 -VPACKSSWBYrr 3548 -VPACKSSWBZ 3549 -VPACKSSWBZrm 3550 -VPACKSSWBZrmk 3551 -VPACKSSWBZrmkz 3552 -VPACKSSWBZrr 3553 -VPACKSSWBZrrk 3554 -VPACKSSWBZrrkz 3555 -VPACKSSWBrm 3556 -VPACKSSWBrr 3557 -VPACKUSDWYrm 3558 -VPACKUSDWYrr 3559 -VPACKUSDWZ 3560 -VPACKUSDWZrm 3561 -VPACKUSDWZrmb 3562 -VPACKUSDWZrmbk 3563 -VPACKUSDWZrmbkz 3564 -VPACKUSDWZrmk 3565 -VPACKUSDWZrmkz 3566 -VPACKUSDWZrr 3567 -VPACKUSDWZrrk 3568 -VPACKUSDWZrrkz 3569 -VPACKUSDWrm 3570 -VPACKUSDWrr 3571 -VPACKUSWBYrm 3572 -VPACKUSWBYrr 3573 -VPACKUSWBZ 3574 -VPACKUSWBZrm 3575 -VPACKUSWBZrmk 3576 -VPACKUSWBZrmkz 3577 -VPACKUSWBZrr 3578 -VPACKUSWBZrrk 3579 -VPACKUSWBZrrkz 3580 -VPACKUSWBrm 3581 -VPACKUSWBrr 3582 -VPADDBYrm 3583 -VPADDBYrr 3584 -VPADDBZ 3585 -VPADDBZrm 3586 -VPADDBZrmk 3587 -VPADDBZrmkz 3588 -VPADDBZrr 3589 -VPADDBZrrk 3590 -VPADDBZrrkz 3591 -VPADDBrm 3592 -VPADDBrr 3593 -VPADDDYrm 3594 -VPADDDYrr 3595 -VPADDDZ 3596 -VPADDDZrm 3597 -VPADDDZrmb 3598 -VPADDDZrmbk 3599 -VPADDDZrmbkz 3600 -VPADDDZrmk 3601 -VPADDDZrmkz 3602 -VPADDDZrr 3603 -VPADDDZrrk 3604 -VPADDDZrrkz 3605 -VPADDDrm 3606 -VPADDDrr 3607 -VPADDQYrm 3608 -VPADDQYrr 3609 -VPADDQZ 3610 -VPADDQZrm 3611 -VPADDQZrmb 3612 -VPADDQZrmbk 3613 -VPADDQZrmbkz 3614 -VPADDQZrmk 3615 -VPADDQZrmkz 3616 -VPADDQZrr 3617 -VPADDQZrrk 3618 -VPADDQZrrkz 3619 -VPADDQrm 3620 -VPADDQrr 3621 -VPADDSBYrm 3622 -VPADDSBYrr 3623 -VPADDSBZ 3624 -VPADDSBZrm 3625 -VPADDSBZrmk 3626 -VPADDSBZrmkz 3627 -VPADDSBZrr 3628 -VPADDSBZrrk 3629 -VPADDSBZrrkz 3630 -VPADDSBrm 3631 -VPADDSBrr 3632 -VPADDSWYrm 3633 -VPADDSWYrr 3634 -VPADDSWZ 3635 -VPADDSWZrm 3636 -VPADDSWZrmk 3637 -VPADDSWZrmkz 3638 -VPADDSWZrr 3639 -VPADDSWZrrk 3640 -VPADDSWZrrkz 3641 -VPADDSWrm 3642 -VPADDSWrr 3643 -VPADDUSBYrm 3644 -VPADDUSBYrr 3645 -VPADDUSBZ 3646 -VPADDUSBZrm 3647 -VPADDUSBZrmk 3648 -VPADDUSBZrmkz 3649 -VPADDUSBZrr 3650 -VPADDUSBZrrk 3651 -VPADDUSBZrrkz 3652 -VPADDUSBrm 3653 -VPADDUSBrr 3654 
-VPADDUSWYrm 3655 -VPADDUSWYrr 3656 -VPADDUSWZ 3657 -VPADDUSWZrm 3658 -VPADDUSWZrmk 3659 -VPADDUSWZrmkz 3660 -VPADDUSWZrr 3661 -VPADDUSWZrrk 3662 -VPADDUSWZrrkz 3663 -VPADDUSWrm 3664 -VPADDUSWrr 3665 -VPADDWYrm 3666 -VPADDWYrr 3667 -VPADDWZ 3668 -VPADDWZrm 3669 -VPADDWZrmk 3670 -VPADDWZrmkz 3671 -VPADDWZrr 3672 -VPADDWZrrk 3673 -VPADDWZrrkz 3674 -VPADDWrm 3675 -VPADDWrr 3676 -VPALIGNRYrmi 3677 -VPALIGNRYrri 3678 -VPALIGNRZ 3679 -VPALIGNRZrmi 3680 -VPALIGNRZrmik 3681 -VPALIGNRZrmikz 3682 -VPALIGNRZrri 3683 -VPALIGNRZrrik 3684 -VPALIGNRZrrikz 3685 -VPALIGNRrmi 3686 -VPALIGNRrri 3687 -VPANDDZ 3688 -VPANDDZrm 3689 -VPANDDZrmb 3690 -VPANDDZrmbk 3691 -VPANDDZrmbkz 3692 -VPANDDZrmk 3693 -VPANDDZrmkz 3694 -VPANDDZrr 3695 -VPANDDZrrk 3696 -VPANDDZrrkz 3697 -VPANDNDZ 3698 -VPANDNDZrm 3699 -VPANDNDZrmb 3700 -VPANDNDZrmbk 3701 -VPANDNDZrmbkz 3702 -VPANDNDZrmk 3703 -VPANDNDZrmkz 3704 -VPANDNDZrr 3705 -VPANDNDZrrk 3706 -VPANDNDZrrkz 3707 -VPANDNQZ 3708 -VPANDNQZrm 3709 -VPANDNQZrmb 3710 -VPANDNQZrmbk 3711 -VPANDNQZrmbkz 3712 -VPANDNQZrmk 3713 -VPANDNQZrmkz 3714 -VPANDNQZrr 3715 -VPANDNQZrrk 3716 -VPANDNQZrrkz 3717 -VPANDNYrm 3718 -VPANDNYrr 3719 -VPANDNrm 3720 -VPANDNrr 3721 -VPANDQZ 3722 -VPANDQZrm 3723 -VPANDQZrmb 3724 -VPANDQZrmbk 3725 -VPANDQZrmbkz 3726 -VPANDQZrmk 3727 -VPANDQZrmkz 3728 -VPANDQZrr 3729 -VPANDQZrrk 3730 -VPANDQZrrkz 3731 -VPANDYrm 3732 -VPANDYrr 3733 -VPANDrm 3734 -VPANDrr 3735 -VPAVGBYrm 3736 -VPAVGBYrr 3737 -VPAVGBZ 3738 -VPAVGBZrm 3739 -VPAVGBZrmk 3740 -VPAVGBZrmkz 3741 -VPAVGBZrr 3742 -VPAVGBZrrk 3743 -VPAVGBZrrkz 3744 -VPAVGBrm 3745 -VPAVGBrr 3746 -VPAVGWYrm 3747 -VPAVGWYrr 3748 -VPAVGWZ 3749 -VPAVGWZrm 3750 -VPAVGWZrmk 3751 -VPAVGWZrmkz 3752 -VPAVGWZrr 3753 -VPAVGWZrrk 3754 -VPAVGWZrrkz 3755 -VPAVGWrm 3756 -VPAVGWrr 3757 -VPBLENDDYrmi 3758 -VPBLENDDYrri 3759 -VPBLENDDrmi 3760 -VPBLENDDrri 3761 -VPBLENDMBZ 3762 -VPBLENDMBZrm 3763 -VPBLENDMBZrmk 3764 -VPBLENDMBZrmkz 3765 -VPBLENDMBZrr 3766 -VPBLENDMBZrrk 3767 -VPBLENDMBZrrkz 3768 -VPBLENDMDZ 3769 
-VPBLENDMDZrm 3770 -VPBLENDMDZrmb 3771 -VPBLENDMDZrmbk 3772 -VPBLENDMDZrmbkz 3773 -VPBLENDMDZrmk 3774 -VPBLENDMDZrmkz 3775 -VPBLENDMDZrr 3776 -VPBLENDMDZrrk 3777 -VPBLENDMDZrrkz 3778 -VPBLENDMQZ 3779 -VPBLENDMQZrm 3780 -VPBLENDMQZrmb 3781 -VPBLENDMQZrmbk 3782 -VPBLENDMQZrmbkz 3783 -VPBLENDMQZrmk 3784 -VPBLENDMQZrmkz 3785 -VPBLENDMQZrr 3786 -VPBLENDMQZrrk 3787 -VPBLENDMQZrrkz 3788 -VPBLENDMWZ 3789 -VPBLENDMWZrm 3790 -VPBLENDMWZrmk 3791 -VPBLENDMWZrmkz 3792 -VPBLENDMWZrr 3793 -VPBLENDMWZrrk 3794 -VPBLENDMWZrrkz 3795 -VPBLENDVBYrmr 3796 -VPBLENDVBYrrr 3797 -VPBLENDVBrmr 3798 -VPBLENDVBrrr 3799 -VPBLENDWYrmi 3800 -VPBLENDWYrri 3801 -VPBLENDWrmi 3802 -VPBLENDWrri 3803 -VPBROADCASTBYrm 3804 -VPBROADCASTBYrr 3805 -VPBROADCASTBZ 3806 -VPBROADCASTBZrm 3807 -VPBROADCASTBZrmk 3808 -VPBROADCASTBZrmkz 3809 -VPBROADCASTBZrr 3810 -VPBROADCASTBZrrk 3811 -VPBROADCASTBZrrkz 3812 -VPBROADCASTBrZ 3813 -VPBROADCASTBrZrr 3814 -VPBROADCASTBrZrrk 3815 -VPBROADCASTBrZrrkz 3816 -VPBROADCASTBrm 3817 -VPBROADCASTBrr 3818 -VPBROADCASTDYrm 3819 -VPBROADCASTDYrr 3820 -VPBROADCASTDZ 3821 -VPBROADCASTDZrm 3822 -VPBROADCASTDZrmk 3823 -VPBROADCASTDZrmkz 3824 -VPBROADCASTDZrr 3825 -VPBROADCASTDZrrk 3826 -VPBROADCASTDZrrkz 3827 -VPBROADCASTDrZ 3828 -VPBROADCASTDrZrr 3829 -VPBROADCASTDrZrrk 3830 -VPBROADCASTDrZrrkz 3831 -VPBROADCASTDrm 3832 -VPBROADCASTDrr 3833 -VPBROADCASTMB 3834 -VPBROADCASTMW 3835 -VPBROADCASTQYrm 3836 -VPBROADCASTQYrr 3837 -VPBROADCASTQZ 3838 -VPBROADCASTQZrm 3839 -VPBROADCASTQZrmk 3840 -VPBROADCASTQZrmkz 3841 -VPBROADCASTQZrr 3842 -VPBROADCASTQZrrk 3843 -VPBROADCASTQZrrkz 3844 -VPBROADCASTQrZ 3845 -VPBROADCASTQrZrr 3846 -VPBROADCASTQrZrrk 3847 -VPBROADCASTQrZrrkz 3848 -VPBROADCASTQrm 3849 -VPBROADCASTQrr 3850 -VPBROADCASTWYrm 3851 -VPBROADCASTWYrr 3852 -VPBROADCASTWZ 3853 -VPBROADCASTWZrm 3854 -VPBROADCASTWZrmk 3855 -VPBROADCASTWZrmkz 3856 -VPBROADCASTWZrr 3857 -VPBROADCASTWZrrk 3858 -VPBROADCASTWZrrkz 3859 -VPBROADCASTWrZ 3860 -VPBROADCASTWrZrr 3861 -VPBROADCASTWrZrrk 3862 
-VPBROADCASTWrZrrkz 3863 -VPBROADCASTWrm 3864 -VPBROADCASTWrr 3865 -VPCLMULQDQYrmi 3866 -VPCLMULQDQYrri 3867 -VPCLMULQDQZ 3868 -VPCLMULQDQZrmi 3869 -VPCLMULQDQZrri 3870 -VPCLMULQDQrmi 3871 -VPCLMULQDQrri 3872 -VPCMOVYrmr 3873 -VPCMOVYrrm 3874 -VPCMOVYrrr 3875 -VPCMOVYrrr_REV 3876 -VPCMOVrmr 3877 -VPCMOVrrm 3878 -VPCMOVrrr 3879 -VPCMOVrrr_REV 3880 -VPCMPBZ 3881 -VPCMPBZrmi 3882 -VPCMPBZrmik 3883 -VPCMPBZrri 3884 -VPCMPBZrrik 3885 -VPCMPDZ 3886 -VPCMPDZrmbi 3887 -VPCMPDZrmbik 3888 -VPCMPDZrmi 3889 -VPCMPDZrmik 3890 -VPCMPDZrri 3891 -VPCMPDZrrik 3892 -VPCMPEQBYrm 3893 -VPCMPEQBYrr 3894 -VPCMPEQBZ 3895 -VPCMPEQBZrm 3896 -VPCMPEQBZrmk 3897 -VPCMPEQBZrr 3898 -VPCMPEQBZrrk 3899 -VPCMPEQBrm 3900 -VPCMPEQBrr 3901 -VPCMPEQDYrm 3902 -VPCMPEQDYrr 3903 -VPCMPEQDZ 3904 -VPCMPEQDZrm 3905 -VPCMPEQDZrmb 3906 -VPCMPEQDZrmbk 3907 -VPCMPEQDZrmk 3908 -VPCMPEQDZrr 3909 -VPCMPEQDZrrk 3910 -VPCMPEQDrm 3911 -VPCMPEQDrr 3912 -VPCMPEQQYrm 3913 -VPCMPEQQYrr 3914 -VPCMPEQQZ 3915 -VPCMPEQQZrm 3916 -VPCMPEQQZrmb 3917 -VPCMPEQQZrmbk 3918 -VPCMPEQQZrmk 3919 -VPCMPEQQZrr 3920 -VPCMPEQQZrrk 3921 -VPCMPEQQrm 3922 -VPCMPEQQrr 3923 -VPCMPEQWYrm 3924 -VPCMPEQWYrr 3925 -VPCMPEQWZ 3926 -VPCMPEQWZrm 3927 -VPCMPEQWZrmk 3928 -VPCMPEQWZrr 3929 -VPCMPEQWZrrk 3930 -VPCMPEQWrm 3931 -VPCMPEQWrr 3932 -VPCMPESTRIrmi 3933 -VPCMPESTRIrri 3934 -VPCMPESTRMrmi 3935 -VPCMPESTRMrri 3936 -VPCMPGTBYrm 3937 -VPCMPGTBYrr 3938 -VPCMPGTBZ 3939 -VPCMPGTBZrm 3940 -VPCMPGTBZrmk 3941 -VPCMPGTBZrr 3942 -VPCMPGTBZrrk 3943 -VPCMPGTBrm 3944 -VPCMPGTBrr 3945 -VPCMPGTDYrm 3946 -VPCMPGTDYrr 3947 -VPCMPGTDZ 3948 -VPCMPGTDZrm 3949 -VPCMPGTDZrmb 3950 -VPCMPGTDZrmbk 3951 -VPCMPGTDZrmk 3952 -VPCMPGTDZrr 3953 -VPCMPGTDZrrk 3954 -VPCMPGTDrm 3955 -VPCMPGTDrr 3956 -VPCMPGTQYrm 3957 -VPCMPGTQYrr 3958 -VPCMPGTQZ 3959 -VPCMPGTQZrm 3960 -VPCMPGTQZrmb 3961 -VPCMPGTQZrmbk 3962 -VPCMPGTQZrmk 3963 -VPCMPGTQZrr 3964 -VPCMPGTQZrrk 3965 -VPCMPGTQrm 3966 -VPCMPGTQrr 3967 -VPCMPGTWYrm 3968 -VPCMPGTWYrr 3969 -VPCMPGTWZ 3970 -VPCMPGTWZrm 3971 -VPCMPGTWZrmk 3972 
-VPCMPGTWZrr 3973 -VPCMPGTWZrrk 3974 -VPCMPGTWrm 3975 -VPCMPGTWrr 3976 -VPCMPISTRIrmi 3977 -VPCMPISTRIrri 3978 -VPCMPISTRMrmi 3979 -VPCMPISTRMrri 3980 -VPCMPQZ 3981 -VPCMPQZrmbi 3982 -VPCMPQZrmbik 3983 -VPCMPQZrmi 3984 -VPCMPQZrmik 3985 -VPCMPQZrri 3986 -VPCMPQZrrik 3987 -VPCMPUBZ 3988 -VPCMPUBZrmi 3989 -VPCMPUBZrmik 3990 -VPCMPUBZrri 3991 -VPCMPUBZrrik 3992 -VPCMPUDZ 3993 -VPCMPUDZrmbi 3994 -VPCMPUDZrmbik 3995 -VPCMPUDZrmi 3996 -VPCMPUDZrmik 3997 -VPCMPUDZrri 3998 -VPCMPUDZrrik 3999 -VPCMPUQZ 4000 -VPCMPUQZrmbi 4001 -VPCMPUQZrmbik 4002 -VPCMPUQZrmi 4003 -VPCMPUQZrmik 4004 -VPCMPUQZrri 4005 -VPCMPUQZrrik 4006 -VPCMPUWZ 4007 -VPCMPUWZrmi 4008 -VPCMPUWZrmik 4009 -VPCMPUWZrri 4010 -VPCMPUWZrrik 4011 -VPCMPWZ 4012 -VPCMPWZrmi 4013 -VPCMPWZrmik 4014 -VPCMPWZrri 4015 -VPCMPWZrrik 4016 -VPCOMBmi 4017 -VPCOMBri 4018 -VPCOMDmi 4019 -VPCOMDri 4020 -VPCOMPRESSBZ 4021 -VPCOMPRESSBZmr 4022 -VPCOMPRESSBZmrk 4023 -VPCOMPRESSBZrr 4024 -VPCOMPRESSBZrrk 4025 -VPCOMPRESSBZrrkz 4026 -VPCOMPRESSDZ 4027 -VPCOMPRESSDZmr 4028 -VPCOMPRESSDZmrk 4029 -VPCOMPRESSDZrr 4030 -VPCOMPRESSDZrrk 4031 -VPCOMPRESSDZrrkz 4032 -VPCOMPRESSQZ 4033 -VPCOMPRESSQZmr 4034 -VPCOMPRESSQZmrk 4035 -VPCOMPRESSQZrr 4036 -VPCOMPRESSQZrrk 4037 -VPCOMPRESSQZrrkz 4038 -VPCOMPRESSWZ 4039 -VPCOMPRESSWZmr 4040 -VPCOMPRESSWZmrk 4041 -VPCOMPRESSWZrr 4042 -VPCOMPRESSWZrrk 4043 -VPCOMPRESSWZrrkz 4044 -VPCOMQmi 4045 -VPCOMQri 4046 -VPCOMUBmi 4047 -VPCOMUBri 4048 -VPCOMUDmi 4049 -VPCOMUDri 4050 -VPCOMUQmi 4051 -VPCOMUQri 4052 -VPCOMUWmi 4053 -VPCOMUWri 4054 -VPCOMWmi 4055 -VPCOMWri 4056 -VPCONFLICTDZ 4057 -VPCONFLICTDZrm 4058 -VPCONFLICTDZrmb 4059 -VPCONFLICTDZrmbk 4060 -VPCONFLICTDZrmbkz 4061 -VPCONFLICTDZrmk 4062 -VPCONFLICTDZrmkz 4063 -VPCONFLICTDZrr 4064 -VPCONFLICTDZrrk 4065 -VPCONFLICTDZrrkz 4066 -VPCONFLICTQZ 4067 -VPCONFLICTQZrm 4068 -VPCONFLICTQZrmb 4069 -VPCONFLICTQZrmbk 4070 -VPCONFLICTQZrmbkz 4071 -VPCONFLICTQZrmk 4072 -VPCONFLICTQZrmkz 4073 -VPCONFLICTQZrr 4074 -VPCONFLICTQZrrk 4075 -VPCONFLICTQZrrkz 4076 
-VPDPBSSDSYrm 4077 -VPDPBSSDSYrr 4078 -VPDPBSSDSZ 4079 -VPDPBSSDSZrm 4080 -VPDPBSSDSZrmb 4081 -VPDPBSSDSZrmbk 4082 -VPDPBSSDSZrmbkz 4083 -VPDPBSSDSZrmk 4084 -VPDPBSSDSZrmkz 4085 -VPDPBSSDSZrr 4086 -VPDPBSSDSZrrk 4087 -VPDPBSSDSZrrkz 4088 -VPDPBSSDSrm 4089 -VPDPBSSDSrr 4090 -VPDPBSSDYrm 4091 -VPDPBSSDYrr 4092 -VPDPBSSDZ 4093 -VPDPBSSDZrm 4094 -VPDPBSSDZrmb 4095 -VPDPBSSDZrmbk 4096 -VPDPBSSDZrmbkz 4097 -VPDPBSSDZrmk 4098 -VPDPBSSDZrmkz 4099 -VPDPBSSDZrr 4100 -VPDPBSSDZrrk 4101 -VPDPBSSDZrrkz 4102 -VPDPBSSDrm 4103 -VPDPBSSDrr 4104 -VPDPBSUDSYrm 4105 -VPDPBSUDSYrr 4106 -VPDPBSUDSZ 4107 -VPDPBSUDSZrm 4108 -VPDPBSUDSZrmb 4109 -VPDPBSUDSZrmbk 4110 -VPDPBSUDSZrmbkz 4111 -VPDPBSUDSZrmk 4112 -VPDPBSUDSZrmkz 4113 -VPDPBSUDSZrr 4114 -VPDPBSUDSZrrk 4115 -VPDPBSUDSZrrkz 4116 -VPDPBSUDSrm 4117 -VPDPBSUDSrr 4118 -VPDPBSUDYrm 4119 -VPDPBSUDYrr 4120 -VPDPBSUDZ 4121 -VPDPBSUDZrm 4122 -VPDPBSUDZrmb 4123 -VPDPBSUDZrmbk 4124 -VPDPBSUDZrmbkz 4125 -VPDPBSUDZrmk 4126 -VPDPBSUDZrmkz 4127 -VPDPBSUDZrr 4128 -VPDPBSUDZrrk 4129 -VPDPBSUDZrrkz 4130 -VPDPBSUDrm 4131 -VPDPBSUDrr 4132 -VPDPBUSDSYrm 4133 -VPDPBUSDSYrr 4134 -VPDPBUSDSZ 4135 -VPDPBUSDSZrm 4136 -VPDPBUSDSZrmb 4137 -VPDPBUSDSZrmbk 4138 -VPDPBUSDSZrmbkz 4139 -VPDPBUSDSZrmk 4140 -VPDPBUSDSZrmkz 4141 -VPDPBUSDSZrr 4142 -VPDPBUSDSZrrk 4143 -VPDPBUSDSZrrkz 4144 -VPDPBUSDSrm 4145 -VPDPBUSDSrr 4146 -VPDPBUSDYrm 4147 -VPDPBUSDYrr 4148 -VPDPBUSDZ 4149 -VPDPBUSDZrm 4150 -VPDPBUSDZrmb 4151 -VPDPBUSDZrmbk 4152 -VPDPBUSDZrmbkz 4153 -VPDPBUSDZrmk 4154 -VPDPBUSDZrmkz 4155 -VPDPBUSDZrr 4156 -VPDPBUSDZrrk 4157 -VPDPBUSDZrrkz 4158 -VPDPBUSDrm 4159 -VPDPBUSDrr 4160 -VPDPBUUDSYrm 4161 -VPDPBUUDSYrr 4162 -VPDPBUUDSZ 4163 -VPDPBUUDSZrm 4164 -VPDPBUUDSZrmb 4165 -VPDPBUUDSZrmbk 4166 -VPDPBUUDSZrmbkz 4167 -VPDPBUUDSZrmk 4168 -VPDPBUUDSZrmkz 4169 -VPDPBUUDSZrr 4170 -VPDPBUUDSZrrk 4171 -VPDPBUUDSZrrkz 4172 -VPDPBUUDSrm 4173 -VPDPBUUDSrr 4174 -VPDPBUUDYrm 4175 -VPDPBUUDYrr 4176 -VPDPBUUDZ 4177 -VPDPBUUDZrm 4178 -VPDPBUUDZrmb 4179 -VPDPBUUDZrmbk 4180 
-VPDPBUUDZrmbkz 4181 -VPDPBUUDZrmk 4182 -VPDPBUUDZrmkz 4183 -VPDPBUUDZrr 4184 -VPDPBUUDZrrk 4185 -VPDPBUUDZrrkz 4186 -VPDPBUUDrm 4187 -VPDPBUUDrr 4188 -VPDPWSSDSYrm 4189 -VPDPWSSDSYrr 4190 -VPDPWSSDSZ 4191 -VPDPWSSDSZrm 4192 -VPDPWSSDSZrmb 4193 -VPDPWSSDSZrmbk 4194 -VPDPWSSDSZrmbkz 4195 -VPDPWSSDSZrmk 4196 -VPDPWSSDSZrmkz 4197 -VPDPWSSDSZrr 4198 -VPDPWSSDSZrrk 4199 -VPDPWSSDSZrrkz 4200 -VPDPWSSDSrm 4201 -VPDPWSSDSrr 4202 -VPDPWSSDYrm 4203 -VPDPWSSDYrr 4204 -VPDPWSSDZ 4205 -VPDPWSSDZrm 4206 -VPDPWSSDZrmb 4207 -VPDPWSSDZrmbk 4208 -VPDPWSSDZrmbkz 4209 -VPDPWSSDZrmk 4210 -VPDPWSSDZrmkz 4211 -VPDPWSSDZrr 4212 -VPDPWSSDZrrk 4213 -VPDPWSSDZrrkz 4214 -VPDPWSSDrm 4215 -VPDPWSSDrr 4216 -VPDPWSUDSYrm 4217 -VPDPWSUDSYrr 4218 -VPDPWSUDSZ 4219 -VPDPWSUDSZrm 4220 -VPDPWSUDSZrmb 4221 -VPDPWSUDSZrmbk 4222 -VPDPWSUDSZrmbkz 4223 -VPDPWSUDSZrmk 4224 -VPDPWSUDSZrmkz 4225 -VPDPWSUDSZrr 4226 -VPDPWSUDSZrrk 4227 -VPDPWSUDSZrrkz 4228 -VPDPWSUDSrm 4229 -VPDPWSUDSrr 4230 -VPDPWSUDYrm 4231 -VPDPWSUDYrr 4232 -VPDPWSUDZ 4233 -VPDPWSUDZrm 4234 -VPDPWSUDZrmb 4235 -VPDPWSUDZrmbk 4236 -VPDPWSUDZrmbkz 4237 -VPDPWSUDZrmk 4238 -VPDPWSUDZrmkz 4239 -VPDPWSUDZrr 4240 -VPDPWSUDZrrk 4241 -VPDPWSUDZrrkz 4242 -VPDPWSUDrm 4243 -VPDPWSUDrr 4244 -VPDPWUSDSYrm 4245 -VPDPWUSDSYrr 4246 -VPDPWUSDSZ 4247 -VPDPWUSDSZrm 4248 -VPDPWUSDSZrmb 4249 -VPDPWUSDSZrmbk 4250 -VPDPWUSDSZrmbkz 4251 -VPDPWUSDSZrmk 4252 -VPDPWUSDSZrmkz 4253 -VPDPWUSDSZrr 4254 -VPDPWUSDSZrrk 4255 -VPDPWUSDSZrrkz 4256 -VPDPWUSDSrm 4257 -VPDPWUSDSrr 4258 -VPDPWUSDYrm 4259 -VPDPWUSDYrr 4260 -VPDPWUSDZ 4261 -VPDPWUSDZrm 4262 -VPDPWUSDZrmb 4263 -VPDPWUSDZrmbk 4264 -VPDPWUSDZrmbkz 4265 -VPDPWUSDZrmk 4266 -VPDPWUSDZrmkz 4267 -VPDPWUSDZrr 4268 -VPDPWUSDZrrk 4269 -VPDPWUSDZrrkz 4270 -VPDPWUSDrm 4271 -VPDPWUSDrr 4272 -VPDPWUUDSYrm 4273 -VPDPWUUDSYrr 4274 -VPDPWUUDSZ 4275 -VPDPWUUDSZrm 4276 -VPDPWUUDSZrmb 4277 -VPDPWUUDSZrmbk 4278 -VPDPWUUDSZrmbkz 4279 -VPDPWUUDSZrmk 4280 -VPDPWUUDSZrmkz 4281 -VPDPWUUDSZrr 4282 -VPDPWUUDSZrrk 4283 -VPDPWUUDSZrrkz 4284 
-VPDPWUUDSrm 4285 -VPDPWUUDSrr 4286 -VPDPWUUDYrm 4287 -VPDPWUUDYrr 4288 -VPDPWUUDZ 4289 -VPDPWUUDZrm 4290 -VPDPWUUDZrmb 4291 -VPDPWUUDZrmbk 4292 -VPDPWUUDZrmbkz 4293 -VPDPWUUDZrmk 4294 -VPDPWUUDZrmkz 4295 -VPDPWUUDZrr 4296 -VPDPWUUDZrrk 4297 -VPDPWUUDZrrkz 4298 -VPDPWUUDrm 4299 -VPDPWUUDrr 4300 -VPERM 4301 -VPERMBZ 4302 -VPERMBZrm 4303 -VPERMBZrmk 4304 -VPERMBZrmkz 4305 -VPERMBZrr 4306 -VPERMBZrrk 4307 -VPERMBZrrkz 4308 -VPERMDYrm 4309 -VPERMDYrr 4310 -VPERMDZ 4311 -VPERMDZrm 4312 -VPERMDZrmb 4313 -VPERMDZrmbk 4314 -VPERMDZrmbkz 4315 -VPERMDZrmk 4316 -VPERMDZrmkz 4317 -VPERMDZrr 4318 -VPERMDZrrk 4319 -VPERMDZrrkz 4320 -VPERMI 4321 -VPERMIL 4322 -VPERMILPDYmi 4323 -VPERMILPDYri 4324 -VPERMILPDYrm 4325 -VPERMILPDYrr 4326 -VPERMILPDZ 4327 -VPERMILPDZmbi 4328 -VPERMILPDZmbik 4329 -VPERMILPDZmbikz 4330 -VPERMILPDZmi 4331 -VPERMILPDZmik 4332 -VPERMILPDZmikz 4333 -VPERMILPDZri 4334 -VPERMILPDZrik 4335 -VPERMILPDZrikz 4336 -VPERMILPDZrm 4337 -VPERMILPDZrmb 4338 -VPERMILPDZrmbk 4339 -VPERMILPDZrmbkz 4340 -VPERMILPDZrmk 4341 -VPERMILPDZrmkz 4342 -VPERMILPDZrr 4343 -VPERMILPDZrrk 4344 -VPERMILPDZrrkz 4345 -VPERMILPDmi 4346 -VPERMILPDri 4347 -VPERMILPDrm 4348 -VPERMILPDrr 4349 -VPERMILPSYmi 4350 -VPERMILPSYri 4351 -VPERMILPSYrm 4352 -VPERMILPSYrr 4353 -VPERMILPSZ 4354 -VPERMILPSZmbi 4355 -VPERMILPSZmbik 4356 -VPERMILPSZmbikz 4357 -VPERMILPSZmi 4358 -VPERMILPSZmik 4359 -VPERMILPSZmikz 4360 -VPERMILPSZri 4361 -VPERMILPSZrik 4362 -VPERMILPSZrikz 4363 -VPERMILPSZrm 4364 -VPERMILPSZrmb 4365 -VPERMILPSZrmbk 4366 -VPERMILPSZrmbkz 4367 -VPERMILPSZrmk 4368 -VPERMILPSZrmkz 4369 -VPERMILPSZrr 4370 -VPERMILPSZrrk 4371 -VPERMILPSZrrkz 4372 -VPERMILPSmi 4373 -VPERMILPSri 4374 -VPERMILPSrm 4375 -VPERMILPSrr 4376 -VPERMPDYmi 4377 -VPERMPDYri 4378 -VPERMPDZ 4379 -VPERMPDZmbi 4380 -VPERMPDZmbik 4381 -VPERMPDZmbikz 4382 -VPERMPDZmi 4383 -VPERMPDZmik 4384 -VPERMPDZmikz 4385 -VPERMPDZri 4386 -VPERMPDZrik 4387 -VPERMPDZrikz 4388 -VPERMPDZrm 4389 -VPERMPDZrmb 4390 -VPERMPDZrmbk 4391 -VPERMPDZrmbkz 
4392 -VPERMPDZrmk 4393 -VPERMPDZrmkz 4394 -VPERMPDZrr 4395 -VPERMPDZrrk 4396 -VPERMPDZrrkz 4397 -VPERMPSYrm 4398 -VPERMPSYrr 4399 -VPERMPSZ 4400 -VPERMPSZrm 4401 -VPERMPSZrmb 4402 -VPERMPSZrmbk 4403 -VPERMPSZrmbkz 4404 -VPERMPSZrmk 4405 -VPERMPSZrmkz 4406 -VPERMPSZrr 4407 -VPERMPSZrrk 4408 -VPERMPSZrrkz 4409 -VPERMQYmi 4410 -VPERMQYri 4411 -VPERMQZ 4412 -VPERMQZmbi 4413 -VPERMQZmbik 4414 -VPERMQZmbikz 4415 -VPERMQZmi 4416 -VPERMQZmik 4417 -VPERMQZmikz 4418 -VPERMQZri 4419 -VPERMQZrik 4420 -VPERMQZrikz 4421 -VPERMQZrm 4422 -VPERMQZrmb 4423 -VPERMQZrmbk 4424 -VPERMQZrmbkz 4425 -VPERMQZrmk 4426 -VPERMQZrmkz 4427 -VPERMQZrr 4428 -VPERMQZrrk 4429 -VPERMQZrrkz 4430 -VPERMT 4431 -VPERMWZ 4432 -VPERMWZrm 4433 -VPERMWZrmk 4434 -VPERMWZrmkz 4435 -VPERMWZrr 4436 -VPERMWZrrk 4437 -VPERMWZrrkz 4438 -VPEXPANDBZ 4439 -VPEXPANDBZrm 4440 -VPEXPANDBZrmk 4441 -VPEXPANDBZrmkz 4442 -VPEXPANDBZrr 4443 -VPEXPANDBZrrk 4444 -VPEXPANDBZrrkz 4445 -VPEXPANDDZ 4446 -VPEXPANDDZrm 4447 -VPEXPANDDZrmk 4448 -VPEXPANDDZrmkz 4449 -VPEXPANDDZrr 4450 -VPEXPANDDZrrk 4451 -VPEXPANDDZrrkz 4452 -VPEXPANDQZ 4453 -VPEXPANDQZrm 4454 -VPEXPANDQZrmk 4455 -VPEXPANDQZrmkz 4456 -VPEXPANDQZrr 4457 -VPEXPANDQZrrk 4458 -VPEXPANDQZrrkz 4459 -VPEXPANDWZ 4460 -VPEXPANDWZrm 4461 -VPEXPANDWZrmk 4462 -VPEXPANDWZrmkz 4463 -VPEXPANDWZrr 4464 -VPEXPANDWZrrk 4465 -VPEXPANDWZrrkz 4466 -VPEXTRBZmri 4467 -VPEXTRBZrri 4468 -VPEXTRBmri 4469 -VPEXTRBrri 4470 -VPEXTRDZmri 4471 -VPEXTRDZrri 4472 -VPEXTRDmri 4473 -VPEXTRDrri 4474 -VPEXTRQZmri 4475 -VPEXTRQZrri 4476 -VPEXTRQmri 4477 -VPEXTRQrri 4478 -VPEXTRWZmri 4479 -VPEXTRWZrri 4480 -VPEXTRWZrri_REV 4481 -VPEXTRWmri 4482 -VPEXTRWrri 4483 -VPEXTRWrri_REV 4484 -VPGATHERDDYrm 4485 -VPGATHERDDZ 4486 -VPGATHERDDZrm 4487 -VPGATHERDDrm 4488 -VPGATHERDQYrm 4489 -VPGATHERDQZ 4490 -VPGATHERDQZrm 4491 -VPGATHERDQrm 4492 -VPGATHERQDYrm 4493 -VPGATHERQDZ 4494 -VPGATHERQDZrm 4495 -VPGATHERQDrm 4496 -VPGATHERQQYrm 4497 -VPGATHERQQZ 4498 -VPGATHERQQZrm 4499 -VPGATHERQQrm 4500 -VPHADDBDrm 4501 
-VPHADDBDrr 4502 -VPHADDBQrm 4503 -VPHADDBQrr 4504 -VPHADDBWrm 4505 -VPHADDBWrr 4506 -VPHADDDQrm 4507 -VPHADDDQrr 4508 -VPHADDDYrm 4509 -VPHADDDYrr 4510 -VPHADDDrm 4511 -VPHADDDrr 4512 -VPHADDSWYrm 4513 -VPHADDSWYrr 4514 -VPHADDSWrm 4515 -VPHADDSWrr 4516 -VPHADDUBDrm 4517 -VPHADDUBDrr 4518 -VPHADDUBQrm 4519 -VPHADDUBQrr 4520 -VPHADDUBWrm 4521 -VPHADDUBWrr 4522 -VPHADDUDQrm 4523 -VPHADDUDQrr 4524 -VPHADDUWDrm 4525 -VPHADDUWDrr 4526 -VPHADDUWQrm 4527 -VPHADDUWQrr 4528 -VPHADDWDrm 4529 -VPHADDWDrr 4530 -VPHADDWQrm 4531 -VPHADDWQrr 4532 -VPHADDWYrm 4533 -VPHADDWYrr 4534 -VPHADDWrm 4535 -VPHADDWrr 4536 -VPHMINPOSUWrm 4537 -VPHMINPOSUWrr 4538 -VPHSUBBWrm 4539 -VPHSUBBWrr 4540 -VPHSUBDQrm 4541 -VPHSUBDQrr 4542 -VPHSUBDYrm 4543 -VPHSUBDYrr 4544 -VPHSUBDrm 4545 -VPHSUBDrr 4546 -VPHSUBSWYrm 4547 -VPHSUBSWYrr 4548 -VPHSUBSWrm 4549 -VPHSUBSWrr 4550 -VPHSUBWDrm 4551 -VPHSUBWDrr 4552 -VPHSUBWYrm 4553 -VPHSUBWYrr 4554 -VPHSUBWrm 4555 -VPHSUBWrr 4556 -VPINSRBZrmi 4557 -VPINSRBZrri 4558 -VPINSRBrmi 4559 -VPINSRBrri 4560 -VPINSRDZrmi 4561 -VPINSRDZrri 4562 -VPINSRDrmi 4563 -VPINSRDrri 4564 -VPINSRQZrmi 4565 -VPINSRQZrri 4566 -VPINSRQrmi 4567 -VPINSRQrri 4568 -VPINSRWZrmi 4569 -VPINSRWZrri 4570 -VPINSRWrmi 4571 -VPINSRWrri 4572 -VPLZCNTDZ 4573 -VPLZCNTDZrm 4574 -VPLZCNTDZrmb 4575 -VPLZCNTDZrmbk 4576 -VPLZCNTDZrmbkz 4577 -VPLZCNTDZrmk 4578 -VPLZCNTDZrmkz 4579 -VPLZCNTDZrr 4580 -VPLZCNTDZrrk 4581 -VPLZCNTDZrrkz 4582 -VPLZCNTQZ 4583 -VPLZCNTQZrm 4584 -VPLZCNTQZrmb 4585 -VPLZCNTQZrmbk 4586 -VPLZCNTQZrmbkz 4587 -VPLZCNTQZrmk 4588 -VPLZCNTQZrmkz 4589 -VPLZCNTQZrr 4590 -VPLZCNTQZrrk 4591 -VPLZCNTQZrrkz 4592 -VPMACSDDrm 4593 -VPMACSDDrr 4594 -VPMACSDQHrm 4595 -VPMACSDQHrr 4596 -VPMACSDQLrm 4597 -VPMACSDQLrr 4598 -VPMACSSDDrm 4599 -VPMACSSDDrr 4600 -VPMACSSDQHrm 4601 -VPMACSSDQHrr 4602 -VPMACSSDQLrm 4603 -VPMACSSDQLrr 4604 -VPMACSSWDrm 4605 -VPMACSSWDrr 4606 -VPMACSSWWrm 4607 -VPMACSSWWrr 4608 -VPMACSWDrm 4609 -VPMACSWDrr 4610 -VPMACSWWrm 4611 -VPMACSWWrr 4612 -VPMADCSSWDrm 4613 -VPMADCSSWDrr 
4614 -VPMADCSWDrm 4615 -VPMADCSWDrr 4616 -VPMADD 4617 -VPMADDUBSWYrm 4618 -VPMADDUBSWYrr 4619 -VPMADDUBSWZ 4620 -VPMADDUBSWZrm 4621 -VPMADDUBSWZrmk 4622 -VPMADDUBSWZrmkz 4623 -VPMADDUBSWZrr 4624 -VPMADDUBSWZrrk 4625 -VPMADDUBSWZrrkz 4626 -VPMADDUBSWrm 4627 -VPMADDUBSWrr 4628 -VPMADDWDYrm 4629 -VPMADDWDYrr 4630 -VPMADDWDZ 4631 -VPMADDWDZrm 4632 -VPMADDWDZrmk 4633 -VPMADDWDZrmkz 4634 -VPMADDWDZrr 4635 -VPMADDWDZrrk 4636 -VPMADDWDZrrkz 4637 -VPMADDWDrm 4638 -VPMADDWDrr 4639 -VPMASKMOVDYmr 4640 -VPMASKMOVDYrm 4641 -VPMASKMOVDmr 4642 -VPMASKMOVDrm 4643 -VPMASKMOVQYmr 4644 -VPMASKMOVQYrm 4645 -VPMASKMOVQmr 4646 -VPMASKMOVQrm 4647 -VPMAXSBYrm 4648 -VPMAXSBYrr 4649 -VPMAXSBZ 4650 -VPMAXSBZrm 4651 -VPMAXSBZrmk 4652 -VPMAXSBZrmkz 4653 -VPMAXSBZrr 4654 -VPMAXSBZrrk 4655 -VPMAXSBZrrkz 4656 -VPMAXSBrm 4657 -VPMAXSBrr 4658 -VPMAXSDYrm 4659 -VPMAXSDYrr 4660 -VPMAXSDZ 4661 -VPMAXSDZrm 4662 -VPMAXSDZrmb 4663 -VPMAXSDZrmbk 4664 -VPMAXSDZrmbkz 4665 -VPMAXSDZrmk 4666 -VPMAXSDZrmkz 4667 -VPMAXSDZrr 4668 -VPMAXSDZrrk 4669 -VPMAXSDZrrkz 4670 -VPMAXSDrm 4671 -VPMAXSDrr 4672 -VPMAXSQZ 4673 -VPMAXSQZrm 4674 -VPMAXSQZrmb 4675 -VPMAXSQZrmbk 4676 -VPMAXSQZrmbkz 4677 -VPMAXSQZrmk 4678 -VPMAXSQZrmkz 4679 -VPMAXSQZrr 4680 -VPMAXSQZrrk 4681 -VPMAXSQZrrkz 4682 -VPMAXSWYrm 4683 -VPMAXSWYrr 4684 -VPMAXSWZ 4685 -VPMAXSWZrm 4686 -VPMAXSWZrmk 4687 -VPMAXSWZrmkz 4688 -VPMAXSWZrr 4689 -VPMAXSWZrrk 4690 -VPMAXSWZrrkz 4691 -VPMAXSWrm 4692 -VPMAXSWrr 4693 -VPMAXUBYrm 4694 -VPMAXUBYrr 4695 -VPMAXUBZ 4696 -VPMAXUBZrm 4697 -VPMAXUBZrmk 4698 -VPMAXUBZrmkz 4699 -VPMAXUBZrr 4700 -VPMAXUBZrrk 4701 -VPMAXUBZrrkz 4702 -VPMAXUBrm 4703 -VPMAXUBrr 4704 -VPMAXUDYrm 4705 -VPMAXUDYrr 4706 -VPMAXUDZ 4707 -VPMAXUDZrm 4708 -VPMAXUDZrmb 4709 -VPMAXUDZrmbk 4710 -VPMAXUDZrmbkz 4711 -VPMAXUDZrmk 4712 -VPMAXUDZrmkz 4713 -VPMAXUDZrr 4714 -VPMAXUDZrrk 4715 -VPMAXUDZrrkz 4716 -VPMAXUDrm 4717 -VPMAXUDrr 4718 -VPMAXUQZ 4719 -VPMAXUQZrm 4720 -VPMAXUQZrmb 4721 -VPMAXUQZrmbk 4722 -VPMAXUQZrmbkz 4723 -VPMAXUQZrmk 4724 -VPMAXUQZrmkz 4725 
-VPMAXUQZrr 4726 -VPMAXUQZrrk 4727 -VPMAXUQZrrkz 4728 -VPMAXUWYrm 4729 -VPMAXUWYrr 4730 -VPMAXUWZ 4731 -VPMAXUWZrm 4732 -VPMAXUWZrmk 4733 -VPMAXUWZrmkz 4734 -VPMAXUWZrr 4735 -VPMAXUWZrrk 4736 -VPMAXUWZrrkz 4737 -VPMAXUWrm 4738 -VPMAXUWrr 4739 -VPMINSBYrm 4740 -VPMINSBYrr 4741 -VPMINSBZ 4742 -VPMINSBZrm 4743 -VPMINSBZrmk 4744 -VPMINSBZrmkz 4745 -VPMINSBZrr 4746 -VPMINSBZrrk 4747 -VPMINSBZrrkz 4748 -VPMINSBrm 4749 -VPMINSBrr 4750 -VPMINSDYrm 4751 -VPMINSDYrr 4752 -VPMINSDZ 4753 -VPMINSDZrm 4754 -VPMINSDZrmb 4755 -VPMINSDZrmbk 4756 -VPMINSDZrmbkz 4757 -VPMINSDZrmk 4758 -VPMINSDZrmkz 4759 -VPMINSDZrr 4760 -VPMINSDZrrk 4761 -VPMINSDZrrkz 4762 -VPMINSDrm 4763 -VPMINSDrr 4764 -VPMINSQZ 4765 -VPMINSQZrm 4766 -VPMINSQZrmb 4767 -VPMINSQZrmbk 4768 -VPMINSQZrmbkz 4769 -VPMINSQZrmk 4770 -VPMINSQZrmkz 4771 -VPMINSQZrr 4772 -VPMINSQZrrk 4773 -VPMINSQZrrkz 4774 -VPMINSWYrm 4775 -VPMINSWYrr 4776 -VPMINSWZ 4777 -VPMINSWZrm 4778 -VPMINSWZrmk 4779 -VPMINSWZrmkz 4780 -VPMINSWZrr 4781 -VPMINSWZrrk 4782 -VPMINSWZrrkz 4783 -VPMINSWrm 4784 -VPMINSWrr 4785 -VPMINUBYrm 4786 -VPMINUBYrr 4787 -VPMINUBZ 4788 -VPMINUBZrm 4789 -VPMINUBZrmk 4790 -VPMINUBZrmkz 4791 -VPMINUBZrr 4792 -VPMINUBZrrk 4793 -VPMINUBZrrkz 4794 -VPMINUBrm 4795 -VPMINUBrr 4796 -VPMINUDYrm 4797 -VPMINUDYrr 4798 -VPMINUDZ 4799 -VPMINUDZrm 4800 -VPMINUDZrmb 4801 -VPMINUDZrmbk 4802 -VPMINUDZrmbkz 4803 -VPMINUDZrmk 4804 -VPMINUDZrmkz 4805 -VPMINUDZrr 4806 -VPMINUDZrrk 4807 -VPMINUDZrrkz 4808 -VPMINUDrm 4809 -VPMINUDrr 4810 -VPMINUQZ 4811 -VPMINUQZrm 4812 -VPMINUQZrmb 4813 -VPMINUQZrmbk 4814 -VPMINUQZrmbkz 4815 -VPMINUQZrmk 4816 -VPMINUQZrmkz 4817 -VPMINUQZrr 4818 -VPMINUQZrrk 4819 -VPMINUQZrrkz 4820 -VPMINUWYrm 4821 -VPMINUWYrr 4822 -VPMINUWZ 4823 -VPMINUWZrm 4824 -VPMINUWZrmk 4825 -VPMINUWZrmkz 4826 -VPMINUWZrr 4827 -VPMINUWZrrk 4828 -VPMINUWZrrkz 4829 -VPMINUWrm 4830 -VPMINUWrr 4831 -VPMOVB 4832 -VPMOVD 4833 -VPMOVDBZ 4834 -VPMOVDBZmr 4835 -VPMOVDBZmrk 4836 -VPMOVDBZrr 4837 -VPMOVDBZrrk 4838 -VPMOVDBZrrkz 4839 -VPMOVDWZ 4840 
-VPMOVDWZmr 4841 -VPMOVDWZmrk 4842 -VPMOVDWZrr 4843 -VPMOVDWZrrk 4844 -VPMOVDWZrrkz 4845 -VPMOVM 4846 -VPMOVMSKBYrr 4847 -VPMOVMSKBrr 4848 -VPMOVQ 4849 -VPMOVQBZ 4850 -VPMOVQBZmr 4851 -VPMOVQBZmrk 4852 -VPMOVQBZrr 4853 -VPMOVQBZrrk 4854 -VPMOVQBZrrkz 4855 -VPMOVQDZ 4856 -VPMOVQDZmr 4857 -VPMOVQDZmrk 4858 -VPMOVQDZrr 4859 -VPMOVQDZrrk 4860 -VPMOVQDZrrkz 4861 -VPMOVQWZ 4862 -VPMOVQWZmr 4863 -VPMOVQWZmrk 4864 -VPMOVQWZrr 4865 -VPMOVQWZrrk 4866 -VPMOVQWZrrkz 4867 -VPMOVSDBZ 4868 -VPMOVSDBZmr 4869 -VPMOVSDBZmrk 4870 -VPMOVSDBZrr 4871 -VPMOVSDBZrrk 4872 -VPMOVSDBZrrkz 4873 -VPMOVSDWZ 4874 -VPMOVSDWZmr 4875 -VPMOVSDWZmrk 4876 -VPMOVSDWZrr 4877 -VPMOVSDWZrrk 4878 -VPMOVSDWZrrkz 4879 -VPMOVSQBZ 4880 -VPMOVSQBZmr 4881 -VPMOVSQBZmrk 4882 -VPMOVSQBZrr 4883 -VPMOVSQBZrrk 4884 -VPMOVSQBZrrkz 4885 -VPMOVSQDZ 4886 -VPMOVSQDZmr 4887 -VPMOVSQDZmrk 4888 -VPMOVSQDZrr 4889 -VPMOVSQDZrrk 4890 -VPMOVSQDZrrkz 4891 -VPMOVSQWZ 4892 -VPMOVSQWZmr 4893 -VPMOVSQWZmrk 4894 -VPMOVSQWZrr 4895 -VPMOVSQWZrrk 4896 -VPMOVSQWZrrkz 4897 -VPMOVSWBZ 4898 -VPMOVSWBZmr 4899 -VPMOVSWBZmrk 4900 -VPMOVSWBZrr 4901 -VPMOVSWBZrrk 4902 -VPMOVSWBZrrkz 4903 -VPMOVSXBDYrm 4904 -VPMOVSXBDYrr 4905 -VPMOVSXBDZ 4906 -VPMOVSXBDZrm 4907 -VPMOVSXBDZrmk 4908 -VPMOVSXBDZrmkz 4909 -VPMOVSXBDZrr 4910 -VPMOVSXBDZrrk 4911 -VPMOVSXBDZrrkz 4912 -VPMOVSXBDrm 4913 -VPMOVSXBDrr 4914 -VPMOVSXBQYrm 4915 -VPMOVSXBQYrr 4916 -VPMOVSXBQZ 4917 -VPMOVSXBQZrm 4918 -VPMOVSXBQZrmk 4919 -VPMOVSXBQZrmkz 4920 -VPMOVSXBQZrr 4921 -VPMOVSXBQZrrk 4922 -VPMOVSXBQZrrkz 4923 -VPMOVSXBQrm 4924 -VPMOVSXBQrr 4925 -VPMOVSXBWYrm 4926 -VPMOVSXBWYrr 4927 -VPMOVSXBWZ 4928 -VPMOVSXBWZrm 4929 -VPMOVSXBWZrmk 4930 -VPMOVSXBWZrmkz 4931 -VPMOVSXBWZrr 4932 -VPMOVSXBWZrrk 4933 -VPMOVSXBWZrrkz 4934 -VPMOVSXBWrm 4935 -VPMOVSXBWrr 4936 -VPMOVSXDQYrm 4937 -VPMOVSXDQYrr 4938 -VPMOVSXDQZ 4939 -VPMOVSXDQZrm 4940 -VPMOVSXDQZrmk 4941 -VPMOVSXDQZrmkz 4942 -VPMOVSXDQZrr 4943 -VPMOVSXDQZrrk 4944 -VPMOVSXDQZrrkz 4945 -VPMOVSXDQrm 4946 -VPMOVSXDQrr 4947 -VPMOVSXWDYrm 4948 
-VPMOVSXWDYrr 4949 -VPMOVSXWDZ 4950 -VPMOVSXWDZrm 4951 -VPMOVSXWDZrmk 4952 -VPMOVSXWDZrmkz 4953 -VPMOVSXWDZrr 4954 -VPMOVSXWDZrrk 4955 -VPMOVSXWDZrrkz 4956 -VPMOVSXWDrm 4957 -VPMOVSXWDrr 4958 -VPMOVSXWQYrm 4959 -VPMOVSXWQYrr 4960 -VPMOVSXWQZ 4961 -VPMOVSXWQZrm 4962 -VPMOVSXWQZrmk 4963 -VPMOVSXWQZrmkz 4964 -VPMOVSXWQZrr 4965 -VPMOVSXWQZrrk 4966 -VPMOVSXWQZrrkz 4967 -VPMOVSXWQrm 4968 -VPMOVSXWQrr 4969 -VPMOVUSDBZ 4970 -VPMOVUSDBZmr 4971 -VPMOVUSDBZmrk 4972 -VPMOVUSDBZrr 4973 -VPMOVUSDBZrrk 4974 -VPMOVUSDBZrrkz 4975 -VPMOVUSDWZ 4976 -VPMOVUSDWZmr 4977 -VPMOVUSDWZmrk 4978 -VPMOVUSDWZrr 4979 -VPMOVUSDWZrrk 4980 -VPMOVUSDWZrrkz 4981 -VPMOVUSQBZ 4982 -VPMOVUSQBZmr 4983 -VPMOVUSQBZmrk 4984 -VPMOVUSQBZrr 4985 -VPMOVUSQBZrrk 4986 -VPMOVUSQBZrrkz 4987 -VPMOVUSQDZ 4988 -VPMOVUSQDZmr 4989 -VPMOVUSQDZmrk 4990 -VPMOVUSQDZrr 4991 -VPMOVUSQDZrrk 4992 -VPMOVUSQDZrrkz 4993 -VPMOVUSQWZ 4994 -VPMOVUSQWZmr 4995 -VPMOVUSQWZmrk 4996 -VPMOVUSQWZrr 4997 -VPMOVUSQWZrrk 4998 -VPMOVUSQWZrrkz 4999 -VPMOVUSWBZ 5000 -VPMOVUSWBZmr 5001 -VPMOVUSWBZmrk 5002 -VPMOVUSWBZrr 5003 -VPMOVUSWBZrrk 5004 -VPMOVUSWBZrrkz 5005 -VPMOVW 5006 -VPMOVWBZ 5007 -VPMOVWBZmr 5008 -VPMOVWBZmrk 5009 -VPMOVWBZrr 5010 -VPMOVWBZrrk 5011 -VPMOVWBZrrkz 5012 -VPMOVZXBDYrm 5013 -VPMOVZXBDYrr 5014 -VPMOVZXBDZ 5015 -VPMOVZXBDZrm 5016 -VPMOVZXBDZrmk 5017 -VPMOVZXBDZrmkz 5018 -VPMOVZXBDZrr 5019 -VPMOVZXBDZrrk 5020 -VPMOVZXBDZrrkz 5021 -VPMOVZXBDrm 5022 -VPMOVZXBDrr 5023 -VPMOVZXBQYrm 5024 -VPMOVZXBQYrr 5025 -VPMOVZXBQZ 5026 -VPMOVZXBQZrm 5027 -VPMOVZXBQZrmk 5028 -VPMOVZXBQZrmkz 5029 -VPMOVZXBQZrr 5030 -VPMOVZXBQZrrk 5031 -VPMOVZXBQZrrkz 5032 -VPMOVZXBQrm 5033 -VPMOVZXBQrr 5034 -VPMOVZXBWYrm 5035 -VPMOVZXBWYrr 5036 -VPMOVZXBWZ 5037 -VPMOVZXBWZrm 5038 -VPMOVZXBWZrmk 5039 -VPMOVZXBWZrmkz 5040 -VPMOVZXBWZrr 5041 -VPMOVZXBWZrrk 5042 -VPMOVZXBWZrrkz 5043 -VPMOVZXBWrm 5044 -VPMOVZXBWrr 5045 -VPMOVZXDQYrm 5046 -VPMOVZXDQYrr 5047 -VPMOVZXDQZ 5048 -VPMOVZXDQZrm 5049 -VPMOVZXDQZrmk 5050 -VPMOVZXDQZrmkz 5051 -VPMOVZXDQZrr 5052 -VPMOVZXDQZrrk 
5053 -VPMOVZXDQZrrkz 5054 -VPMOVZXDQrm 5055 -VPMOVZXDQrr 5056 -VPMOVZXWDYrm 5057 -VPMOVZXWDYrr 5058 -VPMOVZXWDZ 5059 -VPMOVZXWDZrm 5060 -VPMOVZXWDZrmk 5061 -VPMOVZXWDZrmkz 5062 -VPMOVZXWDZrr 5063 -VPMOVZXWDZrrk 5064 -VPMOVZXWDZrrkz 5065 -VPMOVZXWDrm 5066 -VPMOVZXWDrr 5067 -VPMOVZXWQYrm 5068 -VPMOVZXWQYrr 5069 -VPMOVZXWQZ 5070 -VPMOVZXWQZrm 5071 -VPMOVZXWQZrmk 5072 -VPMOVZXWQZrmkz 5073 -VPMOVZXWQZrr 5074 -VPMOVZXWQZrrk 5075 -VPMOVZXWQZrrkz 5076 -VPMOVZXWQrm 5077 -VPMOVZXWQrr 5078 -VPMULDQYrm 5079 -VPMULDQYrr 5080 -VPMULDQZ 5081 -VPMULDQZrm 5082 -VPMULDQZrmb 5083 -VPMULDQZrmbk 5084 -VPMULDQZrmbkz 5085 -VPMULDQZrmk 5086 -VPMULDQZrmkz 5087 -VPMULDQZrr 5088 -VPMULDQZrrk 5089 -VPMULDQZrrkz 5090 -VPMULDQrm 5091 -VPMULDQrr 5092 -VPMULHRSWYrm 5093 -VPMULHRSWYrr 5094 -VPMULHRSWZ 5095 -VPMULHRSWZrm 5096 -VPMULHRSWZrmk 5097 -VPMULHRSWZrmkz 5098 -VPMULHRSWZrr 5099 -VPMULHRSWZrrk 5100 -VPMULHRSWZrrkz 5101 -VPMULHRSWrm 5102 -VPMULHRSWrr 5103 -VPMULHUWYrm 5104 -VPMULHUWYrr 5105 -VPMULHUWZ 5106 -VPMULHUWZrm 5107 -VPMULHUWZrmk 5108 -VPMULHUWZrmkz 5109 -VPMULHUWZrr 5110 -VPMULHUWZrrk 5111 -VPMULHUWZrrkz 5112 -VPMULHUWrm 5113 -VPMULHUWrr 5114 -VPMULHWYrm 5115 -VPMULHWYrr 5116 -VPMULHWZ 5117 -VPMULHWZrm 5118 -VPMULHWZrmk 5119 -VPMULHWZrmkz 5120 -VPMULHWZrr 5121 -VPMULHWZrrk 5122 -VPMULHWZrrkz 5123 -VPMULHWrm 5124 -VPMULHWrr 5125 -VPMULLDYrm 5126 -VPMULLDYrr 5127 -VPMULLDZ 5128 -VPMULLDZrm 5129 -VPMULLDZrmb 5130 -VPMULLDZrmbk 5131 -VPMULLDZrmbkz 5132 -VPMULLDZrmk 5133 -VPMULLDZrmkz 5134 -VPMULLDZrr 5135 -VPMULLDZrrk 5136 -VPMULLDZrrkz 5137 -VPMULLDrm 5138 -VPMULLDrr 5139 -VPMULLQZ 5140 -VPMULLQZrm 5141 -VPMULLQZrmb 5142 -VPMULLQZrmbk 5143 -VPMULLQZrmbkz 5144 -VPMULLQZrmk 5145 -VPMULLQZrmkz 5146 -VPMULLQZrr 5147 -VPMULLQZrrk 5148 -VPMULLQZrrkz 5149 -VPMULLWYrm 5150 -VPMULLWYrr 5151 -VPMULLWZ 5152 -VPMULLWZrm 5153 -VPMULLWZrmk 5154 -VPMULLWZrmkz 5155 -VPMULLWZrr 5156 -VPMULLWZrrk 5157 -VPMULLWZrrkz 5158 -VPMULLWrm 5159 -VPMULLWrr 5160 -VPMULTISHIFTQBZ 5161 -VPMULTISHIFTQBZrm 5162 
-VPMULTISHIFTQBZrmb 5163 -VPMULTISHIFTQBZrmbk 5164 -VPMULTISHIFTQBZrmbkz 5165 -VPMULTISHIFTQBZrmk 5166 -VPMULTISHIFTQBZrmkz 5167 -VPMULTISHIFTQBZrr 5168 -VPMULTISHIFTQBZrrk 5169 -VPMULTISHIFTQBZrrkz 5170 -VPMULUDQYrm 5171 -VPMULUDQYrr 5172 -VPMULUDQZ 5173 -VPMULUDQZrm 5174 -VPMULUDQZrmb 5175 -VPMULUDQZrmbk 5176 -VPMULUDQZrmbkz 5177 -VPMULUDQZrmk 5178 -VPMULUDQZrmkz 5179 -VPMULUDQZrr 5180 -VPMULUDQZrrk 5181 -VPMULUDQZrrkz 5182 -VPMULUDQrm 5183 -VPMULUDQrr 5184 -VPOPCNTBZ 5185 -VPOPCNTBZrm 5186 -VPOPCNTBZrmk 5187 -VPOPCNTBZrmkz 5188 -VPOPCNTBZrr 5189 -VPOPCNTBZrrk 5190 -VPOPCNTBZrrkz 5191 -VPOPCNTDZ 5192 -VPOPCNTDZrm 5193 -VPOPCNTDZrmb 5194 -VPOPCNTDZrmbk 5195 -VPOPCNTDZrmbkz 5196 -VPOPCNTDZrmk 5197 -VPOPCNTDZrmkz 5198 -VPOPCNTDZrr 5199 -VPOPCNTDZrrk 5200 -VPOPCNTDZrrkz 5201 -VPOPCNTQZ 5202 -VPOPCNTQZrm 5203 -VPOPCNTQZrmb 5204 -VPOPCNTQZrmbk 5205 -VPOPCNTQZrmbkz 5206 -VPOPCNTQZrmk 5207 -VPOPCNTQZrmkz 5208 -VPOPCNTQZrr 5209 -VPOPCNTQZrrk 5210 -VPOPCNTQZrrkz 5211 -VPOPCNTWZ 5212 -VPOPCNTWZrm 5213 -VPOPCNTWZrmk 5214 -VPOPCNTWZrmkz 5215 -VPOPCNTWZrr 5216 -VPOPCNTWZrrk 5217 -VPOPCNTWZrrkz 5218 -VPORDZ 5219 -VPORDZrm 5220 -VPORDZrmb 5221 -VPORDZrmbk 5222 -VPORDZrmbkz 5223 -VPORDZrmk 5224 -VPORDZrmkz 5225 -VPORDZrr 5226 -VPORDZrrk 5227 -VPORDZrrkz 5228 -VPORQZ 5229 -VPORQZrm 5230 -VPORQZrmb 5231 -VPORQZrmbk 5232 -VPORQZrmbkz 5233 -VPORQZrmk 5234 -VPORQZrmkz 5235 -VPORQZrr 5236 -VPORQZrrk 5237 -VPORQZrrkz 5238 -VPORYrm 5239 -VPORYrr 5240 -VPORrm 5241 -VPORrr 5242 -VPPERMrmr 5243 -VPPERMrrm 5244 -VPPERMrrr 5245 -VPPERMrrr_REV 5246 -VPROLDZ 5247 -VPROLDZmbi 5248 -VPROLDZmbik 5249 -VPROLDZmbikz 5250 -VPROLDZmi 5251 -VPROLDZmik 5252 -VPROLDZmikz 5253 -VPROLDZri 5254 -VPROLDZrik 5255 -VPROLDZrikz 5256 -VPROLQZ 5257 -VPROLQZmbi 5258 -VPROLQZmbik 5259 -VPROLQZmbikz 5260 -VPROLQZmi 5261 -VPROLQZmik 5262 -VPROLQZmikz 5263 -VPROLQZri 5264 -VPROLQZrik 5265 -VPROLQZrikz 5266 -VPROLVDZ 5267 -VPROLVDZrm 5268 -VPROLVDZrmb 5269 -VPROLVDZrmbk 5270 -VPROLVDZrmbkz 5271 -VPROLVDZrmk 5272 
-VPROLVDZrmkz 5273 -VPROLVDZrr 5274 -VPROLVDZrrk 5275 -VPROLVDZrrkz 5276 -VPROLVQZ 5277 -VPROLVQZrm 5278 -VPROLVQZrmb 5279 -VPROLVQZrmbk 5280 -VPROLVQZrmbkz 5281 -VPROLVQZrmk 5282 -VPROLVQZrmkz 5283 -VPROLVQZrr 5284 -VPROLVQZrrk 5285 -VPROLVQZrrkz 5286 -VPRORDZ 5287 -VPRORDZmbi 5288 -VPRORDZmbik 5289 -VPRORDZmbikz 5290 -VPRORDZmi 5291 -VPRORDZmik 5292 -VPRORDZmikz 5293 -VPRORDZri 5294 -VPRORDZrik 5295 -VPRORDZrikz 5296 -VPRORQZ 5297 -VPRORQZmbi 5298 -VPRORQZmbik 5299 -VPRORQZmbikz 5300 -VPRORQZmi 5301 -VPRORQZmik 5302 -VPRORQZmikz 5303 -VPRORQZri 5304 -VPRORQZrik 5305 -VPRORQZrikz 5306 -VPRORVDZ 5307 -VPRORVDZrm 5308 -VPRORVDZrmb 5309 -VPRORVDZrmbk 5310 -VPRORVDZrmbkz 5311 -VPRORVDZrmk 5312 -VPRORVDZrmkz 5313 -VPRORVDZrr 5314 -VPRORVDZrrk 5315 -VPRORVDZrrkz 5316 -VPRORVQZ 5317 -VPRORVQZrm 5318 -VPRORVQZrmb 5319 -VPRORVQZrmbk 5320 -VPRORVQZrmbkz 5321 -VPRORVQZrmk 5322 -VPRORVQZrmkz 5323 -VPRORVQZrr 5324 -VPRORVQZrrk 5325 -VPRORVQZrrkz 5326 -VPROTBmi 5327 -VPROTBmr 5328 -VPROTBri 5329 -VPROTBrm 5330 -VPROTBrr 5331 -VPROTBrr_REV 5332 -VPROTDmi 5333 -VPROTDmr 5334 -VPROTDri 5335 -VPROTDrm 5336 -VPROTDrr 5337 -VPROTDrr_REV 5338 -VPROTQmi 5339 -VPROTQmr 5340 -VPROTQri 5341 -VPROTQrm 5342 -VPROTQrr 5343 -VPROTQrr_REV 5344 -VPROTWmi 5345 -VPROTWmr 5346 -VPROTWri 5347 -VPROTWrm 5348 -VPROTWrr 5349 -VPROTWrr_REV 5350 -VPSADBWYrm 5351 -VPSADBWYrr 5352 -VPSADBWZ 5353 -VPSADBWZrm 5354 -VPSADBWZrr 5355 -VPSADBWrm 5356 -VPSADBWrr 5357 -VPSCATTERDDZ 5358 -VPSCATTERDDZmr 5359 -VPSCATTERDQZ 5360 -VPSCATTERDQZmr 5361 -VPSCATTERQDZ 5362 -VPSCATTERQDZmr 5363 -VPSCATTERQQZ 5364 -VPSCATTERQQZmr 5365 -VPSHABmr 5366 -VPSHABrm 5367 -VPSHABrr 5368 -VPSHABrr_REV 5369 -VPSHADmr 5370 -VPSHADrm 5371 -VPSHADrr 5372 -VPSHADrr_REV 5373 -VPSHAQmr 5374 -VPSHAQrm 5375 -VPSHAQrr 5376 -VPSHAQrr_REV 5377 -VPSHAWmr 5378 -VPSHAWrm 5379 -VPSHAWrr 5380 -VPSHAWrr_REV 5381 -VPSHLBmr 5382 -VPSHLBrm 5383 -VPSHLBrr 5384 -VPSHLBrr_REV 5385 -VPSHLDDZ 5386 -VPSHLDDZrmbi 5387 -VPSHLDDZrmbik 5388 -VPSHLDDZrmbikz 5389 
-VPSHLDDZrmi 5390 -VPSHLDDZrmik 5391 -VPSHLDDZrmikz 5392 -VPSHLDDZrri 5393 -VPSHLDDZrrik 5394 -VPSHLDDZrrikz 5395 -VPSHLDQZ 5396 -VPSHLDQZrmbi 5397 -VPSHLDQZrmbik 5398 -VPSHLDQZrmbikz 5399 -VPSHLDQZrmi 5400 -VPSHLDQZrmik 5401 -VPSHLDQZrmikz 5402 -VPSHLDQZrri 5403 -VPSHLDQZrrik 5404 -VPSHLDQZrrikz 5405 -VPSHLDVDZ 5406 -VPSHLDVDZm 5407 -VPSHLDVDZmb 5408 -VPSHLDVDZmbk 5409 -VPSHLDVDZmbkz 5410 -VPSHLDVDZmk 5411 -VPSHLDVDZmkz 5412 -VPSHLDVDZr 5413 -VPSHLDVDZrk 5414 -VPSHLDVDZrkz 5415 -VPSHLDVQZ 5416 -VPSHLDVQZm 5417 -VPSHLDVQZmb 5418 -VPSHLDVQZmbk 5419 -VPSHLDVQZmbkz 5420 -VPSHLDVQZmk 5421 -VPSHLDVQZmkz 5422 -VPSHLDVQZr 5423 -VPSHLDVQZrk 5424 -VPSHLDVQZrkz 5425 -VPSHLDVWZ 5426 -VPSHLDVWZm 5427 -VPSHLDVWZmk 5428 -VPSHLDVWZmkz 5429 -VPSHLDVWZr 5430 -VPSHLDVWZrk 5431 -VPSHLDVWZrkz 5432 -VPSHLDWZ 5433 -VPSHLDWZrmi 5434 -VPSHLDWZrmik 5435 -VPSHLDWZrmikz 5436 -VPSHLDWZrri 5437 -VPSHLDWZrrik 5438 -VPSHLDWZrrikz 5439 -VPSHLDmr 5440 -VPSHLDrm 5441 -VPSHLDrr 5442 -VPSHLDrr_REV 5443 -VPSHLQmr 5444 -VPSHLQrm 5445 -VPSHLQrr 5446 -VPSHLQrr_REV 5447 -VPSHLWmr 5448 -VPSHLWrm 5449 -VPSHLWrr 5450 -VPSHLWrr_REV 5451 -VPSHRDDZ 5452 -VPSHRDDZrmbi 5453 -VPSHRDDZrmbik 5454 -VPSHRDDZrmbikz 5455 -VPSHRDDZrmi 5456 -VPSHRDDZrmik 5457 -VPSHRDDZrmikz 5458 -VPSHRDDZrri 5459 -VPSHRDDZrrik 5460 -VPSHRDDZrrikz 5461 -VPSHRDQZ 5462 -VPSHRDQZrmbi 5463 -VPSHRDQZrmbik 5464 -VPSHRDQZrmbikz 5465 -VPSHRDQZrmi 5466 -VPSHRDQZrmik 5467 -VPSHRDQZrmikz 5468 -VPSHRDQZrri 5469 -VPSHRDQZrrik 5470 -VPSHRDQZrrikz 5471 -VPSHRDVDZ 5472 -VPSHRDVDZm 5473 -VPSHRDVDZmb 5474 -VPSHRDVDZmbk 5475 -VPSHRDVDZmbkz 5476 -VPSHRDVDZmk 5477 -VPSHRDVDZmkz 5478 -VPSHRDVDZr 5479 -VPSHRDVDZrk 5480 -VPSHRDVDZrkz 5481 -VPSHRDVQZ 5482 -VPSHRDVQZm 5483 -VPSHRDVQZmb 5484 -VPSHRDVQZmbk 5485 -VPSHRDVQZmbkz 5486 -VPSHRDVQZmk 5487 -VPSHRDVQZmkz 5488 -VPSHRDVQZr 5489 -VPSHRDVQZrk 5490 -VPSHRDVQZrkz 5491 -VPSHRDVWZ 5492 -VPSHRDVWZm 5493 -VPSHRDVWZmk 5494 -VPSHRDVWZmkz 5495 -VPSHRDVWZr 5496 -VPSHRDVWZrk 5497 -VPSHRDVWZrkz 5498 -VPSHRDWZ 5499 
-VPSHRDWZrmi 5500 -VPSHRDWZrmik 5501 -VPSHRDWZrmikz 5502 -VPSHRDWZrri 5503 -VPSHRDWZrrik 5504 -VPSHRDWZrrikz 5505 -VPSHUFBITQMBZ 5506 -VPSHUFBITQMBZrm 5507 -VPSHUFBITQMBZrmk 5508 -VPSHUFBITQMBZrr 5509 -VPSHUFBITQMBZrrk 5510 -VPSHUFBYrm 5511 -VPSHUFBYrr 5512 -VPSHUFBZ 5513 -VPSHUFBZrm 5514 -VPSHUFBZrmk 5515 -VPSHUFBZrmkz 5516 -VPSHUFBZrr 5517 -VPSHUFBZrrk 5518 -VPSHUFBZrrkz 5519 -VPSHUFBrm 5520 -VPSHUFBrr 5521 -VPSHUFDYmi 5522 -VPSHUFDYri 5523 -VPSHUFDZ 5524 -VPSHUFDZmbi 5525 -VPSHUFDZmbik 5526 -VPSHUFDZmbikz 5527 -VPSHUFDZmi 5528 -VPSHUFDZmik 5529 -VPSHUFDZmikz 5530 -VPSHUFDZri 5531 -VPSHUFDZrik 5532 -VPSHUFDZrikz 5533 -VPSHUFDmi 5534 -VPSHUFDri 5535 -VPSHUFHWYmi 5536 -VPSHUFHWYri 5537 -VPSHUFHWZ 5538 -VPSHUFHWZmi 5539 -VPSHUFHWZmik 5540 -VPSHUFHWZmikz 5541 -VPSHUFHWZri 5542 -VPSHUFHWZrik 5543 -VPSHUFHWZrikz 5544 -VPSHUFHWmi 5545 -VPSHUFHWri 5546 -VPSHUFLWYmi 5547 -VPSHUFLWYri 5548 -VPSHUFLWZ 5549 -VPSHUFLWZmi 5550 -VPSHUFLWZmik 5551 -VPSHUFLWZmikz 5552 -VPSHUFLWZri 5553 -VPSHUFLWZrik 5554 -VPSHUFLWZrikz 5555 -VPSHUFLWmi 5556 -VPSHUFLWri 5557 -VPSIGNBYrm 5558 -VPSIGNBYrr 5559 -VPSIGNBrm 5560 -VPSIGNBrr 5561 -VPSIGNDYrm 5562 -VPSIGNDYrr 5563 -VPSIGNDrm 5564 -VPSIGNDrr 5565 -VPSIGNWYrm 5566 -VPSIGNWYrr 5567 -VPSIGNWrm 5568 -VPSIGNWrr 5569 -VPSLLDQYri 5570 -VPSLLDQZ 5571 -VPSLLDQZmi 5572 -VPSLLDQZri 5573 -VPSLLDQri 5574 -VPSLLDYri 5575 -VPSLLDYrm 5576 -VPSLLDYrr 5577 -VPSLLDZ 5578 -VPSLLDZmbi 5579 -VPSLLDZmbik 5580 -VPSLLDZmbikz 5581 -VPSLLDZmi 5582 -VPSLLDZmik 5583 -VPSLLDZmikz 5584 -VPSLLDZri 5585 -VPSLLDZrik 5586 -VPSLLDZrikz 5587 -VPSLLDZrm 5588 -VPSLLDZrmk 5589 -VPSLLDZrmkz 5590 -VPSLLDZrr 5591 -VPSLLDZrrk 5592 -VPSLLDZrrkz 5593 -VPSLLDri 5594 -VPSLLDrm 5595 -VPSLLDrr 5596 -VPSLLQYri 5597 -VPSLLQYrm 5598 -VPSLLQYrr 5599 -VPSLLQZ 5600 -VPSLLQZmbi 5601 -VPSLLQZmbik 5602 -VPSLLQZmbikz 5603 -VPSLLQZmi 5604 -VPSLLQZmik 5605 -VPSLLQZmikz 5606 -VPSLLQZri 5607 -VPSLLQZrik 5608 -VPSLLQZrikz 5609 -VPSLLQZrm 5610 -VPSLLQZrmk 5611 -VPSLLQZrmkz 5612 -VPSLLQZrr 5613 
-VPSLLQZrrk 5614 -VPSLLQZrrkz 5615 -VPSLLQri 5616 -VPSLLQrm 5617 -VPSLLQrr 5618 -VPSLLVDYrm 5619 -VPSLLVDYrr 5620 -VPSLLVDZ 5621 -VPSLLVDZrm 5622 -VPSLLVDZrmb 5623 -VPSLLVDZrmbk 5624 -VPSLLVDZrmbkz 5625 -VPSLLVDZrmk 5626 -VPSLLVDZrmkz 5627 -VPSLLVDZrr 5628 -VPSLLVDZrrk 5629 -VPSLLVDZrrkz 5630 -VPSLLVDrm 5631 -VPSLLVDrr 5632 -VPSLLVQYrm 5633 -VPSLLVQYrr 5634 -VPSLLVQZ 5635 -VPSLLVQZrm 5636 -VPSLLVQZrmb 5637 -VPSLLVQZrmbk 5638 -VPSLLVQZrmbkz 5639 -VPSLLVQZrmk 5640 -VPSLLVQZrmkz 5641 -VPSLLVQZrr 5642 -VPSLLVQZrrk 5643 -VPSLLVQZrrkz 5644 -VPSLLVQrm 5645 -VPSLLVQrr 5646 -VPSLLVWZ 5647 -VPSLLVWZrm 5648 -VPSLLVWZrmk 5649 -VPSLLVWZrmkz 5650 -VPSLLVWZrr 5651 -VPSLLVWZrrk 5652 -VPSLLVWZrrkz 5653 -VPSLLWYri 5654 -VPSLLWYrm 5655 -VPSLLWYrr 5656 -VPSLLWZ 5657 -VPSLLWZmi 5658 -VPSLLWZmik 5659 -VPSLLWZmikz 5660 -VPSLLWZri 5661 -VPSLLWZrik 5662 -VPSLLWZrikz 5663 -VPSLLWZrm 5664 -VPSLLWZrmk 5665 -VPSLLWZrmkz 5666 -VPSLLWZrr 5667 -VPSLLWZrrk 5668 -VPSLLWZrrkz 5669 -VPSLLWri 5670 -VPSLLWrm 5671 -VPSLLWrr 5672 -VPSRADYri 5673 -VPSRADYrm 5674 -VPSRADYrr 5675 -VPSRADZ 5676 -VPSRADZmbi 5677 -VPSRADZmbik 5678 -VPSRADZmbikz 5679 -VPSRADZmi 5680 -VPSRADZmik 5681 -VPSRADZmikz 5682 -VPSRADZri 5683 -VPSRADZrik 5684 -VPSRADZrikz 5685 -VPSRADZrm 5686 -VPSRADZrmk 5687 -VPSRADZrmkz 5688 -VPSRADZrr 5689 -VPSRADZrrk 5690 -VPSRADZrrkz 5691 -VPSRADri 5692 -VPSRADrm 5693 -VPSRADrr 5694 -VPSRAQZ 5695 -VPSRAQZmbi 5696 -VPSRAQZmbik 5697 -VPSRAQZmbikz 5698 -VPSRAQZmi 5699 -VPSRAQZmik 5700 -VPSRAQZmikz 5701 -VPSRAQZri 5702 -VPSRAQZrik 5703 -VPSRAQZrikz 5704 -VPSRAQZrm 5705 -VPSRAQZrmk 5706 -VPSRAQZrmkz 5707 -VPSRAQZrr 5708 -VPSRAQZrrk 5709 -VPSRAQZrrkz 5710 -VPSRAVDYrm 5711 -VPSRAVDYrr 5712 -VPSRAVDZ 5713 -VPSRAVDZrm 5714 -VPSRAVDZrmb 5715 -VPSRAVDZrmbk 5716 -VPSRAVDZrmbkz 5717 -VPSRAVDZrmk 5718 -VPSRAVDZrmkz 5719 -VPSRAVDZrr 5720 -VPSRAVDZrrk 5721 -VPSRAVDZrrkz 5722 -VPSRAVDrm 5723 -VPSRAVDrr 5724 -VPSRAVQZ 5725 -VPSRAVQZrm 5726 -VPSRAVQZrmb 5727 -VPSRAVQZrmbk 5728 -VPSRAVQZrmbkz 5729 -VPSRAVQZrmk 5730 
-VPSRAVQZrmkz 5731 -VPSRAVQZrr 5732 -VPSRAVQZrrk 5733 -VPSRAVQZrrkz 5734 -VPSRAVWZ 5735 -VPSRAVWZrm 5736 -VPSRAVWZrmk 5737 -VPSRAVWZrmkz 5738 -VPSRAVWZrr 5739 -VPSRAVWZrrk 5740 -VPSRAVWZrrkz 5741 -VPSRAWYri 5742 -VPSRAWYrm 5743 -VPSRAWYrr 5744 -VPSRAWZ 5745 -VPSRAWZmi 5746 -VPSRAWZmik 5747 -VPSRAWZmikz 5748 -VPSRAWZri 5749 -VPSRAWZrik 5750 -VPSRAWZrikz 5751 -VPSRAWZrm 5752 -VPSRAWZrmk 5753 -VPSRAWZrmkz 5754 -VPSRAWZrr 5755 -VPSRAWZrrk 5756 -VPSRAWZrrkz 5757 -VPSRAWri 5758 -VPSRAWrm 5759 -VPSRAWrr 5760 -VPSRLDQYri 5761 -VPSRLDQZ 5762 -VPSRLDQZmi 5763 -VPSRLDQZri 5764 -VPSRLDQri 5765 -VPSRLDYri 5766 -VPSRLDYrm 5767 -VPSRLDYrr 5768 -VPSRLDZ 5769 -VPSRLDZmbi 5770 -VPSRLDZmbik 5771 -VPSRLDZmbikz 5772 -VPSRLDZmi 5773 -VPSRLDZmik 5774 -VPSRLDZmikz 5775 -VPSRLDZri 5776 -VPSRLDZrik 5777 -VPSRLDZrikz 5778 -VPSRLDZrm 5779 -VPSRLDZrmk 5780 -VPSRLDZrmkz 5781 -VPSRLDZrr 5782 -VPSRLDZrrk 5783 -VPSRLDZrrkz 5784 -VPSRLDri 5785 -VPSRLDrm 5786 -VPSRLDrr 5787 -VPSRLQYri 5788 -VPSRLQYrm 5789 -VPSRLQYrr 5790 -VPSRLQZ 5791 -VPSRLQZmbi 5792 -VPSRLQZmbik 5793 -VPSRLQZmbikz 5794 -VPSRLQZmi 5795 -VPSRLQZmik 5796 -VPSRLQZmikz 5797 -VPSRLQZri 5798 -VPSRLQZrik 5799 -VPSRLQZrikz 5800 -VPSRLQZrm 5801 -VPSRLQZrmk 5802 -VPSRLQZrmkz 5803 -VPSRLQZrr 5804 -VPSRLQZrrk 5805 -VPSRLQZrrkz 5806 -VPSRLQri 5807 -VPSRLQrm 5808 -VPSRLQrr 5809 -VPSRLVDYrm 5810 -VPSRLVDYrr 5811 -VPSRLVDZ 5812 -VPSRLVDZrm 5813 -VPSRLVDZrmb 5814 -VPSRLVDZrmbk 5815 -VPSRLVDZrmbkz 5816 -VPSRLVDZrmk 5817 -VPSRLVDZrmkz 5818 -VPSRLVDZrr 5819 -VPSRLVDZrrk 5820 -VPSRLVDZrrkz 5821 -VPSRLVDrm 5822 -VPSRLVDrr 5823 -VPSRLVQYrm 5824 -VPSRLVQYrr 5825 -VPSRLVQZ 5826 -VPSRLVQZrm 5827 -VPSRLVQZrmb 5828 -VPSRLVQZrmbk 5829 -VPSRLVQZrmbkz 5830 -VPSRLVQZrmk 5831 -VPSRLVQZrmkz 5832 -VPSRLVQZrr 5833 -VPSRLVQZrrk 5834 -VPSRLVQZrrkz 5835 -VPSRLVQrm 5836 -VPSRLVQrr 5837 -VPSRLVWZ 5838 -VPSRLVWZrm 5839 -VPSRLVWZrmk 5840 -VPSRLVWZrmkz 5841 -VPSRLVWZrr 5842 -VPSRLVWZrrk 5843 -VPSRLVWZrrkz 5844 -VPSRLWYri 5845 -VPSRLWYrm 5846 -VPSRLWYrr 5847 -VPSRLWZ 5848 
-VPSRLWZmi 5849 -VPSRLWZmik 5850 -VPSRLWZmikz 5851 -VPSRLWZri 5852 -VPSRLWZrik 5853 -VPSRLWZrikz 5854 -VPSRLWZrm 5855 -VPSRLWZrmk 5856 -VPSRLWZrmkz 5857 -VPSRLWZrr 5858 -VPSRLWZrrk 5859 -VPSRLWZrrkz 5860 -VPSRLWri 5861 -VPSRLWrm 5862 -VPSRLWrr 5863 -VPSUBBYrm 5864 -VPSUBBYrr 5865 -VPSUBBZ 5866 -VPSUBBZrm 5867 -VPSUBBZrmk 5868 -VPSUBBZrmkz 5869 -VPSUBBZrr 5870 -VPSUBBZrrk 5871 -VPSUBBZrrkz 5872 -VPSUBBrm 5873 -VPSUBBrr 5874 -VPSUBDYrm 5875 -VPSUBDYrr 5876 -VPSUBDZ 5877 -VPSUBDZrm 5878 -VPSUBDZrmb 5879 -VPSUBDZrmbk 5880 -VPSUBDZrmbkz 5881 -VPSUBDZrmk 5882 -VPSUBDZrmkz 5883 -VPSUBDZrr 5884 -VPSUBDZrrk 5885 -VPSUBDZrrkz 5886 -VPSUBDrm 5887 -VPSUBDrr 5888 -VPSUBQYrm 5889 -VPSUBQYrr 5890 -VPSUBQZ 5891 -VPSUBQZrm 5892 -VPSUBQZrmb 5893 -VPSUBQZrmbk 5894 -VPSUBQZrmbkz 5895 -VPSUBQZrmk 5896 -VPSUBQZrmkz 5897 -VPSUBQZrr 5898 -VPSUBQZrrk 5899 -VPSUBQZrrkz 5900 -VPSUBQrm 5901 -VPSUBQrr 5902 -VPSUBSBYrm 5903 -VPSUBSBYrr 5904 -VPSUBSBZ 5905 -VPSUBSBZrm 5906 -VPSUBSBZrmk 5907 -VPSUBSBZrmkz 5908 -VPSUBSBZrr 5909 -VPSUBSBZrrk 5910 -VPSUBSBZrrkz 5911 -VPSUBSBrm 5912 -VPSUBSBrr 5913 -VPSUBSWYrm 5914 -VPSUBSWYrr 5915 -VPSUBSWZ 5916 -VPSUBSWZrm 5917 -VPSUBSWZrmk 5918 -VPSUBSWZrmkz 5919 -VPSUBSWZrr 5920 -VPSUBSWZrrk 5921 -VPSUBSWZrrkz 5922 -VPSUBSWrm 5923 -VPSUBSWrr 5924 -VPSUBUSBYrm 5925 -VPSUBUSBYrr 5926 -VPSUBUSBZ 5927 -VPSUBUSBZrm 5928 -VPSUBUSBZrmk 5929 -VPSUBUSBZrmkz 5930 -VPSUBUSBZrr 5931 -VPSUBUSBZrrk 5932 -VPSUBUSBZrrkz 5933 -VPSUBUSBrm 5934 -VPSUBUSBrr 5935 -VPSUBUSWYrm 5936 -VPSUBUSWYrr 5937 -VPSUBUSWZ 5938 -VPSUBUSWZrm 5939 -VPSUBUSWZrmk 5940 -VPSUBUSWZrmkz 5941 -VPSUBUSWZrr 5942 -VPSUBUSWZrrk 5943 -VPSUBUSWZrrkz 5944 -VPSUBUSWrm 5945 -VPSUBUSWrr 5946 -VPSUBWYrm 5947 -VPSUBWYrr 5948 -VPSUBWZ 5949 -VPSUBWZrm 5950 -VPSUBWZrmk 5951 -VPSUBWZrmkz 5952 -VPSUBWZrr 5953 -VPSUBWZrrk 5954 -VPSUBWZrrkz 5955 -VPSUBWrm 5956 -VPSUBWrr 5957 -VPTERNLOGDZ 5958 -VPTERNLOGDZrmbi 5959 -VPTERNLOGDZrmbik 5960 -VPTERNLOGDZrmbikz 5961 -VPTERNLOGDZrmi 5962 -VPTERNLOGDZrmik 5963 -VPTERNLOGDZrmikz 5964 
-VPTERNLOGDZrri 5965 -VPTERNLOGDZrrik 5966 -VPTERNLOGDZrrikz 5967 -VPTERNLOGQZ 5968 -VPTERNLOGQZrmbi 5969 -VPTERNLOGQZrmbik 5970 -VPTERNLOGQZrmbikz 5971 -VPTERNLOGQZrmi 5972 -VPTERNLOGQZrmik 5973 -VPTERNLOGQZrmikz 5974 -VPTERNLOGQZrri 5975 -VPTERNLOGQZrrik 5976 -VPTERNLOGQZrrikz 5977 -VPTESTMBZ 5978 -VPTESTMBZrm 5979 -VPTESTMBZrmk 5980 -VPTESTMBZrr 5981 -VPTESTMBZrrk 5982 -VPTESTMDZ 5983 -VPTESTMDZrm 5984 -VPTESTMDZrmb 5985 -VPTESTMDZrmbk 5986 -VPTESTMDZrmk 5987 -VPTESTMDZrr 5988 -VPTESTMDZrrk 5989 -VPTESTMQZ 5990 -VPTESTMQZrm 5991 -VPTESTMQZrmb 5992 -VPTESTMQZrmbk 5993 -VPTESTMQZrmk 5994 -VPTESTMQZrr 5995 -VPTESTMQZrrk 5996 -VPTESTMWZ 5997 -VPTESTMWZrm 5998 -VPTESTMWZrmk 5999 -VPTESTMWZrr 6000 -VPTESTMWZrrk 6001 -VPTESTNMBZ 6002 -VPTESTNMBZrm 6003 -VPTESTNMBZrmk 6004 -VPTESTNMBZrr 6005 -VPTESTNMBZrrk 6006 -VPTESTNMDZ 6007 -VPTESTNMDZrm 6008 -VPTESTNMDZrmb 6009 -VPTESTNMDZrmbk 6010 -VPTESTNMDZrmk 6011 -VPTESTNMDZrr 6012 -VPTESTNMDZrrk 6013 -VPTESTNMQZ 6014 -VPTESTNMQZrm 6015 -VPTESTNMQZrmb 6016 -VPTESTNMQZrmbk 6017 -VPTESTNMQZrmk 6018 -VPTESTNMQZrr 6019 -VPTESTNMQZrrk 6020 -VPTESTNMWZ 6021 -VPTESTNMWZrm 6022 -VPTESTNMWZrmk 6023 -VPTESTNMWZrr 6024 -VPTESTNMWZrrk 6025 -VPTESTYrm 6026 -VPTESTYrr 6027 -VPTESTrm 6028 -VPTESTrr 6029 -VPUNPCKHBWYrm 6030 -VPUNPCKHBWYrr 6031 -VPUNPCKHBWZ 6032 -VPUNPCKHBWZrm 6033 -VPUNPCKHBWZrmk 6034 -VPUNPCKHBWZrmkz 6035 -VPUNPCKHBWZrr 6036 -VPUNPCKHBWZrrk 6037 -VPUNPCKHBWZrrkz 6038 -VPUNPCKHBWrm 6039 -VPUNPCKHBWrr 6040 -VPUNPCKHDQYrm 6041 -VPUNPCKHDQYrr 6042 -VPUNPCKHDQZ 6043 -VPUNPCKHDQZrm 6044 -VPUNPCKHDQZrmb 6045 -VPUNPCKHDQZrmbk 6046 -VPUNPCKHDQZrmbkz 6047 -VPUNPCKHDQZrmk 6048 -VPUNPCKHDQZrmkz 6049 -VPUNPCKHDQZrr 6050 -VPUNPCKHDQZrrk 6051 -VPUNPCKHDQZrrkz 6052 -VPUNPCKHDQrm 6053 -VPUNPCKHDQrr 6054 -VPUNPCKHQDQYrm 6055 -VPUNPCKHQDQYrr 6056 -VPUNPCKHQDQZ 6057 -VPUNPCKHQDQZrm 6058 -VPUNPCKHQDQZrmb 6059 -VPUNPCKHQDQZrmbk 6060 -VPUNPCKHQDQZrmbkz 6061 -VPUNPCKHQDQZrmk 6062 -VPUNPCKHQDQZrmkz 6063 -VPUNPCKHQDQZrr 6064 -VPUNPCKHQDQZrrk 6065 
-VPUNPCKHQDQZrrkz 6066 -VPUNPCKHQDQrm 6067 -VPUNPCKHQDQrr 6068 -VPUNPCKHWDYrm 6069 -VPUNPCKHWDYrr 6070 -VPUNPCKHWDZ 6071 -VPUNPCKHWDZrm 6072 -VPUNPCKHWDZrmk 6073 -VPUNPCKHWDZrmkz 6074 -VPUNPCKHWDZrr 6075 -VPUNPCKHWDZrrk 6076 -VPUNPCKHWDZrrkz 6077 -VPUNPCKHWDrm 6078 -VPUNPCKHWDrr 6079 -VPUNPCKLBWYrm 6080 -VPUNPCKLBWYrr 6081 -VPUNPCKLBWZ 6082 -VPUNPCKLBWZrm 6083 -VPUNPCKLBWZrmk 6084 -VPUNPCKLBWZrmkz 6085 -VPUNPCKLBWZrr 6086 -VPUNPCKLBWZrrk 6087 -VPUNPCKLBWZrrkz 6088 -VPUNPCKLBWrm 6089 -VPUNPCKLBWrr 6090 -VPUNPCKLDQYrm 6091 -VPUNPCKLDQYrr 6092 -VPUNPCKLDQZ 6093 -VPUNPCKLDQZrm 6094 -VPUNPCKLDQZrmb 6095 -VPUNPCKLDQZrmbk 6096 -VPUNPCKLDQZrmbkz 6097 -VPUNPCKLDQZrmk 6098 -VPUNPCKLDQZrmkz 6099 -VPUNPCKLDQZrr 6100 -VPUNPCKLDQZrrk 6101 -VPUNPCKLDQZrrkz 6102 -VPUNPCKLDQrm 6103 -VPUNPCKLDQrr 6104 -VPUNPCKLQDQYrm 6105 -VPUNPCKLQDQYrr 6106 -VPUNPCKLQDQZ 6107 -VPUNPCKLQDQZrm 6108 -VPUNPCKLQDQZrmb 6109 -VPUNPCKLQDQZrmbk 6110 -VPUNPCKLQDQZrmbkz 6111 -VPUNPCKLQDQZrmk 6112 -VPUNPCKLQDQZrmkz 6113 -VPUNPCKLQDQZrr 6114 -VPUNPCKLQDQZrrk 6115 -VPUNPCKLQDQZrrkz 6116 -VPUNPCKLQDQrm 6117 -VPUNPCKLQDQrr 6118 -VPUNPCKLWDYrm 6119 -VPUNPCKLWDYrr 6120 -VPUNPCKLWDZ 6121 -VPUNPCKLWDZrm 6122 -VPUNPCKLWDZrmk 6123 -VPUNPCKLWDZrmkz 6124 -VPUNPCKLWDZrr 6125 -VPUNPCKLWDZrrk 6126 -VPUNPCKLWDZrrkz 6127 -VPUNPCKLWDrm 6128 -VPUNPCKLWDrr 6129 -VPXORDZ 6130 -VPXORDZrm 6131 -VPXORDZrmb 6132 -VPXORDZrmbk 6133 -VPXORDZrmbkz 6134 -VPXORDZrmk 6135 -VPXORDZrmkz 6136 -VPXORDZrr 6137 -VPXORDZrrk 6138 -VPXORDZrrkz 6139 -VPXORQZ 6140 -VPXORQZrm 6141 -VPXORQZrmb 6142 -VPXORQZrmbk 6143 -VPXORQZrmbkz 6144 -VPXORQZrmk 6145 -VPXORQZrmkz 6146 -VPXORQZrr 6147 -VPXORQZrrk 6148 -VPXORQZrrkz 6149 -VPXORYrm 6150 -VPXORYrr 6151 -VPXORrm 6152 -VPXORrr 6153 -VRANGEPDZ 6154 -VRANGEPDZrmbi 6155 -VRANGEPDZrmbik 6156 -VRANGEPDZrmbikz 6157 -VRANGEPDZrmi 6158 -VRANGEPDZrmik 6159 -VRANGEPDZrmikz 6160 -VRANGEPDZrri 6161 -VRANGEPDZrrib 6162 -VRANGEPDZrribk 6163 -VRANGEPDZrribkz 6164 -VRANGEPDZrrik 6165 -VRANGEPDZrrikz 6166 -VRANGEPSZ 6167 
-VRANGEPSZrmbi 6168 -VRANGEPSZrmbik 6169 -VRANGEPSZrmbikz 6170 -VRANGEPSZrmi 6171 -VRANGEPSZrmik 6172 -VRANGEPSZrmikz 6173 -VRANGEPSZrri 6174 -VRANGEPSZrrib 6175 -VRANGEPSZrribk 6176 -VRANGEPSZrribkz 6177 -VRANGEPSZrrik 6178 -VRANGEPSZrrikz 6179 -VRANGESDZrmi 6180 -VRANGESDZrmik 6181 -VRANGESDZrmikz 6182 -VRANGESDZrri 6183 -VRANGESDZrrib 6184 -VRANGESDZrribk 6185 -VRANGESDZrribkz 6186 -VRANGESDZrrik 6187 -VRANGESDZrrikz 6188 -VRANGESSZrmi 6189 -VRANGESSZrmik 6190 -VRANGESSZrmikz 6191 -VRANGESSZrri 6192 -VRANGESSZrrib 6193 -VRANGESSZrribk 6194 -VRANGESSZrribkz 6195 -VRANGESSZrrik 6196 -VRANGESSZrrikz 6197 -VRCP 6198 -VRCPBF 6199 -VRCPPHZ 6200 -VRCPPHZm 6201 -VRCPPHZmb 6202 -VRCPPHZmbk 6203 -VRCPPHZmbkz 6204 -VRCPPHZmk 6205 -VRCPPHZmkz 6206 -VRCPPHZr 6207 -VRCPPHZrk 6208 -VRCPPHZrkz 6209 -VRCPPSYm 6210 -VRCPPSYr 6211 -VRCPPSm 6212 -VRCPPSr 6213 -VRCPSHZrm 6214 -VRCPSHZrmk 6215 -VRCPSHZrmkz 6216 -VRCPSHZrr 6217 -VRCPSHZrrk 6218 -VRCPSHZrrkz 6219 -VRCPSSm 6220 -VRCPSSm_Int 6221 -VRCPSSr 6222 -VRCPSSr_Int 6223 -VREDUCEBF 6224 -VREDUCEPDZ 6225 -VREDUCEPDZrmbi 6226 -VREDUCEPDZrmbik 6227 -VREDUCEPDZrmbikz 6228 -VREDUCEPDZrmi 6229 -VREDUCEPDZrmik 6230 -VREDUCEPDZrmikz 6231 -VREDUCEPDZrri 6232 -VREDUCEPDZrrib 6233 -VREDUCEPDZrribk 6234 -VREDUCEPDZrribkz 6235 -VREDUCEPDZrrik 6236 -VREDUCEPDZrrikz 6237 -VREDUCEPHZ 6238 -VREDUCEPHZrmbi 6239 -VREDUCEPHZrmbik 6240 -VREDUCEPHZrmbikz 6241 -VREDUCEPHZrmi 6242 -VREDUCEPHZrmik 6243 -VREDUCEPHZrmikz 6244 -VREDUCEPHZrri 6245 -VREDUCEPHZrrib 6246 -VREDUCEPHZrribk 6247 -VREDUCEPHZrribkz 6248 -VREDUCEPHZrrik 6249 -VREDUCEPHZrrikz 6250 -VREDUCEPSZ 6251 -VREDUCEPSZrmbi 6252 -VREDUCEPSZrmbik 6253 -VREDUCEPSZrmbikz 6254 -VREDUCEPSZrmi 6255 -VREDUCEPSZrmik 6256 -VREDUCEPSZrmikz 6257 -VREDUCEPSZrri 6258 -VREDUCEPSZrrib 6259 -VREDUCEPSZrribk 6260 -VREDUCEPSZrribkz 6261 -VREDUCEPSZrrik 6262 -VREDUCEPSZrrikz 6263 -VREDUCESDZrmi 6264 -VREDUCESDZrmik 6265 -VREDUCESDZrmikz 6266 -VREDUCESDZrri 6267 -VREDUCESDZrrib 6268 -VREDUCESDZrribk 6269 
-VREDUCESDZrribkz 6270 -VREDUCESDZrrik 6271 -VREDUCESDZrrikz 6272 -VREDUCESHZrmi 6273 -VREDUCESHZrmik 6274 -VREDUCESHZrmikz 6275 -VREDUCESHZrri 6276 -VREDUCESHZrrib 6277 -VREDUCESHZrribk 6278 -VREDUCESHZrribkz 6279 -VREDUCESHZrrik 6280 -VREDUCESHZrrikz 6281 -VREDUCESSZrmi 6282 -VREDUCESSZrmik 6283 -VREDUCESSZrmikz 6284 -VREDUCESSZrri 6285 -VREDUCESSZrrib 6286 -VREDUCESSZrribk 6287 -VREDUCESSZrribkz 6288 -VREDUCESSZrrik 6289 -VREDUCESSZrrikz 6290 -VRNDSCALEBF 6291 -VRNDSCALEPDZ 6292 -VRNDSCALEPDZrmbi 6293 -VRNDSCALEPDZrmbik 6294 -VRNDSCALEPDZrmbikz 6295 -VRNDSCALEPDZrmi 6296 -VRNDSCALEPDZrmik 6297 -VRNDSCALEPDZrmikz 6298 -VRNDSCALEPDZrri 6299 -VRNDSCALEPDZrrib 6300 -VRNDSCALEPDZrribk 6301 -VRNDSCALEPDZrribkz 6302 -VRNDSCALEPDZrrik 6303 -VRNDSCALEPDZrrikz 6304 -VRNDSCALEPHZ 6305 -VRNDSCALEPHZrmbi 6306 -VRNDSCALEPHZrmbik 6307 -VRNDSCALEPHZrmbikz 6308 -VRNDSCALEPHZrmi 6309 -VRNDSCALEPHZrmik 6310 -VRNDSCALEPHZrmikz 6311 -VRNDSCALEPHZrri 6312 -VRNDSCALEPHZrrib 6313 -VRNDSCALEPHZrribk 6314 -VRNDSCALEPHZrribkz 6315 -VRNDSCALEPHZrrik 6316 -VRNDSCALEPHZrrikz 6317 -VRNDSCALEPSZ 6318 -VRNDSCALEPSZrmbi 6319 -VRNDSCALEPSZrmbik 6320 -VRNDSCALEPSZrmbikz 6321 -VRNDSCALEPSZrmi 6322 -VRNDSCALEPSZrmik 6323 -VRNDSCALEPSZrmikz 6324 -VRNDSCALEPSZrri 6325 -VRNDSCALEPSZrrib 6326 -VRNDSCALEPSZrribk 6327 -VRNDSCALEPSZrribkz 6328 -VRNDSCALEPSZrrik 6329 -VRNDSCALEPSZrrikz 6330 -VRNDSCALESDZrmi 6331 -VRNDSCALESDZrmi_Int 6332 -VRNDSCALESDZrmik_Int 6333 -VRNDSCALESDZrmikz_Int 6334 -VRNDSCALESDZrri 6335 -VRNDSCALESDZrri_Int 6336 -VRNDSCALESDZrrib_Int 6337 -VRNDSCALESDZrribk_Int 6338 -VRNDSCALESDZrribkz_Int 6339 -VRNDSCALESDZrrik_Int 6340 -VRNDSCALESDZrrikz_Int 6341 -VRNDSCALESHZrmi 6342 -VRNDSCALESHZrmi_Int 6343 -VRNDSCALESHZrmik_Int 6344 -VRNDSCALESHZrmikz_Int 6345 -VRNDSCALESHZrri 6346 -VRNDSCALESHZrri_Int 6347 -VRNDSCALESHZrrib_Int 6348 -VRNDSCALESHZrribk_Int 6349 -VRNDSCALESHZrribkz_Int 6350 -VRNDSCALESHZrrik_Int 6351 -VRNDSCALESHZrrikz_Int 6352 -VRNDSCALESSZrmi 6353 -VRNDSCALESSZrmi_Int 6354 
-VRNDSCALESSZrmik_Int 6355 -VRNDSCALESSZrmikz_Int 6356 -VRNDSCALESSZrri 6357 -VRNDSCALESSZrri_Int 6358 -VRNDSCALESSZrrib_Int 6359 -VRNDSCALESSZrribk_Int 6360 -VRNDSCALESSZrribkz_Int 6361 -VRNDSCALESSZrrik_Int 6362 -VRNDSCALESSZrrikz_Int 6363 -VROUNDPDYmi 6364 -VROUNDPDYri 6365 -VROUNDPDmi 6366 -VROUNDPDri 6367 -VROUNDPSYmi 6368 -VROUNDPSYri 6369 -VROUNDPSmi 6370 -VROUNDPSri 6371 -VROUNDSDmi 6372 -VROUNDSDmi_Int 6373 -VROUNDSDri 6374 -VROUNDSDri_Int 6375 -VROUNDSSmi 6376 -VROUNDSSmi_Int 6377 -VROUNDSSri 6378 -VROUNDSSri_Int 6379 -VRSQRT 6380 -VRSQRTBF 6381 -VRSQRTPHZ 6382 -VRSQRTPHZm 6383 -VRSQRTPHZmb 6384 -VRSQRTPHZmbk 6385 -VRSQRTPHZmbkz 6386 -VRSQRTPHZmk 6387 -VRSQRTPHZmkz 6388 -VRSQRTPHZr 6389 -VRSQRTPHZrk 6390 -VRSQRTPHZrkz 6391 -VRSQRTPSYm 6392 -VRSQRTPSYr 6393 -VRSQRTPSm 6394 -VRSQRTPSr 6395 -VRSQRTSHZrm 6396 -VRSQRTSHZrmk 6397 -VRSQRTSHZrmkz 6398 -VRSQRTSHZrr 6399 -VRSQRTSHZrrk 6400 -VRSQRTSHZrrkz 6401 -VRSQRTSSm 6402 -VRSQRTSSm_Int 6403 -VRSQRTSSr 6404 -VRSQRTSSr_Int 6405 -VSCALEFBF 6406 -VSCALEFPDZ 6407 -VSCALEFPDZrm 6408 -VSCALEFPDZrmb 6409 -VSCALEFPDZrmbk 6410 -VSCALEFPDZrmbkz 6411 -VSCALEFPDZrmk 6412 -VSCALEFPDZrmkz 6413 -VSCALEFPDZrr 6414 -VSCALEFPDZrrb 6415 -VSCALEFPDZrrbk 6416 -VSCALEFPDZrrbkz 6417 -VSCALEFPDZrrk 6418 -VSCALEFPDZrrkz 6419 -VSCALEFPHZ 6420 -VSCALEFPHZrm 6421 -VSCALEFPHZrmb 6422 -VSCALEFPHZrmbk 6423 -VSCALEFPHZrmbkz 6424 -VSCALEFPHZrmk 6425 -VSCALEFPHZrmkz 6426 -VSCALEFPHZrr 6427 -VSCALEFPHZrrb 6428 -VSCALEFPHZrrbk 6429 -VSCALEFPHZrrbkz 6430 -VSCALEFPHZrrk 6431 -VSCALEFPHZrrkz 6432 -VSCALEFPSZ 6433 -VSCALEFPSZrm 6434 -VSCALEFPSZrmb 6435 -VSCALEFPSZrmbk 6436 -VSCALEFPSZrmbkz 6437 -VSCALEFPSZrmk 6438 -VSCALEFPSZrmkz 6439 -VSCALEFPSZrr 6440 -VSCALEFPSZrrb 6441 -VSCALEFPSZrrbk 6442 -VSCALEFPSZrrbkz 6443 -VSCALEFPSZrrk 6444 -VSCALEFPSZrrkz 6445 -VSCALEFSDZrm 6446 -VSCALEFSDZrmk 6447 -VSCALEFSDZrmkz 6448 -VSCALEFSDZrr 6449 -VSCALEFSDZrrb_Int 6450 -VSCALEFSDZrrbk_Int 6451 -VSCALEFSDZrrbkz_Int 6452 -VSCALEFSDZrrk 6453 -VSCALEFSDZrrkz 6454 
-VSCALEFSHZrm 6455 -VSCALEFSHZrmk 6456 -VSCALEFSHZrmkz 6457 -VSCALEFSHZrr 6458 -VSCALEFSHZrrb_Int 6459 -VSCALEFSHZrrbk_Int 6460 -VSCALEFSHZrrbkz_Int 6461 -VSCALEFSHZrrk 6462 -VSCALEFSHZrrkz 6463 -VSCALEFSSZrm 6464 -VSCALEFSSZrmk 6465 -VSCALEFSSZrmkz 6466 -VSCALEFSSZrr 6467 -VSCALEFSSZrrb_Int 6468 -VSCALEFSSZrrbk_Int 6469 -VSCALEFSSZrrbkz_Int 6470 -VSCALEFSSZrrk 6471 -VSCALEFSSZrrkz 6472 -VSCATTERDPDZ 6473 -VSCATTERDPDZmr 6474 -VSCATTERDPSZ 6475 -VSCATTERDPSZmr 6476 -VSCATTERPF 6477 -VSCATTERQPDZ 6478 -VSCATTERQPDZmr 6479 -VSCATTERQPSZ 6480 -VSCATTERQPSZmr 6481 -VSHA 6482 -VSHUFF 6483 -VSHUFI 6484 -VSHUFPDYrmi 6485 -VSHUFPDYrri 6486 -VSHUFPDZ 6487 -VSHUFPDZrmbi 6488 -VSHUFPDZrmbik 6489 -VSHUFPDZrmbikz 6490 -VSHUFPDZrmi 6491 -VSHUFPDZrmik 6492 -VSHUFPDZrmikz 6493 -VSHUFPDZrri 6494 -VSHUFPDZrrik 6495 -VSHUFPDZrrikz 6496 -VSHUFPDrmi 6497 -VSHUFPDrri 6498 -VSHUFPSYrmi 6499 -VSHUFPSYrri 6500 -VSHUFPSZ 6501 -VSHUFPSZrmbi 6502 -VSHUFPSZrmbik 6503 -VSHUFPSZrmbikz 6504 -VSHUFPSZrmi 6505 -VSHUFPSZrmik 6506 -VSHUFPSZrmikz 6507 -VSHUFPSZrri 6508 -VSHUFPSZrrik 6509 -VSHUFPSZrrikz 6510 -VSHUFPSrmi 6511 -VSHUFPSrri 6512 -VSM 6513 -VSQRTBF 6514 -VSQRTPDYm 6515 -VSQRTPDYr 6516 -VSQRTPDZ 6517 -VSQRTPDZm 6518 -VSQRTPDZmb 6519 -VSQRTPDZmbk 6520 -VSQRTPDZmbkz 6521 -VSQRTPDZmk 6522 -VSQRTPDZmkz 6523 -VSQRTPDZr 6524 -VSQRTPDZrb 6525 -VSQRTPDZrbk 6526 -VSQRTPDZrbkz 6527 -VSQRTPDZrk 6528 -VSQRTPDZrkz 6529 -VSQRTPDm 6530 -VSQRTPDr 6531 -VSQRTPHZ 6532 -VSQRTPHZm 6533 -VSQRTPHZmb 6534 -VSQRTPHZmbk 6535 -VSQRTPHZmbkz 6536 -VSQRTPHZmk 6537 -VSQRTPHZmkz 6538 -VSQRTPHZr 6539 -VSQRTPHZrb 6540 -VSQRTPHZrbk 6541 -VSQRTPHZrbkz 6542 -VSQRTPHZrk 6543 -VSQRTPHZrkz 6544 -VSQRTPSYm 6545 -VSQRTPSYr 6546 -VSQRTPSZ 6547 -VSQRTPSZm 6548 -VSQRTPSZmb 6549 -VSQRTPSZmbk 6550 -VSQRTPSZmbkz 6551 -VSQRTPSZmk 6552 -VSQRTPSZmkz 6553 -VSQRTPSZr 6554 -VSQRTPSZrb 6555 -VSQRTPSZrbk 6556 -VSQRTPSZrbkz 6557 -VSQRTPSZrk 6558 -VSQRTPSZrkz 6559 -VSQRTPSm 6560 -VSQRTPSr 6561 -VSQRTSDZm 6562 -VSQRTSDZm_Int 6563 -VSQRTSDZmk_Int 
6564 -VSQRTSDZmkz_Int 6565 -VSQRTSDZr 6566 -VSQRTSDZr_Int 6567 -VSQRTSDZrb_Int 6568 -VSQRTSDZrbk_Int 6569 -VSQRTSDZrbkz_Int 6570 -VSQRTSDZrk_Int 6571 -VSQRTSDZrkz_Int 6572 -VSQRTSDm 6573 -VSQRTSDm_Int 6574 -VSQRTSDr 6575 -VSQRTSDr_Int 6576 -VSQRTSHZm 6577 -VSQRTSHZm_Int 6578 -VSQRTSHZmk_Int 6579 -VSQRTSHZmkz_Int 6580 -VSQRTSHZr 6581 -VSQRTSHZr_Int 6582 -VSQRTSHZrb_Int 6583 -VSQRTSHZrbk_Int 6584 -VSQRTSHZrbkz_Int 6585 -VSQRTSHZrk_Int 6586 -VSQRTSHZrkz_Int 6587 -VSQRTSSZm 6588 -VSQRTSSZm_Int 6589 -VSQRTSSZmk_Int 6590 -VSQRTSSZmkz_Int 6591 -VSQRTSSZr 6592 -VSQRTSSZr_Int 6593 -VSQRTSSZrb_Int 6594 -VSQRTSSZrbk_Int 6595 -VSQRTSSZrbkz_Int 6596 -VSQRTSSZrk_Int 6597 -VSQRTSSZrkz_Int 6598 -VSQRTSSm 6599 -VSQRTSSm_Int 6600 -VSQRTSSr 6601 -VSQRTSSr_Int 6602 -VSTMXCSR 6603 -VSUBBF 6604 -VSUBPDYrm 6605 -VSUBPDYrr 6606 -VSUBPDZ 6607 -VSUBPDZrm 6608 -VSUBPDZrmb 6609 -VSUBPDZrmbk 6610 -VSUBPDZrmbkz 6611 -VSUBPDZrmk 6612 -VSUBPDZrmkz 6613 -VSUBPDZrr 6614 -VSUBPDZrrb 6615 -VSUBPDZrrbk 6616 -VSUBPDZrrbkz 6617 -VSUBPDZrrk 6618 -VSUBPDZrrkz 6619 -VSUBPDrm 6620 -VSUBPDrr 6621 -VSUBPHZ 6622 -VSUBPHZrm 6623 -VSUBPHZrmb 6624 -VSUBPHZrmbk 6625 -VSUBPHZrmbkz 6626 -VSUBPHZrmk 6627 -VSUBPHZrmkz 6628 -VSUBPHZrr 6629 -VSUBPHZrrb 6630 -VSUBPHZrrbk 6631 -VSUBPHZrrbkz 6632 -VSUBPHZrrk 6633 -VSUBPHZrrkz 6634 -VSUBPSYrm 6635 -VSUBPSYrr 6636 -VSUBPSZ 6637 -VSUBPSZrm 6638 -VSUBPSZrmb 6639 -VSUBPSZrmbk 6640 -VSUBPSZrmbkz 6641 -VSUBPSZrmk 6642 -VSUBPSZrmkz 6643 -VSUBPSZrr 6644 -VSUBPSZrrb 6645 -VSUBPSZrrbk 6646 -VSUBPSZrrbkz 6647 -VSUBPSZrrk 6648 -VSUBPSZrrkz 6649 -VSUBPSrm 6650 -VSUBPSrr 6651 -VSUBSDZrm 6652 -VSUBSDZrm_Int 6653 -VSUBSDZrmk_Int 6654 -VSUBSDZrmkz_Int 6655 -VSUBSDZrr 6656 -VSUBSDZrr_Int 6657 -VSUBSDZrrb_Int 6658 -VSUBSDZrrbk_Int 6659 -VSUBSDZrrbkz_Int 6660 -VSUBSDZrrk_Int 6661 -VSUBSDZrrkz_Int 6662 -VSUBSDrm 6663 -VSUBSDrm_Int 6664 -VSUBSDrr 6665 -VSUBSDrr_Int 6666 -VSUBSHZrm 6667 -VSUBSHZrm_Int 6668 -VSUBSHZrmk_Int 6669 -VSUBSHZrmkz_Int 6670 -VSUBSHZrr 6671 -VSUBSHZrr_Int 6672 
-VSUBSHZrrb_Int 6673 -VSUBSHZrrbk_Int 6674 -VSUBSHZrrbkz_Int 6675 -VSUBSHZrrk_Int 6676 -VSUBSHZrrkz_Int 6677 -VSUBSSZrm 6678 -VSUBSSZrm_Int 6679 -VSUBSSZrmk_Int 6680 -VSUBSSZrmkz_Int 6681 -VSUBSSZrr 6682 -VSUBSSZrr_Int 6683 -VSUBSSZrrb_Int 6684 -VSUBSSZrrbk_Int 6685 -VSUBSSZrrbkz_Int 6686 -VSUBSSZrrk_Int 6687 -VSUBSSZrrkz_Int 6688 -VSUBSSrm 6689 -VSUBSSrm_Int 6690 -VSUBSSrr 6691 -VSUBSSrr_Int 6692 -VTESTPDYrm 6693 -VTESTPDYrr 6694 -VTESTPDrm 6695 -VTESTPDrr 6696 -VTESTPSYrm 6697 -VTESTPSYrr 6698 -VTESTPSrm 6699 -VTESTPSrr 6700 -VUCOMISDZrm 6701 -VUCOMISDZrm_Int 6702 -VUCOMISDZrr 6703 -VUCOMISDZrr_Int 6704 -VUCOMISDZrrb 6705 -VUCOMISDrm 6706 -VUCOMISDrm_Int 6707 -VUCOMISDrr 6708 -VUCOMISDrr_Int 6709 -VUCOMISHZrm 6710 -VUCOMISHZrm_Int 6711 -VUCOMISHZrr 6712 -VUCOMISHZrr_Int 6713 -VUCOMISHZrrb 6714 -VUCOMISSZrm 6715 -VUCOMISSZrm_Int 6716 -VUCOMISSZrr 6717 -VUCOMISSZrr_Int 6718 -VUCOMISSZrrb 6719 -VUCOMISSrm 6720 -VUCOMISSrm_Int 6721 -VUCOMISSrr 6722 -VUCOMISSrr_Int 6723 -VUCOMXSDZrm 6724 -VUCOMXSDZrm_Int 6725 -VUCOMXSDZrr 6726 -VUCOMXSDZrr_Int 6727 -VUCOMXSDZrrb_Int 6728 -VUCOMXSHZrm 6729 -VUCOMXSHZrm_Int 6730 -VUCOMXSHZrr 6731 -VUCOMXSHZrr_Int 6732 -VUCOMXSHZrrb_Int 6733 -VUCOMXSSZrm 6734 -VUCOMXSSZrm_Int 6735 -VUCOMXSSZrr 6736 -VUCOMXSSZrr_Int 6737 -VUCOMXSSZrrb_Int 6738 -VUNPCKHPDYrm 6739 -VUNPCKHPDYrr 6740 -VUNPCKHPDZ 6741 -VUNPCKHPDZrm 6742 -VUNPCKHPDZrmb 6743 -VUNPCKHPDZrmbk 6744 -VUNPCKHPDZrmbkz 6745 -VUNPCKHPDZrmk 6746 -VUNPCKHPDZrmkz 6747 -VUNPCKHPDZrr 6748 -VUNPCKHPDZrrk 6749 -VUNPCKHPDZrrkz 6750 -VUNPCKHPDrm 6751 -VUNPCKHPDrr 6752 -VUNPCKHPSYrm 6753 -VUNPCKHPSYrr 6754 -VUNPCKHPSZ 6755 -VUNPCKHPSZrm 6756 -VUNPCKHPSZrmb 6757 -VUNPCKHPSZrmbk 6758 -VUNPCKHPSZrmbkz 6759 -VUNPCKHPSZrmk 6760 -VUNPCKHPSZrmkz 6761 -VUNPCKHPSZrr 6762 -VUNPCKHPSZrrk 6763 -VUNPCKHPSZrrkz 6764 -VUNPCKHPSrm 6765 -VUNPCKHPSrr 6766 -VUNPCKLPDYrm 6767 -VUNPCKLPDYrr 6768 -VUNPCKLPDZ 6769 -VUNPCKLPDZrm 6770 -VUNPCKLPDZrmb 6771 -VUNPCKLPDZrmbk 6772 -VUNPCKLPDZrmbkz 6773 -VUNPCKLPDZrmk 6774 
-VUNPCKLPDZrmkz 6775 -VUNPCKLPDZrr 6776 -VUNPCKLPDZrrk 6777 -VUNPCKLPDZrrkz 6778 -VUNPCKLPDrm 6779 -VUNPCKLPDrr 6780 -VUNPCKLPSYrm 6781 -VUNPCKLPSYrr 6782 -VUNPCKLPSZ 6783 -VUNPCKLPSZrm 6784 -VUNPCKLPSZrmb 6785 -VUNPCKLPSZrmbk 6786 -VUNPCKLPSZrmbkz 6787 -VUNPCKLPSZrmk 6788 -VUNPCKLPSZrmkz 6789 -VUNPCKLPSZrr 6790 -VUNPCKLPSZrrk 6791 -VUNPCKLPSZrrkz 6792 -VUNPCKLPSrm 6793 -VUNPCKLPSrr 6794 -VXORPDYrm 6795 -VXORPDYrr 6796 -VXORPDZ 6797 -VXORPDZrm 6798 -VXORPDZrmb 6799 -VXORPDZrmbk 6800 -VXORPDZrmbkz 6801 -VXORPDZrmk 6802 -VXORPDZrmkz 6803 -VXORPDZrr 6804 -VXORPDZrrk 6805 -VXORPDZrrkz 6806 -VXORPDrm 6807 -VXORPDrr 6808 -VXORPSYrm 6809 -VXORPSYrr 6810 -VXORPSZ 6811 -VXORPSZrm 6812 -VXORPSZrmb 6813 -VXORPSZrmbk 6814 -VXORPSZrmbkz 6815 -VXORPSZrmk 6816 -VXORPSZrmkz 6817 -VXORPSZrr 6818 -VXORPSZrrk 6819 -VXORPSZrrkz 6820 -VXORPSrm 6821 -VXORPSrr 6822 -VZEROALL 6823 -VZEROUPPER 6824 -V_SET 6825 -V_SETALLONES 6826 -WAIT 6827 -WBINVD 6828 -WBNOINVD 6829 -WRFLAGS 6830 -WRFSBASE 6831 -WRGSBASE 6832 -WRMSR 6833 -WRMSRLIST 6834 -WRMSRNS 6835 -WRMSRNSir 6836 -WRMSRNSir_EVEX 6837 -WRPKRUr 6838 -WRSSD 6839 -WRSSD_EVEX 6840 -WRSSQ 6841 -WRSSQ_EVEX 6842 -WRUSSD 6843 -WRUSSD_EVEX 6844 -WRUSSQ 6845 -WRUSSQ_EVEX 6846 -XABORT 6847 -XABORT_DEF 6848 -XACQUIRE_PREFIX 6849 -XADD 6850 -XAM_F 6851 -XAM_Fp 6852 -XBEGIN 6853 -XCHG 6854 -XCH_F 6855 -XCRYPTCBC 6856 -XCRYPTCFB 6857 -XCRYPTCTR 6858 -XCRYPTECB 6859 -XCRYPTOFB 6860 -XEND 6861 -XGETBV 6862 -XLAT 6863 -XOR 6864 -XORPDrm 6865 -XORPDrr 6866 -XORPSrm 6867 -XORPSrr 6868 -XRELEASE_PREFIX 6869 -XRESLDTRK 6870 -XRSTOR 6871 -XRSTORS 6872 -XSAVE 6873 -XSAVEC 6874 -XSAVEOPT 6875 -XSAVES 6876 -XSETBV 6877 -XSHA 6878 -XSTORE 6879 -XSUSLDTRK 6880 -XTEST 6881 -Immediate 6882 -CImmediate 6883 -FPImmediate 6884 -MBB 6885 -FrameIndex 6886 -ConstantPoolIndex 6887 -TargetIndex 6888 -JumpTableIndex 6889 -ExternalSymbol 6890 -GlobalAddress 6891 -BlockAddress 6892 -RegisterMask 6893 -RegisterLiveOut 6894 -Metadata 6895 -MCSymbol 6896 -CFIIndex 6897 
-IntrinsicID 6898 -Predicate 6899 -ShuffleMask 6900 -PhyReg_GR8 6901 -PhyReg_GRH8 6902 -PhyReg_GR8_NOREX2 6903 -PhyReg_GR8_NOREX 6904 -PhyReg_GR8_ABCD_H 6905 -PhyReg_GR8_ABCD_L 6906 -PhyReg_GRH16 6907 -PhyReg_GR16 6908 -PhyReg_GR16_NOREX2 6909 -PhyReg_GR16_NOREX 6910 -PhyReg_VK1 6911 -PhyReg_VK16 6912 -PhyReg_VK2 6913 -PhyReg_VK4 6914 -PhyReg_VK8 6915 -PhyReg_VK16WM 6916 -PhyReg_VK1WM 6917 -PhyReg_VK2WM 6918 -PhyReg_VK4WM 6919 -PhyReg_VK8WM 6920 -PhyReg_SEGMENT_REG 6921 -PhyReg_GR16_ABCD 6922 -PhyReg_FPCCR 6923 -PhyReg_FR16X 6924 -PhyReg_FR16 6925 -PhyReg_VK16PAIR 6926 -PhyReg_VK1PAIR 6927 -PhyReg_VK2PAIR 6928 -PhyReg_VK4PAIR 6929 -PhyReg_VK8PAIR 6930 -PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM 6931 -PhyReg_LOW32_ADDR_ACCESS_RBP 6932 -PhyReg_LOW32_ADDR_ACCESS 6933 -PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 6934 -PhyReg_FR32X 6935 -PhyReg_GR32 6936 -PhyReg_GR32_NOSP 6937 -PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 6938 -PhyReg_DEBUG_REG 6939 -PhyReg_FR32 6940 -PhyReg_GR32_NOREX2 6941 -PhyReg_GR32_NOREX2_NOSP 6942 -PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 6943 -PhyReg_GR32_NOREX 6944 -PhyReg_VK32 6945 -PhyReg_GR32_NOREX_NOSP 6946 -PhyReg_RFP32 6947 -PhyReg_VK32WM 6948 -PhyReg_GR32_ABCD 6949 -PhyReg_GR32_TC 6950 -PhyReg_GR32_ABCD_and_GR32_TC 6951 -PhyReg_GR32_AD 6952 -PhyReg_GR32_ArgRef 6953 -PhyReg_GR32_BPSP 6954 -PhyReg_GR32_BSI 6955 -PhyReg_GR32_CB 6956 -PhyReg_GR32_DC 6957 -PhyReg_GR32_DIBP 6958 -PhyReg_GR32_SIDI 6959 -PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 6960 -PhyReg_CCR 6961 -PhyReg_DFCCR 6962 -PhyReg_GR32_ABCD_and_GR32_BSI 6963 -PhyReg_GR32_AD_and_GR32_ArgRef 6964 -PhyReg_GR32_ArgRef_and_GR32_CB 6965 -PhyReg_GR32_BPSP_and_GR32_DIBP 6966 -PhyReg_GR32_BPSP_and_GR32_TC 6967 -PhyReg_GR32_BSI_and_GR32_SIDI 6968 -PhyReg_GR32_DIBP_and_GR32_SIDI 6969 -PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 6970 -PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit 6971 -PhyReg_RFP64 6972 -PhyReg_GR64 6973 -PhyReg_FR64X 6974 
-PhyReg_GR64_with_sub_8bit 6975 -PhyReg_GR64_NOSP 6976 -PhyReg_GR64_NOREX2 6977 -PhyReg_CONTROL_REG 6978 -PhyReg_FR64 6979 -PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2 6980 -PhyReg_GR64_NOREX2_NOSP 6981 -PhyReg_GR64PLTSafe 6982 -PhyReg_GR64_TC 6983 -PhyReg_GR64_NOREX 6984 -PhyReg_GR64_TCW64 6985 -PhyReg_GR64_TC_with_sub_8bit 6986 -PhyReg_GR64_NOREX2_NOSP_and_GR64_TC 6987 -PhyReg_GR64_TCW64_with_sub_8bit 6988 -PhyReg_GR64_TC_and_GR64_TCW64 6989 -PhyReg_GR64_with_sub_16bit_in_GR16_NOREX 6990 -PhyReg_VK64 6991 -PhyReg_VR64 6992 -PhyReg_GR64PLTSafe_and_GR64_TC 6993 -PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64 6994 -PhyReg_GR64_NOREX_NOSP 6995 -PhyReg_GR64_NOREX_and_GR64_TC 6996 -PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 6997 -PhyReg_VK64WM 6998 -PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 6999 -PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 7000 -PhyReg_GR64PLTSafe_and_GR64_TCW64 7001 -PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 7002 -PhyReg_GR64_NOREX_and_GR64_TCW64 7003 -PhyReg_GR64_ABCD 7004 -PhyReg_GR64_with_sub_32bit_in_GR32_TC 7005 -PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 7006 -PhyReg_GR64_AD 7007 -PhyReg_GR64_ArgRef 7008 -PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP 7009 -PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef 7010 -PhyReg_GR64_with_sub_32bit_in_GR32_BPSP 7011 -PhyReg_GR64_with_sub_32bit_in_GR32_BSI 7012 -PhyReg_GR64_with_sub_32bit_in_GR32_CB 7013 -PhyReg_GR64_with_sub_32bit_in_GR32_DIBP 7014 -PhyReg_GR64_with_sub_32bit_in_GR32_SIDI 7015 -PhyReg_GR64_A 7016 -PhyReg_GR64_ArgRef_and_GR64_TC 7017 -PhyReg_GR64_and_LOW32_ADDR_ACCESS 7018 -PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 7019 -PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7020 -PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7021 -PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7022 -PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7023 -PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7024 -PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7025 
-PhyReg_RST 7026 -PhyReg_RFP80 7027 -PhyReg_RFP80_7 7028 -PhyReg_VR128X 7029 -PhyReg_VR128 7030 -PhyReg_VR256X 7031 -PhyReg_VR256 7032 -PhyReg_VR512 7033 -PhyReg_VR512_0_15 7034 -PhyReg_TILE 7035 -PhyReg_TILEPAIR 7036 -VirtReg_GR8 7037 -VirtReg_GRH8 7038 -VirtReg_GR8_NOREX2 7039 -VirtReg_GR8_NOREX 7040 -VirtReg_GR8_ABCD_H 7041 -VirtReg_GR8_ABCD_L 7042 -VirtReg_GRH16 7043 -VirtReg_GR16 7044 -VirtReg_GR16_NOREX2 7045 -VirtReg_GR16_NOREX 7046 -VirtReg_VK1 7047 -VirtReg_VK16 7048 -VirtReg_VK2 7049 -VirtReg_VK4 7050 -VirtReg_VK8 7051 -VirtReg_VK16WM 7052 -VirtReg_VK1WM 7053 -VirtReg_VK2WM 7054 -VirtReg_VK4WM 7055 -VirtReg_VK8WM 7056 -VirtReg_SEGMENT_REG 7057 -VirtReg_GR16_ABCD 7058 -VirtReg_FPCCR 7059 -VirtReg_FR16X 7060 -VirtReg_FR16 7061 -VirtReg_VK16PAIR 7062 -VirtReg_VK1PAIR 7063 -VirtReg_VK2PAIR 7064 -VirtReg_VK4PAIR 7065 -VirtReg_VK8PAIR 7066 -VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM 7067 -VirtReg_LOW32_ADDR_ACCESS_RBP 7068 -VirtReg_LOW32_ADDR_ACCESS 7069 -VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 7070 -VirtReg_FR32X 7071 -VirtReg_GR32 7072 -VirtReg_GR32_NOSP 7073 -VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 7074 -VirtReg_DEBUG_REG 7075 -VirtReg_FR32 7076 -VirtReg_GR32_NOREX2 7077 -VirtReg_GR32_NOREX2_NOSP 7078 -VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 7079 -VirtReg_GR32_NOREX 7080 -VirtReg_VK32 7081 -VirtReg_GR32_NOREX_NOSP 7082 -VirtReg_RFP32 7083 -VirtReg_VK32WM 7084 -VirtReg_GR32_ABCD 7085 -VirtReg_GR32_TC 7086 -VirtReg_GR32_ABCD_and_GR32_TC 7087 -VirtReg_GR32_AD 7088 -VirtReg_GR32_ArgRef 7089 -VirtReg_GR32_BPSP 7090 -VirtReg_GR32_BSI 7091 -VirtReg_GR32_CB 7092 -VirtReg_GR32_DC 7093 -VirtReg_GR32_DIBP 7094 -VirtReg_GR32_SIDI 7095 -VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 7096 -VirtReg_CCR 7097 -VirtReg_DFCCR 7098 -VirtReg_GR32_ABCD_and_GR32_BSI 7099 -VirtReg_GR32_AD_and_GR32_ArgRef 7100 -VirtReg_GR32_ArgRef_and_GR32_CB 7101 -VirtReg_GR32_BPSP_and_GR32_DIBP 7102 -VirtReg_GR32_BPSP_and_GR32_TC 7103 
-VirtReg_GR32_BSI_and_GR32_SIDI 7104 -VirtReg_GR32_DIBP_and_GR32_SIDI 7105 -VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 7106 -VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit 7107 -VirtReg_RFP64 7108 -VirtReg_GR64 7109 -VirtReg_FR64X 7110 -VirtReg_GR64_with_sub_8bit 7111 -VirtReg_GR64_NOSP 7112 -VirtReg_GR64_NOREX2 7113 -VirtReg_CONTROL_REG 7114 -VirtReg_FR64 7115 -VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2 7116 -VirtReg_GR64_NOREX2_NOSP 7117 -VirtReg_GR64PLTSafe 7118 -VirtReg_GR64_TC 7119 -VirtReg_GR64_NOREX 7120 -VirtReg_GR64_TCW64 7121 -VirtReg_GR64_TC_with_sub_8bit 7122 -VirtReg_GR64_NOREX2_NOSP_and_GR64_TC 7123 -VirtReg_GR64_TCW64_with_sub_8bit 7124 -VirtReg_GR64_TC_and_GR64_TCW64 7125 -VirtReg_GR64_with_sub_16bit_in_GR16_NOREX 7126 -VirtReg_VK64 7127 -VirtReg_VR64 7128 -VirtReg_GR64PLTSafe_and_GR64_TC 7129 -VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64 7130 -VirtReg_GR64_NOREX_NOSP 7131 -VirtReg_GR64_NOREX_and_GR64_TC 7132 -VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 7133 -VirtReg_VK64WM 7134 -VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 7135 -VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 7136 -VirtReg_GR64PLTSafe_and_GR64_TCW64 7137 -VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 7138 -VirtReg_GR64_NOREX_and_GR64_TCW64 7139 -VirtReg_GR64_ABCD 7140 -VirtReg_GR64_with_sub_32bit_in_GR32_TC 7141 -VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 7142 -VirtReg_GR64_AD 7143 -VirtReg_GR64_ArgRef 7144 -VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP 7145 -VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef 7146 -VirtReg_GR64_with_sub_32bit_in_GR32_BPSP 7147 -VirtReg_GR64_with_sub_32bit_in_GR32_BSI 7148 -VirtReg_GR64_with_sub_32bit_in_GR32_CB 7149 -VirtReg_GR64_with_sub_32bit_in_GR32_DIBP 7150 -VirtReg_GR64_with_sub_32bit_in_GR32_SIDI 7151 -VirtReg_GR64_A 7152 -VirtReg_GR64_ArgRef_and_GR64_TC 7153 -VirtReg_GR64_and_LOW32_ADDR_ACCESS 7154 -VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 7155 -VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7156 
-VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7157 -VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7158 -VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7159 -VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7160 -VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7161 -VirtReg_RST 7162 -VirtReg_RFP80 7163 -VirtReg_RFP80_7 7164 -VirtReg_VR128X 7165 -VirtReg_VR128 7166 -VirtReg_VR256X 7167 -VirtReg_VR256 7168 -VirtReg_VR512 7169 -VirtReg_VR512_0_15 7170 -VirtReg_TILE 7171 -VirtReg_TILEPAIR 7172 +PTCMMIMFP 1441 +PTCMMRLFP 1442 +PTCVTROWD 1443 +PTCVTROWPS 1444 +PTDPBF 1445 +PTDPBHF 1446 +PTDPBSSD 1447 +PTDPBSSDV 1448 +PTDPBSUD 1449 +PTDPBSUDV 1450 +PTDPBUSD 1451 +PTDPBUSDV 1452 +PTDPBUUD 1453 +PTDPBUUDV 1454 +PTDPFP 1455 +PTDPHBF 1456 +PTDPHF 1457 +PTESTrm 1458 +PTESTrr 1459 +PTILELOADD 1460 +PTILELOADDRS 1461 +PTILELOADDRST 1462 +PTILELOADDRSV 1463 +PTILELOADDT 1464 +PTILELOADDV 1465 +PTILEMOVROWrre 1466 +PTILEMOVROWrreV 1467 +PTILEMOVROWrri 1468 +PTILEMOVROWrriV 1469 +PTILESTORED 1470 +PTILESTOREDV 1471 +PTILEZERO 1472 +PTILEZEROV 1473 +PTMMULTF 1474 +PTWRITE 1475 +PTWRITEm 1476 +PTWRITEr 1477 +PUNPCKHBWrm 1478 +PUNPCKHBWrr 1479 +PUNPCKHDQrm 1480 +PUNPCKHDQrr 1481 +PUNPCKHQDQrm 1482 +PUNPCKHQDQrr 1483 +PUNPCKHWDrm 1484 +PUNPCKHWDrr 1485 +PUNPCKLBWrm 1486 +PUNPCKLBWrr 1487 +PUNPCKLDQrm 1488 +PUNPCKLDQrr 1489 +PUNPCKLQDQrm 1490 +PUNPCKLQDQrr 1491 +PUNPCKLWDrm 1492 +PUNPCKLWDrr 1493 +PUSH 1494 +PUSHA 1495 +PUSHCS 1496 +PUSHDS 1497 +PUSHES 1498 +PUSHF 1499 +PUSHFS 1500 +PUSHGS 1501 +PUSHP 1502 +PUSHSS 1503 +PVALIDATE 1504 +PXORrm 1505 +PXORrr 1506 +RCL 1507 +RCPPSm 1508 +RCPPSr 1509 +RCPSSm 1510 +RCPSSm_Int 1511 +RCPSSr 1512 +RCPSSr_Int 1513 +RCR 1514 +RDFLAGS 1515 +RDFSBASE 1516 +RDGSBASE 1517 +RDMSR 1518 +RDMSRLIST 1519 +RDMSRri 1520 +RDMSRri_EVEX 1521 +RDPID 1522 +RDPKRUr 1523 +RDPMC 1524 +RDPRU 1525 +RDRAND 1526 +RDSEED 1527 +RDSSPD 1528 +RDSSPQ 1529 +RDTSC 1530 +RDTSCP 1531 +REG_SEQUENCE 1532 +REPNE_PREFIX 1533 +REP_MOVSB 1534 
+REP_MOVSD 1535 +REP_MOVSQ 1536 +REP_MOVSW 1537 +REP_PREFIX 1538 +REP_STOSB 1539 +REP_STOSD 1540 +REP_STOSQ 1541 +REP_STOSW 1542 +RET 1543 +RETI 1544 +REX 1545 +RMPADJUST 1546 +RMPQUERY 1547 +RMPUPDATE 1548 +ROL 1549 +ROR 1550 +RORX 1551 +ROUNDPDmi 1552 +ROUNDPDri 1553 +ROUNDPSmi 1554 +ROUNDPSri 1555 +ROUNDSDmi 1556 +ROUNDSDmi_Int 1557 +ROUNDSDri 1558 +ROUNDSDri_Int 1559 +ROUNDSSmi 1560 +ROUNDSSmi_Int 1561 +ROUNDSSri 1562 +ROUNDSSri_Int 1563 +RSM 1564 +RSQRTPSm 1565 +RSQRTPSr 1566 +RSQRTSSm 1567 +RSQRTSSm_Int 1568 +RSQRTSSr 1569 +RSQRTSSr_Int 1570 +RSTORSSP 1571 +SAHF 1572 +SALC 1573 +SAR 1574 +SARX 1575 +SAVEPREVSSP 1576 +SBB 1577 +SCASB 1578 +SCASL 1579 +SCASQ 1580 +SCASW 1581 +SEAMCALL 1582 +SEAMOPS 1583 +SEAMRET 1584 +SEG_ALLOCA 1585 +SEH_BeginEpilogue 1586 +SEH_EndEpilogue 1587 +SEH_EndPrologue 1588 +SEH_PushFrame 1589 +SEH_PushReg 1590 +SEH_SaveReg 1591 +SEH_SaveXMM 1592 +SEH_SetFrame 1593 +SEH_StackAlign 1594 +SEH_StackAlloc 1595 +SEH_UnwindV 1596 +SEH_UnwindVersion 1597 +SENDUIPI 1598 +SERIALIZE 1599 +SETB_C 1600 +SETCCm 1601 +SETCCm_EVEX 1602 +SETCCr 1603 +SETCCr_EVEX 1604 +SETSSBSY 1605 +SETZUCCm 1606 +SETZUCCr 1607 +SFENCE 1608 +SGDT 1609 +SHA 1610 +SHL 1611 +SHLD 1612 +SHLDROT 1613 +SHLX 1614 +SHR 1615 +SHRD 1616 +SHRDROT 1617 +SHRX 1618 +SHUFPDrmi 1619 +SHUFPDrri 1620 +SHUFPSrmi 1621 +SHUFPSrri 1622 +SIDT 1623 +SKINIT 1624 +SLDT 1625 +SLWPCB 1626 +SMSW 1627 +SQRTPDm 1628 +SQRTPDr 1629 +SQRTPSm 1630 +SQRTPSr 1631 +SQRTSDm 1632 +SQRTSDm_Int 1633 +SQRTSDr 1634 +SQRTSDr_Int 1635 +SQRTSSm 1636 +SQRTSSm_Int 1637 +SQRTSSr 1638 +SQRTSSr_Int 1639 +SQRT_F 1640 +SQRT_Fp 1641 +SS_PREFIX 1642 +STAC 1643 +STACKALLOC_W_PROBING 1644 +STACKMAP 1645 +STATEPOINT 1646 +STC 1647 +STD 1648 +STGI 1649 +STI 1650 +STMXCSR 1651 +STOSB 1652 +STOSL 1653 +STOSQ 1654 +STOSW 1655 +STR 1656 +STRm 1657 +STTILECFG 1658 +STTILECFG_EVEX 1659 +STUI 1660 +ST_F 1661 +ST_FP 1662 +ST_FPrr 1663 +ST_Fp 1664 +ST_FpP 1665 +ST_Frr 1666 +SUB 1667 +SUBPDrm 1668 +SUBPDrr 1669 +SUBPSrm 1670 +SUBPSrr 
1671 +SUBREG_TO_REG 1672 +SUBR_F 1673 +SUBR_FI 1674 +SUBR_FPrST 1675 +SUBR_FST 1676 +SUBR_Fp 1677 +SUBR_FpI 1678 +SUBR_FrST 1679 +SUBSDrm 1680 +SUBSDrm_Int 1681 +SUBSDrr 1682 +SUBSDrr_Int 1683 +SUBSSrm 1684 +SUBSSrm_Int 1685 +SUBSSrr 1686 +SUBSSrr_Int 1687 +SUB_F 1688 +SUB_FI 1689 +SUB_FPrST 1690 +SUB_FST 1691 +SUB_Fp 1692 +SUB_FpI 1693 +SUB_FrST 1694 +SWAPGS 1695 +SYSCALL 1696 +SYSENTER 1697 +SYSEXIT 1698 +SYSRET 1699 +T 1700 +TAILJMPd 1701 +TAILJMPd_CC 1702 +TAILJMPm 1703 +TAILJMPr 1704 +TCMMIMFP 1705 +TCMMRLFP 1706 +TCRETURN_HIPE 1707 +TCRETURN_WIN 1708 +TCRETURN_WINmi 1709 +TCRETURNdi 1710 +TCRETURNdicc 1711 +TCRETURNmi 1712 +TCRETURNri 1713 +TCVTROWD 1714 +TCVTROWPS 1715 +TDCALL 1716 +TDPBF 1717 +TDPBHF 1718 +TDPBSSD 1719 +TDPBSUD 1720 +TDPBUSD 1721 +TDPBUUD 1722 +TDPFP 1723 +TDPHBF 1724 +TDPHF 1725 +TEST 1726 +TESTUI 1727 +TILELOADD 1728 +TILELOADDRS 1729 +TILELOADDRST 1730 +TILELOADDRS_EVEX 1731 +TILELOADDT 1732 +TILELOADD_EVEX 1733 +TILEMOVROWrre 1734 +TILEMOVROWrri 1735 +TILERELEASE 1736 +TILESTORED 1737 +TILESTORED_EVEX 1738 +TILEZERO 1739 +TLBSYNC 1740 +TLSCall 1741 +TLS_addr 1742 +TLS_addrX 1743 +TLS_base_addr 1744 +TLS_base_addrX 1745 +TLS_desc 1746 +TMMULTF 1747 +TPAUSE 1748 +TRAP 1749 +TST_F 1750 +TST_Fp 1751 +TZCNT 1752 +TZMSK 1753 +UBSAN_UD 1754 +UCOMISDrm 1755 +UCOMISDrm_Int 1756 +UCOMISDrr 1757 +UCOMISDrr_Int 1758 +UCOMISSrm 1759 +UCOMISSrm_Int 1760 +UCOMISSrr 1761 +UCOMISSrr_Int 1762 +UCOM_FIPr 1763 +UCOM_FIr 1764 +UCOM_FPPr 1765 +UCOM_FPr 1766 +UCOM_FpIr 1767 +UCOM_Fpr 1768 +UCOM_Fr 1769 +UD 1770 +UIRET 1771 +UMONITOR 1772 +UMWAIT 1773 +UNPCKHPDrm 1774 +UNPCKHPDrr 1775 +UNPCKHPSrm 1776 +UNPCKHPSrr 1777 +UNPCKLPDrm 1778 +UNPCKLPDrr 1779 +UNPCKLPSrm 1780 +UNPCKLPSrr 1781 +URDMSRri 1782 +URDMSRri_EVEX 1783 +URDMSRrr 1784 +URDMSRrr_EVEX 1785 +UWRMSRir 1786 +UWRMSRir_EVEX 1787 +UWRMSRrr 1788 +UWRMSRrr_EVEX 1789 +V 1790 +VAARG 1791 +VAARG_X 1792 +VADDBF 1793 +VADDPDYrm 1794 +VADDPDYrr 1795 +VADDPDZ 1796 +VADDPDZrm 1797 +VADDPDZrmb 1798 +VADDPDZrmbk 
1799 +VADDPDZrmbkz 1800 +VADDPDZrmk 1801 +VADDPDZrmkz 1802 +VADDPDZrr 1803 +VADDPDZrrb 1804 +VADDPDZrrbk 1805 +VADDPDZrrbkz 1806 +VADDPDZrrk 1807 +VADDPDZrrkz 1808 +VADDPDrm 1809 +VADDPDrr 1810 +VADDPHZ 1811 +VADDPHZrm 1812 +VADDPHZrmb 1813 +VADDPHZrmbk 1814 +VADDPHZrmbkz 1815 +VADDPHZrmk 1816 +VADDPHZrmkz 1817 +VADDPHZrr 1818 +VADDPHZrrb 1819 +VADDPHZrrbk 1820 +VADDPHZrrbkz 1821 +VADDPHZrrk 1822 +VADDPHZrrkz 1823 +VADDPSYrm 1824 +VADDPSYrr 1825 +VADDPSZ 1826 +VADDPSZrm 1827 +VADDPSZrmb 1828 +VADDPSZrmbk 1829 +VADDPSZrmbkz 1830 +VADDPSZrmk 1831 +VADDPSZrmkz 1832 +VADDPSZrr 1833 +VADDPSZrrb 1834 +VADDPSZrrbk 1835 +VADDPSZrrbkz 1836 +VADDPSZrrk 1837 +VADDPSZrrkz 1838 +VADDPSrm 1839 +VADDPSrr 1840 +VADDSDZrm 1841 +VADDSDZrm_Int 1842 +VADDSDZrmk_Int 1843 +VADDSDZrmkz_Int 1844 +VADDSDZrr 1845 +VADDSDZrr_Int 1846 +VADDSDZrrb_Int 1847 +VADDSDZrrbk_Int 1848 +VADDSDZrrbkz_Int 1849 +VADDSDZrrk_Int 1850 +VADDSDZrrkz_Int 1851 +VADDSDrm 1852 +VADDSDrm_Int 1853 +VADDSDrr 1854 +VADDSDrr_Int 1855 +VADDSHZrm 1856 +VADDSHZrm_Int 1857 +VADDSHZrmk_Int 1858 +VADDSHZrmkz_Int 1859 +VADDSHZrr 1860 +VADDSHZrr_Int 1861 +VADDSHZrrb_Int 1862 +VADDSHZrrbk_Int 1863 +VADDSHZrrbkz_Int 1864 +VADDSHZrrk_Int 1865 +VADDSHZrrkz_Int 1866 +VADDSSZrm 1867 +VADDSSZrm_Int 1868 +VADDSSZrmk_Int 1869 +VADDSSZrmkz_Int 1870 +VADDSSZrr 1871 +VADDSSZrr_Int 1872 +VADDSSZrrb_Int 1873 +VADDSSZrrbk_Int 1874 +VADDSSZrrbkz_Int 1875 +VADDSSZrrk_Int 1876 +VADDSSZrrkz_Int 1877 +VADDSSrm 1878 +VADDSSrm_Int 1879 +VADDSSrr 1880 +VADDSSrr_Int 1881 +VADDSUBPDYrm 1882 +VADDSUBPDYrr 1883 +VADDSUBPDrm 1884 +VADDSUBPDrr 1885 +VADDSUBPSYrm 1886 +VADDSUBPSYrr 1887 +VADDSUBPSrm 1888 +VADDSUBPSrr 1889 +VAESDECLASTYrm 1890 +VAESDECLASTYrr 1891 +VAESDECLASTZ 1892 +VAESDECLASTZrm 1893 +VAESDECLASTZrr 1894 +VAESDECLASTrm 1895 +VAESDECLASTrr 1896 +VAESDECYrm 1897 +VAESDECYrr 1898 +VAESDECZ 1899 +VAESDECZrm 1900 +VAESDECZrr 1901 +VAESDECrm 1902 +VAESDECrr 1903 +VAESENCLASTYrm 1904 +VAESENCLASTYrr 1905 +VAESENCLASTZ 1906 +VAESENCLASTZrm 1907 
+VAESENCLASTZrr 1908 +VAESENCLASTrm 1909 +VAESENCLASTrr 1910 +VAESENCYrm 1911 +VAESENCYrr 1912 +VAESENCZ 1913 +VAESENCZrm 1914 +VAESENCZrr 1915 +VAESENCrm 1916 +VAESENCrr 1917 +VAESIMCrm 1918 +VAESIMCrr 1919 +VAESKEYGENASSISTrmi 1920 +VAESKEYGENASSISTrri 1921 +VALIGNDZ 1922 +VALIGNDZrmbi 1923 +VALIGNDZrmbik 1924 +VALIGNDZrmbikz 1925 +VALIGNDZrmi 1926 +VALIGNDZrmik 1927 +VALIGNDZrmikz 1928 +VALIGNDZrri 1929 +VALIGNDZrrik 1930 +VALIGNDZrrikz 1931 +VALIGNQZ 1932 +VALIGNQZrmbi 1933 +VALIGNQZrmbik 1934 +VALIGNQZrmbikz 1935 +VALIGNQZrmi 1936 +VALIGNQZrmik 1937 +VALIGNQZrmikz 1938 +VALIGNQZrri 1939 +VALIGNQZrrik 1940 +VALIGNQZrrikz 1941 +VANDNPDYrm 1942 +VANDNPDYrr 1943 +VANDNPDZ 1944 +VANDNPDZrm 1945 +VANDNPDZrmb 1946 +VANDNPDZrmbk 1947 +VANDNPDZrmbkz 1948 +VANDNPDZrmk 1949 +VANDNPDZrmkz 1950 +VANDNPDZrr 1951 +VANDNPDZrrk 1952 +VANDNPDZrrkz 1953 +VANDNPDrm 1954 +VANDNPDrr 1955 +VANDNPSYrm 1956 +VANDNPSYrr 1957 +VANDNPSZ 1958 +VANDNPSZrm 1959 +VANDNPSZrmb 1960 +VANDNPSZrmbk 1961 +VANDNPSZrmbkz 1962 +VANDNPSZrmk 1963 +VANDNPSZrmkz 1964 +VANDNPSZrr 1965 +VANDNPSZrrk 1966 +VANDNPSZrrkz 1967 +VANDNPSrm 1968 +VANDNPSrr 1969 +VANDPDYrm 1970 +VANDPDYrr 1971 +VANDPDZ 1972 +VANDPDZrm 1973 +VANDPDZrmb 1974 +VANDPDZrmbk 1975 +VANDPDZrmbkz 1976 +VANDPDZrmk 1977 +VANDPDZrmkz 1978 +VANDPDZrr 1979 +VANDPDZrrk 1980 +VANDPDZrrkz 1981 +VANDPDrm 1982 +VANDPDrr 1983 +VANDPSYrm 1984 +VANDPSYrr 1985 +VANDPSZ 1986 +VANDPSZrm 1987 +VANDPSZrmb 1988 +VANDPSZrmbk 1989 +VANDPSZrmbkz 1990 +VANDPSZrmk 1991 +VANDPSZrmkz 1992 +VANDPSZrr 1993 +VANDPSZrrk 1994 +VANDPSZrrkz 1995 +VANDPSrm 1996 +VANDPSrr 1997 +VASTART_SAVE_XMM_REGS 1998 +VBCSTNEBF 1999 +VBCSTNESH 2000 +VBLENDMPDZ 2001 +VBLENDMPDZrm 2002 +VBLENDMPDZrmb 2003 +VBLENDMPDZrmbk 2004 +VBLENDMPDZrmbkz 2005 +VBLENDMPDZrmk 2006 +VBLENDMPDZrmkz 2007 +VBLENDMPDZrr 2008 +VBLENDMPDZrrk 2009 +VBLENDMPDZrrkz 2010 +VBLENDMPSZ 2011 +VBLENDMPSZrm 2012 +VBLENDMPSZrmb 2013 +VBLENDMPSZrmbk 2014 +VBLENDMPSZrmbkz 2015 +VBLENDMPSZrmk 2016 +VBLENDMPSZrmkz 2017 
+VBLENDMPSZrr 2018 +VBLENDMPSZrrk 2019 +VBLENDMPSZrrkz 2020 +VBLENDPDYrmi 2021 +VBLENDPDYrri 2022 +VBLENDPDrmi 2023 +VBLENDPDrri 2024 +VBLENDPSYrmi 2025 +VBLENDPSYrri 2026 +VBLENDPSrmi 2027 +VBLENDPSrri 2028 +VBLENDVPDYrmr 2029 +VBLENDVPDYrrr 2030 +VBLENDVPDrmr 2031 +VBLENDVPDrrr 2032 +VBLENDVPSYrmr 2033 +VBLENDVPSYrrr 2034 +VBLENDVPSrmr 2035 +VBLENDVPSrrr 2036 +VBROADCASTF 2037 +VBROADCASTI 2038 +VBROADCASTSDYrm 2039 +VBROADCASTSDYrr 2040 +VBROADCASTSDZ 2041 +VBROADCASTSDZrm 2042 +VBROADCASTSDZrmk 2043 +VBROADCASTSDZrmkz 2044 +VBROADCASTSDZrr 2045 +VBROADCASTSDZrrk 2046 +VBROADCASTSDZrrkz 2047 +VBROADCASTSSYrm 2048 +VBROADCASTSSYrr 2049 +VBROADCASTSSZ 2050 +VBROADCASTSSZrm 2051 +VBROADCASTSSZrmk 2052 +VBROADCASTSSZrmkz 2053 +VBROADCASTSSZrr 2054 +VBROADCASTSSZrrk 2055 +VBROADCASTSSZrrkz 2056 +VBROADCASTSSrm 2057 +VBROADCASTSSrr 2058 +VCMPBF 2059 +VCMPPDYrmi 2060 +VCMPPDYrri 2061 +VCMPPDZ 2062 +VCMPPDZrmbi 2063 +VCMPPDZrmbik 2064 +VCMPPDZrmi 2065 +VCMPPDZrmik 2066 +VCMPPDZrri 2067 +VCMPPDZrrib 2068 +VCMPPDZrribk 2069 +VCMPPDZrrik 2070 +VCMPPDrmi 2071 +VCMPPDrri 2072 +VCMPPHZ 2073 +VCMPPHZrmbi 2074 +VCMPPHZrmbik 2075 +VCMPPHZrmi 2076 +VCMPPHZrmik 2077 +VCMPPHZrri 2078 +VCMPPHZrrib 2079 +VCMPPHZrribk 2080 +VCMPPHZrrik 2081 +VCMPPSYrmi 2082 +VCMPPSYrri 2083 +VCMPPSZ 2084 +VCMPPSZrmbi 2085 +VCMPPSZrmbik 2086 +VCMPPSZrmi 2087 +VCMPPSZrmik 2088 +VCMPPSZrri 2089 +VCMPPSZrrib 2090 +VCMPPSZrribk 2091 +VCMPPSZrrik 2092 +VCMPPSrmi 2093 +VCMPPSrri 2094 +VCMPSDZrmi 2095 +VCMPSDZrmi_Int 2096 +VCMPSDZrmik_Int 2097 +VCMPSDZrri 2098 +VCMPSDZrri_Int 2099 +VCMPSDZrrib_Int 2100 +VCMPSDZrribk_Int 2101 +VCMPSDZrrik_Int 2102 +VCMPSDrmi 2103 +VCMPSDrmi_Int 2104 +VCMPSDrri 2105 +VCMPSDrri_Int 2106 +VCMPSHZrmi 2107 +VCMPSHZrmi_Int 2108 +VCMPSHZrmik_Int 2109 +VCMPSHZrri 2110 +VCMPSHZrri_Int 2111 +VCMPSHZrrib_Int 2112 +VCMPSHZrribk_Int 2113 +VCMPSHZrrik_Int 2114 +VCMPSSZrmi 2115 +VCMPSSZrmi_Int 2116 +VCMPSSZrmik_Int 2117 +VCMPSSZrri 2118 +VCMPSSZrri_Int 2119 +VCMPSSZrrib_Int 2120 
+VCMPSSZrribk_Int 2121 +VCMPSSZrrik_Int 2122 +VCMPSSrmi 2123 +VCMPSSrmi_Int 2124 +VCMPSSrri 2125 +VCMPSSrri_Int 2126 +VCOMISBF 2127 +VCOMISDZrm 2128 +VCOMISDZrm_Int 2129 +VCOMISDZrr 2130 +VCOMISDZrr_Int 2131 +VCOMISDZrrb 2132 +VCOMISDrm 2133 +VCOMISDrm_Int 2134 +VCOMISDrr 2135 +VCOMISDrr_Int 2136 +VCOMISHZrm 2137 +VCOMISHZrm_Int 2138 +VCOMISHZrr 2139 +VCOMISHZrr_Int 2140 +VCOMISHZrrb 2141 +VCOMISSZrm 2142 +VCOMISSZrm_Int 2143 +VCOMISSZrr 2144 +VCOMISSZrr_Int 2145 +VCOMISSZrrb 2146 +VCOMISSrm 2147 +VCOMISSrm_Int 2148 +VCOMISSrr 2149 +VCOMISSrr_Int 2150 +VCOMPRESSPDZ 2151 +VCOMPRESSPDZmr 2152 +VCOMPRESSPDZmrk 2153 +VCOMPRESSPDZrr 2154 +VCOMPRESSPDZrrk 2155 +VCOMPRESSPDZrrkz 2156 +VCOMPRESSPSZ 2157 +VCOMPRESSPSZmr 2158 +VCOMPRESSPSZmrk 2159 +VCOMPRESSPSZrr 2160 +VCOMPRESSPSZrrk 2161 +VCOMPRESSPSZrrkz 2162 +VCOMXSDZrm_Int 2163 +VCOMXSDZrr_Int 2164 +VCOMXSDZrrb_Int 2165 +VCOMXSHZrm_Int 2166 +VCOMXSHZrr_Int 2167 +VCOMXSHZrrb_Int 2168 +VCOMXSSZrm_Int 2169 +VCOMXSSZrr_Int 2170 +VCOMXSSZrrb_Int 2171 +VCVT 2172 +VCVTBF 2173 +VCVTBIASPH 2174 +VCVTDQ 2175 +VCVTHF 2176 +VCVTNE 2177 +VCVTNEEBF 2178 +VCVTNEEPH 2179 +VCVTNEOBF 2180 +VCVTNEOPH 2181 +VCVTNEPS 2182 +VCVTPD 2183 +VCVTPH 2184 +VCVTPS 2185 +VCVTQQ 2186 +VCVTSD 2187 +VCVTSH 2188 +VCVTSI 2189 +VCVTSS 2190 +VCVTTBF 2191 +VCVTTPD 2192 +VCVTTPH 2193 +VCVTTPS 2194 +VCVTTSD 2195 +VCVTTSH 2196 +VCVTTSS 2197 +VCVTUDQ 2198 +VCVTUQQ 2199 +VCVTUSI 2200 +VCVTUW 2201 +VCVTW 2202 +VDBPSADBWZ 2203 +VDBPSADBWZrmi 2204 +VDBPSADBWZrmik 2205 +VDBPSADBWZrmikz 2206 +VDBPSADBWZrri 2207 +VDBPSADBWZrrik 2208 +VDBPSADBWZrrikz 2209 +VDIVBF 2210 +VDIVPDYrm 2211 +VDIVPDYrr 2212 +VDIVPDZ 2213 +VDIVPDZrm 2214 +VDIVPDZrmb 2215 +VDIVPDZrmbk 2216 +VDIVPDZrmbkz 2217 +VDIVPDZrmk 2218 +VDIVPDZrmkz 2219 +VDIVPDZrr 2220 +VDIVPDZrrb 2221 +VDIVPDZrrbk 2222 +VDIVPDZrrbkz 2223 +VDIVPDZrrk 2224 +VDIVPDZrrkz 2225 +VDIVPDrm 2226 +VDIVPDrr 2227 +VDIVPHZ 2228 +VDIVPHZrm 2229 +VDIVPHZrmb 2230 +VDIVPHZrmbk 2231 +VDIVPHZrmbkz 2232 +VDIVPHZrmk 2233 +VDIVPHZrmkz 2234 
+VDIVPHZrr 2235 +VDIVPHZrrb 2236 +VDIVPHZrrbk 2237 +VDIVPHZrrbkz 2238 +VDIVPHZrrk 2239 +VDIVPHZrrkz 2240 +VDIVPSYrm 2241 +VDIVPSYrr 2242 +VDIVPSZ 2243 +VDIVPSZrm 2244 +VDIVPSZrmb 2245 +VDIVPSZrmbk 2246 +VDIVPSZrmbkz 2247 +VDIVPSZrmk 2248 +VDIVPSZrmkz 2249 +VDIVPSZrr 2250 +VDIVPSZrrb 2251 +VDIVPSZrrbk 2252 +VDIVPSZrrbkz 2253 +VDIVPSZrrk 2254 +VDIVPSZrrkz 2255 +VDIVPSrm 2256 +VDIVPSrr 2257 +VDIVSDZrm 2258 +VDIVSDZrm_Int 2259 +VDIVSDZrmk_Int 2260 +VDIVSDZrmkz_Int 2261 +VDIVSDZrr 2262 +VDIVSDZrr_Int 2263 +VDIVSDZrrb_Int 2264 +VDIVSDZrrbk_Int 2265 +VDIVSDZrrbkz_Int 2266 +VDIVSDZrrk_Int 2267 +VDIVSDZrrkz_Int 2268 +VDIVSDrm 2269 +VDIVSDrm_Int 2270 +VDIVSDrr 2271 +VDIVSDrr_Int 2272 +VDIVSHZrm 2273 +VDIVSHZrm_Int 2274 +VDIVSHZrmk_Int 2275 +VDIVSHZrmkz_Int 2276 +VDIVSHZrr 2277 +VDIVSHZrr_Int 2278 +VDIVSHZrrb_Int 2279 +VDIVSHZrrbk_Int 2280 +VDIVSHZrrbkz_Int 2281 +VDIVSHZrrk_Int 2282 +VDIVSHZrrkz_Int 2283 +VDIVSSZrm 2284 +VDIVSSZrm_Int 2285 +VDIVSSZrmk_Int 2286 +VDIVSSZrmkz_Int 2287 +VDIVSSZrr 2288 +VDIVSSZrr_Int 2289 +VDIVSSZrrb_Int 2290 +VDIVSSZrrbk_Int 2291 +VDIVSSZrrbkz_Int 2292 +VDIVSSZrrk_Int 2293 +VDIVSSZrrkz_Int 2294 +VDIVSSrm 2295 +VDIVSSrm_Int 2296 +VDIVSSrr 2297 +VDIVSSrr_Int 2298 +VDPBF 2299 +VDPPDrmi 2300 +VDPPDrri 2301 +VDPPHPSZ 2302 +VDPPHPSZm 2303 +VDPPHPSZmb 2304 +VDPPHPSZmbk 2305 +VDPPHPSZmbkz 2306 +VDPPHPSZmk 2307 +VDPPHPSZmkz 2308 +VDPPHPSZr 2309 +VDPPHPSZrk 2310 +VDPPHPSZrkz 2311 +VDPPSYrmi 2312 +VDPPSYrri 2313 +VDPPSrmi 2314 +VDPPSrri 2315 +VERRm 2316 +VERRr 2317 +VERWm 2318 +VERWr 2319 +VEXP 2320 +VEXPANDPDZ 2321 +VEXPANDPDZrm 2322 +VEXPANDPDZrmk 2323 +VEXPANDPDZrmkz 2324 +VEXPANDPDZrr 2325 +VEXPANDPDZrrk 2326 +VEXPANDPDZrrkz 2327 +VEXPANDPSZ 2328 +VEXPANDPSZrm 2329 +VEXPANDPSZrmk 2330 +VEXPANDPSZrmkz 2331 +VEXPANDPSZrr 2332 +VEXPANDPSZrrk 2333 +VEXPANDPSZrrkz 2334 +VEXTRACTF 2335 +VEXTRACTI 2336 +VEXTRACTPSZmri 2337 +VEXTRACTPSZrri 2338 +VEXTRACTPSmri 2339 +VEXTRACTPSrri 2340 +VFCMADDCPHZ 2341 +VFCMADDCPHZm 2342 +VFCMADDCPHZmb 2343 +VFCMADDCPHZmbk 2344 
+VFCMADDCPHZmbkz 2345 +VFCMADDCPHZmk 2346 +VFCMADDCPHZmkz 2347 +VFCMADDCPHZr 2348 +VFCMADDCPHZrb 2349 +VFCMADDCPHZrbk 2350 +VFCMADDCPHZrbkz 2351 +VFCMADDCPHZrk 2352 +VFCMADDCPHZrkz 2353 +VFCMADDCSHZm 2354 +VFCMADDCSHZmk 2355 +VFCMADDCSHZmkz 2356 +VFCMADDCSHZr 2357 +VFCMADDCSHZrb 2358 +VFCMADDCSHZrbk 2359 +VFCMADDCSHZrbkz 2360 +VFCMADDCSHZrk 2361 +VFCMADDCSHZrkz 2362 +VFCMULCPHZ 2363 +VFCMULCPHZrm 2364 +VFCMULCPHZrmb 2365 +VFCMULCPHZrmbk 2366 +VFCMULCPHZrmbkz 2367 +VFCMULCPHZrmk 2368 +VFCMULCPHZrmkz 2369 +VFCMULCPHZrr 2370 +VFCMULCPHZrrb 2371 +VFCMULCPHZrrbk 2372 +VFCMULCPHZrrbkz 2373 +VFCMULCPHZrrk 2374 +VFCMULCPHZrrkz 2375 +VFCMULCSHZrm 2376 +VFCMULCSHZrmk 2377 +VFCMULCSHZrmkz 2378 +VFCMULCSHZrr 2379 +VFCMULCSHZrrb 2380 +VFCMULCSHZrrbk 2381 +VFCMULCSHZrrbkz 2382 +VFCMULCSHZrrk 2383 +VFCMULCSHZrrkz 2384 +VFIXUPIMMPDZ 2385 +VFIXUPIMMPDZrmbi 2386 +VFIXUPIMMPDZrmbik 2387 +VFIXUPIMMPDZrmbikz 2388 +VFIXUPIMMPDZrmi 2389 +VFIXUPIMMPDZrmik 2390 +VFIXUPIMMPDZrmikz 2391 +VFIXUPIMMPDZrri 2392 +VFIXUPIMMPDZrrib 2393 +VFIXUPIMMPDZrribk 2394 +VFIXUPIMMPDZrribkz 2395 +VFIXUPIMMPDZrrik 2396 +VFIXUPIMMPDZrrikz 2397 +VFIXUPIMMPSZ 2398 +VFIXUPIMMPSZrmbi 2399 +VFIXUPIMMPSZrmbik 2400 +VFIXUPIMMPSZrmbikz 2401 +VFIXUPIMMPSZrmi 2402 +VFIXUPIMMPSZrmik 2403 +VFIXUPIMMPSZrmikz 2404 +VFIXUPIMMPSZrri 2405 +VFIXUPIMMPSZrrib 2406 +VFIXUPIMMPSZrribk 2407 +VFIXUPIMMPSZrribkz 2408 +VFIXUPIMMPSZrrik 2409 +VFIXUPIMMPSZrrikz 2410 +VFIXUPIMMSDZrmi 2411 +VFIXUPIMMSDZrmik 2412 +VFIXUPIMMSDZrmikz 2413 +VFIXUPIMMSDZrri 2414 +VFIXUPIMMSDZrrib 2415 +VFIXUPIMMSDZrribk 2416 +VFIXUPIMMSDZrribkz 2417 +VFIXUPIMMSDZrrik 2418 +VFIXUPIMMSDZrrikz 2419 +VFIXUPIMMSSZrmi 2420 +VFIXUPIMMSSZrmik 2421 +VFIXUPIMMSSZrmikz 2422 +VFIXUPIMMSSZrri 2423 +VFIXUPIMMSSZrrib 2424 +VFIXUPIMMSSZrribk 2425 +VFIXUPIMMSSZrribkz 2426 +VFIXUPIMMSSZrrik 2427 +VFIXUPIMMSSZrrikz 2428 +VFMADD 2429 +VFMADDCPHZ 2430 +VFMADDCPHZm 2431 +VFMADDCPHZmb 2432 +VFMADDCPHZmbk 2433 +VFMADDCPHZmbkz 2434 +VFMADDCPHZmk 2435 +VFMADDCPHZmkz 2436 +VFMADDCPHZr 
2437 +VFMADDCPHZrb 2438 +VFMADDCPHZrbk 2439 +VFMADDCPHZrbkz 2440 +VFMADDCPHZrk 2441 +VFMADDCPHZrkz 2442 +VFMADDCSHZm 2443 +VFMADDCSHZmk 2444 +VFMADDCSHZmkz 2445 +VFMADDCSHZr 2446 +VFMADDCSHZrb 2447 +VFMADDCSHZrbk 2448 +VFMADDCSHZrbkz 2449 +VFMADDCSHZrk 2450 +VFMADDCSHZrkz 2451 +VFMADDPD 2452 +VFMADDPS 2453 +VFMADDSD 2454 +VFMADDSS 2455 +VFMADDSUB 2456 +VFMADDSUBPD 2457 +VFMADDSUBPS 2458 +VFMSUB 2459 +VFMSUBADD 2460 +VFMSUBADDPD 2461 +VFMSUBADDPS 2462 +VFMSUBPD 2463 +VFMSUBPS 2464 +VFMSUBSD 2465 +VFMSUBSS 2466 +VFMULCPHZ 2467 +VFMULCPHZrm 2468 +VFMULCPHZrmb 2469 +VFMULCPHZrmbk 2470 +VFMULCPHZrmbkz 2471 +VFMULCPHZrmk 2472 +VFMULCPHZrmkz 2473 +VFMULCPHZrr 2474 +VFMULCPHZrrb 2475 +VFMULCPHZrrbk 2476 +VFMULCPHZrrbkz 2477 +VFMULCPHZrrk 2478 +VFMULCPHZrrkz 2479 +VFMULCSHZrm 2480 +VFMULCSHZrmk 2481 +VFMULCSHZrmkz 2482 +VFMULCSHZrr 2483 +VFMULCSHZrrb 2484 +VFMULCSHZrrbk 2485 +VFMULCSHZrrbkz 2486 +VFMULCSHZrrk 2487 +VFMULCSHZrrkz 2488 +VFNMADD 2489 +VFNMADDPD 2490 +VFNMADDPS 2491 +VFNMADDSD 2492 +VFNMADDSS 2493 +VFNMSUB 2494 +VFNMSUBPD 2495 +VFNMSUBPS 2496 +VFNMSUBSD 2497 +VFNMSUBSS 2498 +VFPCLASSBF 2499 +VFPCLASSPDZ 2500 +VFPCLASSPDZmbi 2501 +VFPCLASSPDZmbik 2502 +VFPCLASSPDZmi 2503 +VFPCLASSPDZmik 2504 +VFPCLASSPDZri 2505 +VFPCLASSPDZrik 2506 +VFPCLASSPHZ 2507 +VFPCLASSPHZmbi 2508 +VFPCLASSPHZmbik 2509 +VFPCLASSPHZmi 2510 +VFPCLASSPHZmik 2511 +VFPCLASSPHZri 2512 +VFPCLASSPHZrik 2513 +VFPCLASSPSZ 2514 +VFPCLASSPSZmbi 2515 +VFPCLASSPSZmbik 2516 +VFPCLASSPSZmi 2517 +VFPCLASSPSZmik 2518 +VFPCLASSPSZri 2519 +VFPCLASSPSZrik 2520 +VFPCLASSSDZmi 2521 +VFPCLASSSDZmik 2522 +VFPCLASSSDZri 2523 +VFPCLASSSDZrik 2524 +VFPCLASSSHZmi 2525 +VFPCLASSSHZmik 2526 +VFPCLASSSHZri 2527 +VFPCLASSSHZrik 2528 +VFPCLASSSSZmi 2529 +VFPCLASSSSZmik 2530 +VFPCLASSSSZri 2531 +VFPCLASSSSZrik 2532 +VFRCZPDYrm 2533 +VFRCZPDYrr 2534 +VFRCZPDrm 2535 +VFRCZPDrr 2536 +VFRCZPSYrm 2537 +VFRCZPSYrr 2538 +VFRCZPSrm 2539 +VFRCZPSrr 2540 +VFRCZSDrm 2541 +VFRCZSDrr 2542 +VFRCZSSrm 2543 +VFRCZSSrr 2544 +VGATHERDPDYrm 
2545 +VGATHERDPDZ 2546 +VGATHERDPDZrm 2547 +VGATHERDPDrm 2548 +VGATHERDPSYrm 2549 +VGATHERDPSZ 2550 +VGATHERDPSZrm 2551 +VGATHERDPSrm 2552 +VGATHERPF 2553 +VGATHERQPDYrm 2554 +VGATHERQPDZ 2555 +VGATHERQPDZrm 2556 +VGATHERQPDrm 2557 +VGATHERQPSYrm 2558 +VGATHERQPSZ 2559 +VGATHERQPSZrm 2560 +VGATHERQPSrm 2561 +VGETEXPBF 2562 +VGETEXPPDZ 2563 +VGETEXPPDZm 2564 +VGETEXPPDZmb 2565 +VGETEXPPDZmbk 2566 +VGETEXPPDZmbkz 2567 +VGETEXPPDZmk 2568 +VGETEXPPDZmkz 2569 +VGETEXPPDZr 2570 +VGETEXPPDZrb 2571 +VGETEXPPDZrbk 2572 +VGETEXPPDZrbkz 2573 +VGETEXPPDZrk 2574 +VGETEXPPDZrkz 2575 +VGETEXPPHZ 2576 +VGETEXPPHZm 2577 +VGETEXPPHZmb 2578 +VGETEXPPHZmbk 2579 +VGETEXPPHZmbkz 2580 +VGETEXPPHZmk 2581 +VGETEXPPHZmkz 2582 +VGETEXPPHZr 2583 +VGETEXPPHZrb 2584 +VGETEXPPHZrbk 2585 +VGETEXPPHZrbkz 2586 +VGETEXPPHZrk 2587 +VGETEXPPHZrkz 2588 +VGETEXPPSZ 2589 +VGETEXPPSZm 2590 +VGETEXPPSZmb 2591 +VGETEXPPSZmbk 2592 +VGETEXPPSZmbkz 2593 +VGETEXPPSZmk 2594 +VGETEXPPSZmkz 2595 +VGETEXPPSZr 2596 +VGETEXPPSZrb 2597 +VGETEXPPSZrbk 2598 +VGETEXPPSZrbkz 2599 +VGETEXPPSZrk 2600 +VGETEXPPSZrkz 2601 +VGETEXPSDZm 2602 +VGETEXPSDZmk 2603 +VGETEXPSDZmkz 2604 +VGETEXPSDZr 2605 +VGETEXPSDZrb 2606 +VGETEXPSDZrbk 2607 +VGETEXPSDZrbkz 2608 +VGETEXPSDZrk 2609 +VGETEXPSDZrkz 2610 +VGETEXPSHZm 2611 +VGETEXPSHZmk 2612 +VGETEXPSHZmkz 2613 +VGETEXPSHZr 2614 +VGETEXPSHZrb 2615 +VGETEXPSHZrbk 2616 +VGETEXPSHZrbkz 2617 +VGETEXPSHZrk 2618 +VGETEXPSHZrkz 2619 +VGETEXPSSZm 2620 +VGETEXPSSZmk 2621 +VGETEXPSSZmkz 2622 +VGETEXPSSZr 2623 +VGETEXPSSZrb 2624 +VGETEXPSSZrbk 2625 +VGETEXPSSZrbkz 2626 +VGETEXPSSZrk 2627 +VGETEXPSSZrkz 2628 +VGETMANTBF 2629 +VGETMANTPDZ 2630 +VGETMANTPDZrmbi 2631 +VGETMANTPDZrmbik 2632 +VGETMANTPDZrmbikz 2633 +VGETMANTPDZrmi 2634 +VGETMANTPDZrmik 2635 +VGETMANTPDZrmikz 2636 +VGETMANTPDZrri 2637 +VGETMANTPDZrrib 2638 +VGETMANTPDZrribk 2639 +VGETMANTPDZrribkz 2640 +VGETMANTPDZrrik 2641 +VGETMANTPDZrrikz 2642 +VGETMANTPHZ 2643 +VGETMANTPHZrmbi 2644 +VGETMANTPHZrmbik 2645 +VGETMANTPHZrmbikz 2646 
+VGETMANTPHZrmi 2647 +VGETMANTPHZrmik 2648 +VGETMANTPHZrmikz 2649 +VGETMANTPHZrri 2650 +VGETMANTPHZrrib 2651 +VGETMANTPHZrribk 2652 +VGETMANTPHZrribkz 2653 +VGETMANTPHZrrik 2654 +VGETMANTPHZrrikz 2655 +VGETMANTPSZ 2656 +VGETMANTPSZrmbi 2657 +VGETMANTPSZrmbik 2658 +VGETMANTPSZrmbikz 2659 +VGETMANTPSZrmi 2660 +VGETMANTPSZrmik 2661 +VGETMANTPSZrmikz 2662 +VGETMANTPSZrri 2663 +VGETMANTPSZrrib 2664 +VGETMANTPSZrribk 2665 +VGETMANTPSZrribkz 2666 +VGETMANTPSZrrik 2667 +VGETMANTPSZrrikz 2668 +VGETMANTSDZrmi 2669 +VGETMANTSDZrmik 2670 +VGETMANTSDZrmikz 2671 +VGETMANTSDZrri 2672 +VGETMANTSDZrrib 2673 +VGETMANTSDZrribk 2674 +VGETMANTSDZrribkz 2675 +VGETMANTSDZrrik 2676 +VGETMANTSDZrrikz 2677 +VGETMANTSHZrmi 2678 +VGETMANTSHZrmik 2679 +VGETMANTSHZrmikz 2680 +VGETMANTSHZrri 2681 +VGETMANTSHZrrib 2682 +VGETMANTSHZrribk 2683 +VGETMANTSHZrribkz 2684 +VGETMANTSHZrrik 2685 +VGETMANTSHZrrikz 2686 +VGETMANTSSZrmi 2687 +VGETMANTSSZrmik 2688 +VGETMANTSSZrmikz 2689 +VGETMANTSSZrri 2690 +VGETMANTSSZrrib 2691 +VGETMANTSSZrribk 2692 +VGETMANTSSZrribkz 2693 +VGETMANTSSZrrik 2694 +VGETMANTSSZrrikz 2695 +VGF 2696 +VHADDPDYrm 2697 +VHADDPDYrr 2698 +VHADDPDrm 2699 +VHADDPDrr 2700 +VHADDPSYrm 2701 +VHADDPSYrr 2702 +VHADDPSrm 2703 +VHADDPSrr 2704 +VHSUBPDYrm 2705 +VHSUBPDYrr 2706 +VHSUBPDrm 2707 +VHSUBPDrr 2708 +VHSUBPSYrm 2709 +VHSUBPSYrr 2710 +VHSUBPSrm 2711 +VHSUBPSrr 2712 +VINSERTF 2713 +VINSERTI 2714 +VINSERTPSZrmi 2715 +VINSERTPSZrri 2716 +VINSERTPSrmi 2717 +VINSERTPSrri 2718 +VLDDQUYrm 2719 +VLDDQUrm 2720 +VLDMXCSR 2721 +VMASKMOVDQU 2722 +VMASKMOVPDYmr 2723 +VMASKMOVPDYrm 2724 +VMASKMOVPDmr 2725 +VMASKMOVPDrm 2726 +VMASKMOVPSYmr 2727 +VMASKMOVPSYrm 2728 +VMASKMOVPSmr 2729 +VMASKMOVPSrm 2730 +VMAXBF 2731 +VMAXCPDYrm 2732 +VMAXCPDYrr 2733 +VMAXCPDZ 2734 +VMAXCPDZrm 2735 +VMAXCPDZrmb 2736 +VMAXCPDZrmbk 2737 +VMAXCPDZrmbkz 2738 +VMAXCPDZrmk 2739 +VMAXCPDZrmkz 2740 +VMAXCPDZrr 2741 +VMAXCPDZrrk 2742 +VMAXCPDZrrkz 2743 +VMAXCPDrm 2744 +VMAXCPDrr 2745 +VMAXCPHZ 2746 +VMAXCPHZrm 2747 +VMAXCPHZrmb 
2748 +VMAXCPHZrmbk 2749 +VMAXCPHZrmbkz 2750 +VMAXCPHZrmk 2751 +VMAXCPHZrmkz 2752 +VMAXCPHZrr 2753 +VMAXCPHZrrk 2754 +VMAXCPHZrrkz 2755 +VMAXCPSYrm 2756 +VMAXCPSYrr 2757 +VMAXCPSZ 2758 +VMAXCPSZrm 2759 +VMAXCPSZrmb 2760 +VMAXCPSZrmbk 2761 +VMAXCPSZrmbkz 2762 +VMAXCPSZrmk 2763 +VMAXCPSZrmkz 2764 +VMAXCPSZrr 2765 +VMAXCPSZrrk 2766 +VMAXCPSZrrkz 2767 +VMAXCPSrm 2768 +VMAXCPSrr 2769 +VMAXCSDZrm 2770 +VMAXCSDZrr 2771 +VMAXCSDrm 2772 +VMAXCSDrr 2773 +VMAXCSHZrm 2774 +VMAXCSHZrr 2775 +VMAXCSSZrm 2776 +VMAXCSSZrr 2777 +VMAXCSSrm 2778 +VMAXCSSrr 2779 +VMAXPDYrm 2780 +VMAXPDYrr 2781 +VMAXPDZ 2782 +VMAXPDZrm 2783 +VMAXPDZrmb 2784 +VMAXPDZrmbk 2785 +VMAXPDZrmbkz 2786 +VMAXPDZrmk 2787 +VMAXPDZrmkz 2788 +VMAXPDZrr 2789 +VMAXPDZrrb 2790 +VMAXPDZrrbk 2791 +VMAXPDZrrbkz 2792 +VMAXPDZrrk 2793 +VMAXPDZrrkz 2794 +VMAXPDrm 2795 +VMAXPDrr 2796 +VMAXPHZ 2797 +VMAXPHZrm 2798 +VMAXPHZrmb 2799 +VMAXPHZrmbk 2800 +VMAXPHZrmbkz 2801 +VMAXPHZrmk 2802 +VMAXPHZrmkz 2803 +VMAXPHZrr 2804 +VMAXPHZrrb 2805 +VMAXPHZrrbk 2806 +VMAXPHZrrbkz 2807 +VMAXPHZrrk 2808 +VMAXPHZrrkz 2809 +VMAXPSYrm 2810 +VMAXPSYrr 2811 +VMAXPSZ 2812 +VMAXPSZrm 2813 +VMAXPSZrmb 2814 +VMAXPSZrmbk 2815 +VMAXPSZrmbkz 2816 +VMAXPSZrmk 2817 +VMAXPSZrmkz 2818 +VMAXPSZrr 2819 +VMAXPSZrrb 2820 +VMAXPSZrrbk 2821 +VMAXPSZrrbkz 2822 +VMAXPSZrrk 2823 +VMAXPSZrrkz 2824 +VMAXPSrm 2825 +VMAXPSrr 2826 +VMAXSDZrm 2827 +VMAXSDZrm_Int 2828 +VMAXSDZrmk_Int 2829 +VMAXSDZrmkz_Int 2830 +VMAXSDZrr 2831 +VMAXSDZrr_Int 2832 +VMAXSDZrrb_Int 2833 +VMAXSDZrrbk_Int 2834 +VMAXSDZrrbkz_Int 2835 +VMAXSDZrrk_Int 2836 +VMAXSDZrrkz_Int 2837 +VMAXSDrm 2838 +VMAXSDrm_Int 2839 +VMAXSDrr 2840 +VMAXSDrr_Int 2841 +VMAXSHZrm 2842 +VMAXSHZrm_Int 2843 +VMAXSHZrmk_Int 2844 +VMAXSHZrmkz_Int 2845 +VMAXSHZrr 2846 +VMAXSHZrr_Int 2847 +VMAXSHZrrb_Int 2848 +VMAXSHZrrbk_Int 2849 +VMAXSHZrrbkz_Int 2850 +VMAXSHZrrk_Int 2851 +VMAXSHZrrkz_Int 2852 +VMAXSSZrm 2853 +VMAXSSZrm_Int 2854 +VMAXSSZrmk_Int 2855 +VMAXSSZrmkz_Int 2856 +VMAXSSZrr 2857 +VMAXSSZrr_Int 2858 +VMAXSSZrrb_Int 2859 
+VMAXSSZrrbk_Int 2860 +VMAXSSZrrbkz_Int 2861 +VMAXSSZrrk_Int 2862 +VMAXSSZrrkz_Int 2863 +VMAXSSrm 2864 +VMAXSSrm_Int 2865 +VMAXSSrr 2866 +VMAXSSrr_Int 2867 +VMCALL 2868 +VMCLEARm 2869 +VMFUNC 2870 +VMINBF 2871 +VMINCPDYrm 2872 +VMINCPDYrr 2873 +VMINCPDZ 2874 +VMINCPDZrm 2875 +VMINCPDZrmb 2876 +VMINCPDZrmbk 2877 +VMINCPDZrmbkz 2878 +VMINCPDZrmk 2879 +VMINCPDZrmkz 2880 +VMINCPDZrr 2881 +VMINCPDZrrk 2882 +VMINCPDZrrkz 2883 +VMINCPDrm 2884 +VMINCPDrr 2885 +VMINCPHZ 2886 +VMINCPHZrm 2887 +VMINCPHZrmb 2888 +VMINCPHZrmbk 2889 +VMINCPHZrmbkz 2890 +VMINCPHZrmk 2891 +VMINCPHZrmkz 2892 +VMINCPHZrr 2893 +VMINCPHZrrk 2894 +VMINCPHZrrkz 2895 +VMINCPSYrm 2896 +VMINCPSYrr 2897 +VMINCPSZ 2898 +VMINCPSZrm 2899 +VMINCPSZrmb 2900 +VMINCPSZrmbk 2901 +VMINCPSZrmbkz 2902 +VMINCPSZrmk 2903 +VMINCPSZrmkz 2904 +VMINCPSZrr 2905 +VMINCPSZrrk 2906 +VMINCPSZrrkz 2907 +VMINCPSrm 2908 +VMINCPSrr 2909 +VMINCSDZrm 2910 +VMINCSDZrr 2911 +VMINCSDrm 2912 +VMINCSDrr 2913 +VMINCSHZrm 2914 +VMINCSHZrr 2915 +VMINCSSZrm 2916 +VMINCSSZrr 2917 +VMINCSSrm 2918 +VMINCSSrr 2919 +VMINMAXBF 2920 +VMINMAXPDZ 2921 +VMINMAXPDZrmbi 2922 +VMINMAXPDZrmbik 2923 +VMINMAXPDZrmbikz 2924 +VMINMAXPDZrmi 2925 +VMINMAXPDZrmik 2926 +VMINMAXPDZrmikz 2927 +VMINMAXPDZrri 2928 +VMINMAXPDZrrib 2929 +VMINMAXPDZrribk 2930 +VMINMAXPDZrribkz 2931 +VMINMAXPDZrrik 2932 +VMINMAXPDZrrikz 2933 +VMINMAXPHZ 2934 +VMINMAXPHZrmbi 2935 +VMINMAXPHZrmbik 2936 +VMINMAXPHZrmbikz 2937 +VMINMAXPHZrmi 2938 +VMINMAXPHZrmik 2939 +VMINMAXPHZrmikz 2940 +VMINMAXPHZrri 2941 +VMINMAXPHZrrib 2942 +VMINMAXPHZrribk 2943 +VMINMAXPHZrribkz 2944 +VMINMAXPHZrrik 2945 +VMINMAXPHZrrikz 2946 +VMINMAXPSZ 2947 +VMINMAXPSZrmbi 2948 +VMINMAXPSZrmbik 2949 +VMINMAXPSZrmbikz 2950 +VMINMAXPSZrmi 2951 +VMINMAXPSZrmik 2952 +VMINMAXPSZrmikz 2953 +VMINMAXPSZrri 2954 +VMINMAXPSZrrib 2955 +VMINMAXPSZrribk 2956 +VMINMAXPSZrribkz 2957 +VMINMAXPSZrrik 2958 +VMINMAXPSZrrikz 2959 +VMINMAXSDrmi 2960 +VMINMAXSDrmi_Int 2961 +VMINMAXSDrmik_Int 2962 +VMINMAXSDrmikz_Int 2963 +VMINMAXSDrri 2964 
+VMINMAXSDrri_Int 2965 +VMINMAXSDrrib_Int 2966 +VMINMAXSDrribk_Int 2967 +VMINMAXSDrribkz_Int 2968 +VMINMAXSDrrik_Int 2969 +VMINMAXSDrrikz_Int 2970 +VMINMAXSHrmi 2971 +VMINMAXSHrmi_Int 2972 +VMINMAXSHrmik_Int 2973 +VMINMAXSHrmikz_Int 2974 +VMINMAXSHrri 2975 +VMINMAXSHrri_Int 2976 +VMINMAXSHrrib_Int 2977 +VMINMAXSHrribk_Int 2978 +VMINMAXSHrribkz_Int 2979 +VMINMAXSHrrik_Int 2980 +VMINMAXSHrrikz_Int 2981 +VMINMAXSSrmi 2982 +VMINMAXSSrmi_Int 2983 +VMINMAXSSrmik_Int 2984 +VMINMAXSSrmikz_Int 2985 +VMINMAXSSrri 2986 +VMINMAXSSrri_Int 2987 +VMINMAXSSrrib_Int 2988 +VMINMAXSSrribk_Int 2989 +VMINMAXSSrribkz_Int 2990 +VMINMAXSSrrik_Int 2991 +VMINMAXSSrrikz_Int 2992 +VMINPDYrm 2993 +VMINPDYrr 2994 +VMINPDZ 2995 +VMINPDZrm 2996 +VMINPDZrmb 2997 +VMINPDZrmbk 2998 +VMINPDZrmbkz 2999 +VMINPDZrmk 3000 +VMINPDZrmkz 3001 +VMINPDZrr 3002 +VMINPDZrrb 3003 +VMINPDZrrbk 3004 +VMINPDZrrbkz 3005 +VMINPDZrrk 3006 +VMINPDZrrkz 3007 +VMINPDrm 3008 +VMINPDrr 3009 +VMINPHZ 3010 +VMINPHZrm 3011 +VMINPHZrmb 3012 +VMINPHZrmbk 3013 +VMINPHZrmbkz 3014 +VMINPHZrmk 3015 +VMINPHZrmkz 3016 +VMINPHZrr 3017 +VMINPHZrrb 3018 +VMINPHZrrbk 3019 +VMINPHZrrbkz 3020 +VMINPHZrrk 3021 +VMINPHZrrkz 3022 +VMINPSYrm 3023 +VMINPSYrr 3024 +VMINPSZ 3025 +VMINPSZrm 3026 +VMINPSZrmb 3027 +VMINPSZrmbk 3028 +VMINPSZrmbkz 3029 +VMINPSZrmk 3030 +VMINPSZrmkz 3031 +VMINPSZrr 3032 +VMINPSZrrb 3033 +VMINPSZrrbk 3034 +VMINPSZrrbkz 3035 +VMINPSZrrk 3036 +VMINPSZrrkz 3037 +VMINPSrm 3038 +VMINPSrr 3039 +VMINSDZrm 3040 +VMINSDZrm_Int 3041 +VMINSDZrmk_Int 3042 +VMINSDZrmkz_Int 3043 +VMINSDZrr 3044 +VMINSDZrr_Int 3045 +VMINSDZrrb_Int 3046 +VMINSDZrrbk_Int 3047 +VMINSDZrrbkz_Int 3048 +VMINSDZrrk_Int 3049 +VMINSDZrrkz_Int 3050 +VMINSDrm 3051 +VMINSDrm_Int 3052 +VMINSDrr 3053 +VMINSDrr_Int 3054 +VMINSHZrm 3055 +VMINSHZrm_Int 3056 +VMINSHZrmk_Int 3057 +VMINSHZrmkz_Int 3058 +VMINSHZrr 3059 +VMINSHZrr_Int 3060 +VMINSHZrrb_Int 3061 +VMINSHZrrbk_Int 3062 +VMINSHZrrbkz_Int 3063 +VMINSHZrrk_Int 3064 +VMINSHZrrkz_Int 3065 +VMINSSZrm 3066 
+VMINSSZrm_Int 3067 +VMINSSZrmk_Int 3068 +VMINSSZrmkz_Int 3069 +VMINSSZrr 3070 +VMINSSZrr_Int 3071 +VMINSSZrrb_Int 3072 +VMINSSZrrbk_Int 3073 +VMINSSZrrbkz_Int 3074 +VMINSSZrrk_Int 3075 +VMINSSZrrkz_Int 3076 +VMINSSrm 3077 +VMINSSrm_Int 3078 +VMINSSrr 3079 +VMINSSrr_Int 3080 +VMLAUNCH 3081 +VMLOAD 3082 +VMMCALL 3083 +VMOV 3084 +VMOVAPDYmr 3085 +VMOVAPDYrm 3086 +VMOVAPDYrr 3087 +VMOVAPDYrr_REV 3088 +VMOVAPDZ 3089 +VMOVAPDZmr 3090 +VMOVAPDZmrk 3091 +VMOVAPDZrm 3092 +VMOVAPDZrmk 3093 +VMOVAPDZrmkz 3094 +VMOVAPDZrr 3095 +VMOVAPDZrr_REV 3096 +VMOVAPDZrrk 3097 +VMOVAPDZrrk_REV 3098 +VMOVAPDZrrkz 3099 +VMOVAPDZrrkz_REV 3100 +VMOVAPDmr 3101 +VMOVAPDrm 3102 +VMOVAPDrr 3103 +VMOVAPDrr_REV 3104 +VMOVAPSYmr 3105 +VMOVAPSYrm 3106 +VMOVAPSYrr 3107 +VMOVAPSYrr_REV 3108 +VMOVAPSZ 3109 +VMOVAPSZmr 3110 +VMOVAPSZmrk 3111 +VMOVAPSZrm 3112 +VMOVAPSZrmk 3113 +VMOVAPSZrmkz 3114 +VMOVAPSZrr 3115 +VMOVAPSZrr_REV 3116 +VMOVAPSZrrk 3117 +VMOVAPSZrrk_REV 3118 +VMOVAPSZrrkz 3119 +VMOVAPSZrrkz_REV 3120 +VMOVAPSmr 3121 +VMOVAPSrm 3122 +VMOVAPSrr 3123 +VMOVAPSrr_REV 3124 +VMOVDDUPYrm 3125 +VMOVDDUPYrr 3126 +VMOVDDUPZ 3127 +VMOVDDUPZrm 3128 +VMOVDDUPZrmk 3129 +VMOVDDUPZrmkz 3130 +VMOVDDUPZrr 3131 +VMOVDDUPZrrk 3132 +VMOVDDUPZrrkz 3133 +VMOVDDUPrm 3134 +VMOVDDUPrr 3135 +VMOVDI 3136 +VMOVDQA 3137 +VMOVDQAYmr 3138 +VMOVDQAYrm 3139 +VMOVDQAYrr 3140 +VMOVDQAYrr_REV 3141 +VMOVDQAmr 3142 +VMOVDQArm 3143 +VMOVDQArr 3144 +VMOVDQArr_REV 3145 +VMOVDQU 3146 +VMOVDQUYmr 3147 +VMOVDQUYrm 3148 +VMOVDQUYrr 3149 +VMOVDQUYrr_REV 3150 +VMOVDQUmr 3151 +VMOVDQUrm 3152 +VMOVDQUrr 3153 +VMOVDQUrr_REV 3154 +VMOVHLPSZrr 3155 +VMOVHLPSrr 3156 +VMOVHPDZ 3157 +VMOVHPDmr 3158 +VMOVHPDrm 3159 +VMOVHPSZ 3160 +VMOVHPSmr 3161 +VMOVHPSrm 3162 +VMOVLHPSZrr 3163 +VMOVLHPSrr 3164 +VMOVLPDZ 3165 +VMOVLPDmr 3166 +VMOVLPDrm 3167 +VMOVLPSZ 3168 +VMOVLPSmr 3169 +VMOVLPSrm 3170 +VMOVMSKPDYrr 3171 +VMOVMSKPDrr 3172 +VMOVMSKPSYrr 3173 +VMOVMSKPSrr 3174 +VMOVNTDQAYrm 3175 +VMOVNTDQAZ 3176 +VMOVNTDQAZrm 3177 +VMOVNTDQArm 3178 +VMOVNTDQYmr 
3179 +VMOVNTDQZ 3180 +VMOVNTDQZmr 3181 +VMOVNTDQmr 3182 +VMOVNTPDYmr 3183 +VMOVNTPDZ 3184 +VMOVNTPDZmr 3185 +VMOVNTPDmr 3186 +VMOVNTPSYmr 3187 +VMOVNTPSZ 3188 +VMOVNTPSZmr 3189 +VMOVNTPSmr 3190 +VMOVPDI 3191 +VMOVPQI 3192 +VMOVPQIto 3193 +VMOVQI 3194 +VMOVRSBZ 3195 +VMOVRSBZm 3196 +VMOVRSBZmk 3197 +VMOVRSBZmkz 3198 +VMOVRSDZ 3199 +VMOVRSDZm 3200 +VMOVRSDZmk 3201 +VMOVRSDZmkz 3202 +VMOVRSQZ 3203 +VMOVRSQZm 3204 +VMOVRSQZmk 3205 +VMOVRSQZmkz 3206 +VMOVRSWZ 3207 +VMOVRSWZm 3208 +VMOVRSWZmk 3209 +VMOVRSWZmkz 3210 +VMOVSDZmr 3211 +VMOVSDZmrk 3212 +VMOVSDZrm 3213 +VMOVSDZrm_alt 3214 +VMOVSDZrmk 3215 +VMOVSDZrmkz 3216 +VMOVSDZrr 3217 +VMOVSDZrr_REV 3218 +VMOVSDZrrk 3219 +VMOVSDZrrk_REV 3220 +VMOVSDZrrkz 3221 +VMOVSDZrrkz_REV 3222 +VMOVSDmr 3223 +VMOVSDrm 3224 +VMOVSDrm_alt 3225 +VMOVSDrr 3226 +VMOVSDrr_REV 3227 +VMOVSDto 3228 +VMOVSH 3229 +VMOVSHDUPYrm 3230 +VMOVSHDUPYrr 3231 +VMOVSHDUPZ 3232 +VMOVSHDUPZrm 3233 +VMOVSHDUPZrmk 3234 +VMOVSHDUPZrmkz 3235 +VMOVSHDUPZrr 3236 +VMOVSHDUPZrrk 3237 +VMOVSHDUPZrrkz 3238 +VMOVSHDUPrm 3239 +VMOVSHDUPrr 3240 +VMOVSHZmr 3241 +VMOVSHZmrk 3242 +VMOVSHZrm 3243 +VMOVSHZrm_alt 3244 +VMOVSHZrmk 3245 +VMOVSHZrmkz 3246 +VMOVSHZrr 3247 +VMOVSHZrr_REV 3248 +VMOVSHZrrk 3249 +VMOVSHZrrk_REV 3250 +VMOVSHZrrkz 3251 +VMOVSHZrrkz_REV 3252 +VMOVSHtoW 3253 +VMOVSLDUPYrm 3254 +VMOVSLDUPYrr 3255 +VMOVSLDUPZ 3256 +VMOVSLDUPZrm 3257 +VMOVSLDUPZrmk 3258 +VMOVSLDUPZrmkz 3259 +VMOVSLDUPZrr 3260 +VMOVSLDUPZrrk 3261 +VMOVSLDUPZrrkz 3262 +VMOVSLDUPrm 3263 +VMOVSLDUPrr 3264 +VMOVSS 3265 +VMOVSSZmr 3266 +VMOVSSZmrk 3267 +VMOVSSZrm 3268 +VMOVSSZrm_alt 3269 +VMOVSSZrmk 3270 +VMOVSSZrmkz 3271 +VMOVSSZrr 3272 +VMOVSSZrr_REV 3273 +VMOVSSZrrk 3274 +VMOVSSZrrk_REV 3275 +VMOVSSZrrkz 3276 +VMOVSSZrrkz_REV 3277 +VMOVSSmr 3278 +VMOVSSrm 3279 +VMOVSSrm_alt 3280 +VMOVSSrr 3281 +VMOVSSrr_REV 3282 +VMOVUPDYmr 3283 +VMOVUPDYrm 3284 +VMOVUPDYrr 3285 +VMOVUPDYrr_REV 3286 +VMOVUPDZ 3287 +VMOVUPDZmr 3288 +VMOVUPDZmrk 3289 +VMOVUPDZrm 3290 +VMOVUPDZrmk 3291 +VMOVUPDZrmkz 3292 
+VMOVUPDZrr 3293 +VMOVUPDZrr_REV 3294 +VMOVUPDZrrk 3295 +VMOVUPDZrrk_REV 3296 +VMOVUPDZrrkz 3297 +VMOVUPDZrrkz_REV 3298 +VMOVUPDmr 3299 +VMOVUPDrm 3300 +VMOVUPDrr 3301 +VMOVUPDrr_REV 3302 +VMOVUPSYmr 3303 +VMOVUPSYrm 3304 +VMOVUPSYrr 3305 +VMOVUPSYrr_REV 3306 +VMOVUPSZ 3307 +VMOVUPSZmr 3308 +VMOVUPSZmrk 3309 +VMOVUPSZrm 3310 +VMOVUPSZrmk 3311 +VMOVUPSZrmkz 3312 +VMOVUPSZrr 3313 +VMOVUPSZrr_REV 3314 +VMOVUPSZrrk 3315 +VMOVUPSZrrk_REV 3316 +VMOVUPSZrrkz 3317 +VMOVUPSZrrkz_REV 3318 +VMOVUPSmr 3319 +VMOVUPSrm 3320 +VMOVUPSrr 3321 +VMOVUPSrr_REV 3322 +VMOVW 3323 +VMOVWmr 3324 +VMOVWrm 3325 +VMOVZPDILo 3326 +VMOVZPQILo 3327 +VMOVZPWILo 3328 +VMPSADBWYrmi 3329 +VMPSADBWYrri 3330 +VMPSADBWZ 3331 +VMPSADBWZrmi 3332 +VMPSADBWZrmik 3333 +VMPSADBWZrmikz 3334 +VMPSADBWZrri 3335 +VMPSADBWZrrik 3336 +VMPSADBWZrrikz 3337 +VMPSADBWrmi 3338 +VMPSADBWrri 3339 +VMPTRLDm 3340 +VMPTRSTm 3341 +VMREAD 3342 +VMRESUME 3343 +VMRUN 3344 +VMSAVE 3345 +VMULBF 3346 +VMULPDYrm 3347 +VMULPDYrr 3348 +VMULPDZ 3349 +VMULPDZrm 3350 +VMULPDZrmb 3351 +VMULPDZrmbk 3352 +VMULPDZrmbkz 3353 +VMULPDZrmk 3354 +VMULPDZrmkz 3355 +VMULPDZrr 3356 +VMULPDZrrb 3357 +VMULPDZrrbk 3358 +VMULPDZrrbkz 3359 +VMULPDZrrk 3360 +VMULPDZrrkz 3361 +VMULPDrm 3362 +VMULPDrr 3363 +VMULPHZ 3364 +VMULPHZrm 3365 +VMULPHZrmb 3366 +VMULPHZrmbk 3367 +VMULPHZrmbkz 3368 +VMULPHZrmk 3369 +VMULPHZrmkz 3370 +VMULPHZrr 3371 +VMULPHZrrb 3372 +VMULPHZrrbk 3373 +VMULPHZrrbkz 3374 +VMULPHZrrk 3375 +VMULPHZrrkz 3376 +VMULPSYrm 3377 +VMULPSYrr 3378 +VMULPSZ 3379 +VMULPSZrm 3380 +VMULPSZrmb 3381 +VMULPSZrmbk 3382 +VMULPSZrmbkz 3383 +VMULPSZrmk 3384 +VMULPSZrmkz 3385 +VMULPSZrr 3386 +VMULPSZrrb 3387 +VMULPSZrrbk 3388 +VMULPSZrrbkz 3389 +VMULPSZrrk 3390 +VMULPSZrrkz 3391 +VMULPSrm 3392 +VMULPSrr 3393 +VMULSDZrm 3394 +VMULSDZrm_Int 3395 +VMULSDZrmk_Int 3396 +VMULSDZrmkz_Int 3397 +VMULSDZrr 3398 +VMULSDZrr_Int 3399 +VMULSDZrrb_Int 3400 +VMULSDZrrbk_Int 3401 +VMULSDZrrbkz_Int 3402 +VMULSDZrrk_Int 3403 +VMULSDZrrkz_Int 3404 +VMULSDrm 3405 +VMULSDrm_Int 
3406 +VMULSDrr 3407 +VMULSDrr_Int 3408 +VMULSHZrm 3409 +VMULSHZrm_Int 3410 +VMULSHZrmk_Int 3411 +VMULSHZrmkz_Int 3412 +VMULSHZrr 3413 +VMULSHZrr_Int 3414 +VMULSHZrrb_Int 3415 +VMULSHZrrbk_Int 3416 +VMULSHZrrbkz_Int 3417 +VMULSHZrrk_Int 3418 +VMULSHZrrkz_Int 3419 +VMULSSZrm 3420 +VMULSSZrm_Int 3421 +VMULSSZrmk_Int 3422 +VMULSSZrmkz_Int 3423 +VMULSSZrr 3424 +VMULSSZrr_Int 3425 +VMULSSZrrb_Int 3426 +VMULSSZrrbk_Int 3427 +VMULSSZrrbkz_Int 3428 +VMULSSZrrk_Int 3429 +VMULSSZrrkz_Int 3430 +VMULSSrm 3431 +VMULSSrm_Int 3432 +VMULSSrr 3433 +VMULSSrr_Int 3434 +VMWRITE 3435 +VMXOFF 3436 +VMXON 3437 +VORPDYrm 3438 +VORPDYrr 3439 +VORPDZ 3440 +VORPDZrm 3441 +VORPDZrmb 3442 +VORPDZrmbk 3443 +VORPDZrmbkz 3444 +VORPDZrmk 3445 +VORPDZrmkz 3446 +VORPDZrr 3447 +VORPDZrrk 3448 +VORPDZrrkz 3449 +VORPDrm 3450 +VORPDrr 3451 +VORPSYrm 3452 +VORPSYrr 3453 +VORPSZ 3454 +VORPSZrm 3455 +VORPSZrmb 3456 +VORPSZrmbk 3457 +VORPSZrmbkz 3458 +VORPSZrmk 3459 +VORPSZrmkz 3460 +VORPSZrr 3461 +VORPSZrrk 3462 +VORPSZrrkz 3463 +VORPSrm 3464 +VORPSrr 3465 +VP 3466 +VPABSBYrm 3467 +VPABSBYrr 3468 +VPABSBZ 3469 +VPABSBZrm 3470 +VPABSBZrmk 3471 +VPABSBZrmkz 3472 +VPABSBZrr 3473 +VPABSBZrrk 3474 +VPABSBZrrkz 3475 +VPABSBrm 3476 +VPABSBrr 3477 +VPABSDYrm 3478 +VPABSDYrr 3479 +VPABSDZ 3480 +VPABSDZrm 3481 +VPABSDZrmb 3482 +VPABSDZrmbk 3483 +VPABSDZrmbkz 3484 +VPABSDZrmk 3485 +VPABSDZrmkz 3486 +VPABSDZrr 3487 +VPABSDZrrk 3488 +VPABSDZrrkz 3489 +VPABSDrm 3490 +VPABSDrr 3491 +VPABSQZ 3492 +VPABSQZrm 3493 +VPABSQZrmb 3494 +VPABSQZrmbk 3495 +VPABSQZrmbkz 3496 +VPABSQZrmk 3497 +VPABSQZrmkz 3498 +VPABSQZrr 3499 +VPABSQZrrk 3500 +VPABSQZrrkz 3501 +VPABSWYrm 3502 +VPABSWYrr 3503 +VPABSWZ 3504 +VPABSWZrm 3505 +VPABSWZrmk 3506 +VPABSWZrmkz 3507 +VPABSWZrr 3508 +VPABSWZrrk 3509 +VPABSWZrrkz 3510 +VPABSWrm 3511 +VPABSWrr 3512 +VPACKSSDWYrm 3513 +VPACKSSDWYrr 3514 +VPACKSSDWZ 3515 +VPACKSSDWZrm 3516 +VPACKSSDWZrmb 3517 +VPACKSSDWZrmbk 3518 +VPACKSSDWZrmbkz 3519 +VPACKSSDWZrmk 3520 +VPACKSSDWZrmkz 3521 +VPACKSSDWZrr 3522 
+VPACKSSDWZrrk 3523 +VPACKSSDWZrrkz 3524 +VPACKSSDWrm 3525 +VPACKSSDWrr 3526 +VPACKSSWBYrm 3527 +VPACKSSWBYrr 3528 +VPACKSSWBZ 3529 +VPACKSSWBZrm 3530 +VPACKSSWBZrmk 3531 +VPACKSSWBZrmkz 3532 +VPACKSSWBZrr 3533 +VPACKSSWBZrrk 3534 +VPACKSSWBZrrkz 3535 +VPACKSSWBrm 3536 +VPACKSSWBrr 3537 +VPACKUSDWYrm 3538 +VPACKUSDWYrr 3539 +VPACKUSDWZ 3540 +VPACKUSDWZrm 3541 +VPACKUSDWZrmb 3542 +VPACKUSDWZrmbk 3543 +VPACKUSDWZrmbkz 3544 +VPACKUSDWZrmk 3545 +VPACKUSDWZrmkz 3546 +VPACKUSDWZrr 3547 +VPACKUSDWZrrk 3548 +VPACKUSDWZrrkz 3549 +VPACKUSDWrm 3550 +VPACKUSDWrr 3551 +VPACKUSWBYrm 3552 +VPACKUSWBYrr 3553 +VPACKUSWBZ 3554 +VPACKUSWBZrm 3555 +VPACKUSWBZrmk 3556 +VPACKUSWBZrmkz 3557 +VPACKUSWBZrr 3558 +VPACKUSWBZrrk 3559 +VPACKUSWBZrrkz 3560 +VPACKUSWBrm 3561 +VPACKUSWBrr 3562 +VPADDBYrm 3563 +VPADDBYrr 3564 +VPADDBZ 3565 +VPADDBZrm 3566 +VPADDBZrmk 3567 +VPADDBZrmkz 3568 +VPADDBZrr 3569 +VPADDBZrrk 3570 +VPADDBZrrkz 3571 +VPADDBrm 3572 +VPADDBrr 3573 +VPADDDYrm 3574 +VPADDDYrr 3575 +VPADDDZ 3576 +VPADDDZrm 3577 +VPADDDZrmb 3578 +VPADDDZrmbk 3579 +VPADDDZrmbkz 3580 +VPADDDZrmk 3581 +VPADDDZrmkz 3582 +VPADDDZrr 3583 +VPADDDZrrk 3584 +VPADDDZrrkz 3585 +VPADDDrm 3586 +VPADDDrr 3587 +VPADDQYrm 3588 +VPADDQYrr 3589 +VPADDQZ 3590 +VPADDQZrm 3591 +VPADDQZrmb 3592 +VPADDQZrmbk 3593 +VPADDQZrmbkz 3594 +VPADDQZrmk 3595 +VPADDQZrmkz 3596 +VPADDQZrr 3597 +VPADDQZrrk 3598 +VPADDQZrrkz 3599 +VPADDQrm 3600 +VPADDQrr 3601 +VPADDSBYrm 3602 +VPADDSBYrr 3603 +VPADDSBZ 3604 +VPADDSBZrm 3605 +VPADDSBZrmk 3606 +VPADDSBZrmkz 3607 +VPADDSBZrr 3608 +VPADDSBZrrk 3609 +VPADDSBZrrkz 3610 +VPADDSBrm 3611 +VPADDSBrr 3612 +VPADDSWYrm 3613 +VPADDSWYrr 3614 +VPADDSWZ 3615 +VPADDSWZrm 3616 +VPADDSWZrmk 3617 +VPADDSWZrmkz 3618 +VPADDSWZrr 3619 +VPADDSWZrrk 3620 +VPADDSWZrrkz 3621 +VPADDSWrm 3622 +VPADDSWrr 3623 +VPADDUSBYrm 3624 +VPADDUSBYrr 3625 +VPADDUSBZ 3626 +VPADDUSBZrm 3627 +VPADDUSBZrmk 3628 +VPADDUSBZrmkz 3629 +VPADDUSBZrr 3630 +VPADDUSBZrrk 3631 +VPADDUSBZrrkz 3632 +VPADDUSBrm 3633 +VPADDUSBrr 3634 
+VPADDUSWYrm 3635 +VPADDUSWYrr 3636 +VPADDUSWZ 3637 +VPADDUSWZrm 3638 +VPADDUSWZrmk 3639 +VPADDUSWZrmkz 3640 +VPADDUSWZrr 3641 +VPADDUSWZrrk 3642 +VPADDUSWZrrkz 3643 +VPADDUSWrm 3644 +VPADDUSWrr 3645 +VPADDWYrm 3646 +VPADDWYrr 3647 +VPADDWZ 3648 +VPADDWZrm 3649 +VPADDWZrmk 3650 +VPADDWZrmkz 3651 +VPADDWZrr 3652 +VPADDWZrrk 3653 +VPADDWZrrkz 3654 +VPADDWrm 3655 +VPADDWrr 3656 +VPALIGNRYrmi 3657 +VPALIGNRYrri 3658 +VPALIGNRZ 3659 +VPALIGNRZrmi 3660 +VPALIGNRZrmik 3661 +VPALIGNRZrmikz 3662 +VPALIGNRZrri 3663 +VPALIGNRZrrik 3664 +VPALIGNRZrrikz 3665 +VPALIGNRrmi 3666 +VPALIGNRrri 3667 +VPANDDZ 3668 +VPANDDZrm 3669 +VPANDDZrmb 3670 +VPANDDZrmbk 3671 +VPANDDZrmbkz 3672 +VPANDDZrmk 3673 +VPANDDZrmkz 3674 +VPANDDZrr 3675 +VPANDDZrrk 3676 +VPANDDZrrkz 3677 +VPANDNDZ 3678 +VPANDNDZrm 3679 +VPANDNDZrmb 3680 +VPANDNDZrmbk 3681 +VPANDNDZrmbkz 3682 +VPANDNDZrmk 3683 +VPANDNDZrmkz 3684 +VPANDNDZrr 3685 +VPANDNDZrrk 3686 +VPANDNDZrrkz 3687 +VPANDNQZ 3688 +VPANDNQZrm 3689 +VPANDNQZrmb 3690 +VPANDNQZrmbk 3691 +VPANDNQZrmbkz 3692 +VPANDNQZrmk 3693 +VPANDNQZrmkz 3694 +VPANDNQZrr 3695 +VPANDNQZrrk 3696 +VPANDNQZrrkz 3697 +VPANDNYrm 3698 +VPANDNYrr 3699 +VPANDNrm 3700 +VPANDNrr 3701 +VPANDQZ 3702 +VPANDQZrm 3703 +VPANDQZrmb 3704 +VPANDQZrmbk 3705 +VPANDQZrmbkz 3706 +VPANDQZrmk 3707 +VPANDQZrmkz 3708 +VPANDQZrr 3709 +VPANDQZrrk 3710 +VPANDQZrrkz 3711 +VPANDYrm 3712 +VPANDYrr 3713 +VPANDrm 3714 +VPANDrr 3715 +VPAVGBYrm 3716 +VPAVGBYrr 3717 +VPAVGBZ 3718 +VPAVGBZrm 3719 +VPAVGBZrmk 3720 +VPAVGBZrmkz 3721 +VPAVGBZrr 3722 +VPAVGBZrrk 3723 +VPAVGBZrrkz 3724 +VPAVGBrm 3725 +VPAVGBrr 3726 +VPAVGWYrm 3727 +VPAVGWYrr 3728 +VPAVGWZ 3729 +VPAVGWZrm 3730 +VPAVGWZrmk 3731 +VPAVGWZrmkz 3732 +VPAVGWZrr 3733 +VPAVGWZrrk 3734 +VPAVGWZrrkz 3735 +VPAVGWrm 3736 +VPAVGWrr 3737 +VPBLENDDYrmi 3738 +VPBLENDDYrri 3739 +VPBLENDDrmi 3740 +VPBLENDDrri 3741 +VPBLENDMBZ 3742 +VPBLENDMBZrm 3743 +VPBLENDMBZrmk 3744 +VPBLENDMBZrmkz 3745 +VPBLENDMBZrr 3746 +VPBLENDMBZrrk 3747 +VPBLENDMBZrrkz 3748 +VPBLENDMDZ 3749 
+VPBLENDMDZrm 3750 +VPBLENDMDZrmb 3751 +VPBLENDMDZrmbk 3752 +VPBLENDMDZrmbkz 3753 +VPBLENDMDZrmk 3754 +VPBLENDMDZrmkz 3755 +VPBLENDMDZrr 3756 +VPBLENDMDZrrk 3757 +VPBLENDMDZrrkz 3758 +VPBLENDMQZ 3759 +VPBLENDMQZrm 3760 +VPBLENDMQZrmb 3761 +VPBLENDMQZrmbk 3762 +VPBLENDMQZrmbkz 3763 +VPBLENDMQZrmk 3764 +VPBLENDMQZrmkz 3765 +VPBLENDMQZrr 3766 +VPBLENDMQZrrk 3767 +VPBLENDMQZrrkz 3768 +VPBLENDMWZ 3769 +VPBLENDMWZrm 3770 +VPBLENDMWZrmk 3771 +VPBLENDMWZrmkz 3772 +VPBLENDMWZrr 3773 +VPBLENDMWZrrk 3774 +VPBLENDMWZrrkz 3775 +VPBLENDVBYrmr 3776 +VPBLENDVBYrrr 3777 +VPBLENDVBrmr 3778 +VPBLENDVBrrr 3779 +VPBLENDWYrmi 3780 +VPBLENDWYrri 3781 +VPBLENDWrmi 3782 +VPBLENDWrri 3783 +VPBROADCASTBYrm 3784 +VPBROADCASTBYrr 3785 +VPBROADCASTBZ 3786 +VPBROADCASTBZrm 3787 +VPBROADCASTBZrmk 3788 +VPBROADCASTBZrmkz 3789 +VPBROADCASTBZrr 3790 +VPBROADCASTBZrrk 3791 +VPBROADCASTBZrrkz 3792 +VPBROADCASTBrZ 3793 +VPBROADCASTBrZrr 3794 +VPBROADCASTBrZrrk 3795 +VPBROADCASTBrZrrkz 3796 +VPBROADCASTBrm 3797 +VPBROADCASTBrr 3798 +VPBROADCASTDYrm 3799 +VPBROADCASTDYrr 3800 +VPBROADCASTDZ 3801 +VPBROADCASTDZrm 3802 +VPBROADCASTDZrmk 3803 +VPBROADCASTDZrmkz 3804 +VPBROADCASTDZrr 3805 +VPBROADCASTDZrrk 3806 +VPBROADCASTDZrrkz 3807 +VPBROADCASTDrZ 3808 +VPBROADCASTDrZrr 3809 +VPBROADCASTDrZrrk 3810 +VPBROADCASTDrZrrkz 3811 +VPBROADCASTDrm 3812 +VPBROADCASTDrr 3813 +VPBROADCASTMB 3814 +VPBROADCASTMW 3815 +VPBROADCASTQYrm 3816 +VPBROADCASTQYrr 3817 +VPBROADCASTQZ 3818 +VPBROADCASTQZrm 3819 +VPBROADCASTQZrmk 3820 +VPBROADCASTQZrmkz 3821 +VPBROADCASTQZrr 3822 +VPBROADCASTQZrrk 3823 +VPBROADCASTQZrrkz 3824 +VPBROADCASTQrZ 3825 +VPBROADCASTQrZrr 3826 +VPBROADCASTQrZrrk 3827 +VPBROADCASTQrZrrkz 3828 +VPBROADCASTQrm 3829 +VPBROADCASTQrr 3830 +VPBROADCASTWYrm 3831 +VPBROADCASTWYrr 3832 +VPBROADCASTWZ 3833 +VPBROADCASTWZrm 3834 +VPBROADCASTWZrmk 3835 +VPBROADCASTWZrmkz 3836 +VPBROADCASTWZrr 3837 +VPBROADCASTWZrrk 3838 +VPBROADCASTWZrrkz 3839 +VPBROADCASTWrZ 3840 +VPBROADCASTWrZrr 3841 +VPBROADCASTWrZrrk 3842 
+VPBROADCASTWrZrrkz 3843 +VPBROADCASTWrm 3844 +VPBROADCASTWrr 3845 +VPCLMULQDQYrmi 3846 +VPCLMULQDQYrri 3847 +VPCLMULQDQZ 3848 +VPCLMULQDQZrmi 3849 +VPCLMULQDQZrri 3850 +VPCLMULQDQrmi 3851 +VPCLMULQDQrri 3852 +VPCMOVYrmr 3853 +VPCMOVYrrm 3854 +VPCMOVYrrr 3855 +VPCMOVYrrr_REV 3856 +VPCMOVrmr 3857 +VPCMOVrrm 3858 +VPCMOVrrr 3859 +VPCMOVrrr_REV 3860 +VPCMPBZ 3861 +VPCMPBZrmi 3862 +VPCMPBZrmik 3863 +VPCMPBZrri 3864 +VPCMPBZrrik 3865 +VPCMPDZ 3866 +VPCMPDZrmbi 3867 +VPCMPDZrmbik 3868 +VPCMPDZrmi 3869 +VPCMPDZrmik 3870 +VPCMPDZrri 3871 +VPCMPDZrrik 3872 +VPCMPEQBYrm 3873 +VPCMPEQBYrr 3874 +VPCMPEQBZ 3875 +VPCMPEQBZrm 3876 +VPCMPEQBZrmk 3877 +VPCMPEQBZrr 3878 +VPCMPEQBZrrk 3879 +VPCMPEQBrm 3880 +VPCMPEQBrr 3881 +VPCMPEQDYrm 3882 +VPCMPEQDYrr 3883 +VPCMPEQDZ 3884 +VPCMPEQDZrm 3885 +VPCMPEQDZrmb 3886 +VPCMPEQDZrmbk 3887 +VPCMPEQDZrmk 3888 +VPCMPEQDZrr 3889 +VPCMPEQDZrrk 3890 +VPCMPEQDrm 3891 +VPCMPEQDrr 3892 +VPCMPEQQYrm 3893 +VPCMPEQQYrr 3894 +VPCMPEQQZ 3895 +VPCMPEQQZrm 3896 +VPCMPEQQZrmb 3897 +VPCMPEQQZrmbk 3898 +VPCMPEQQZrmk 3899 +VPCMPEQQZrr 3900 +VPCMPEQQZrrk 3901 +VPCMPEQQrm 3902 +VPCMPEQQrr 3903 +VPCMPEQWYrm 3904 +VPCMPEQWYrr 3905 +VPCMPEQWZ 3906 +VPCMPEQWZrm 3907 +VPCMPEQWZrmk 3908 +VPCMPEQWZrr 3909 +VPCMPEQWZrrk 3910 +VPCMPEQWrm 3911 +VPCMPEQWrr 3912 +VPCMPESTRIrmi 3913 +VPCMPESTRIrri 3914 +VPCMPESTRMrmi 3915 +VPCMPESTRMrri 3916 +VPCMPGTBYrm 3917 +VPCMPGTBYrr 3918 +VPCMPGTBZ 3919 +VPCMPGTBZrm 3920 +VPCMPGTBZrmk 3921 +VPCMPGTBZrr 3922 +VPCMPGTBZrrk 3923 +VPCMPGTBrm 3924 +VPCMPGTBrr 3925 +VPCMPGTDYrm 3926 +VPCMPGTDYrr 3927 +VPCMPGTDZ 3928 +VPCMPGTDZrm 3929 +VPCMPGTDZrmb 3930 +VPCMPGTDZrmbk 3931 +VPCMPGTDZrmk 3932 +VPCMPGTDZrr 3933 +VPCMPGTDZrrk 3934 +VPCMPGTDrm 3935 +VPCMPGTDrr 3936 +VPCMPGTQYrm 3937 +VPCMPGTQYrr 3938 +VPCMPGTQZ 3939 +VPCMPGTQZrm 3940 +VPCMPGTQZrmb 3941 +VPCMPGTQZrmbk 3942 +VPCMPGTQZrmk 3943 +VPCMPGTQZrr 3944 +VPCMPGTQZrrk 3945 +VPCMPGTQrm 3946 +VPCMPGTQrr 3947 +VPCMPGTWYrm 3948 +VPCMPGTWYrr 3949 +VPCMPGTWZ 3950 +VPCMPGTWZrm 3951 +VPCMPGTWZrmk 3952 
+VPCMPGTWZrr 3953 +VPCMPGTWZrrk 3954 +VPCMPGTWrm 3955 +VPCMPGTWrr 3956 +VPCMPISTRIrmi 3957 +VPCMPISTRIrri 3958 +VPCMPISTRMrmi 3959 +VPCMPISTRMrri 3960 +VPCMPQZ 3961 +VPCMPQZrmbi 3962 +VPCMPQZrmbik 3963 +VPCMPQZrmi 3964 +VPCMPQZrmik 3965 +VPCMPQZrri 3966 +VPCMPQZrrik 3967 +VPCMPUBZ 3968 +VPCMPUBZrmi 3969 +VPCMPUBZrmik 3970 +VPCMPUBZrri 3971 +VPCMPUBZrrik 3972 +VPCMPUDZ 3973 +VPCMPUDZrmbi 3974 +VPCMPUDZrmbik 3975 +VPCMPUDZrmi 3976 +VPCMPUDZrmik 3977 +VPCMPUDZrri 3978 +VPCMPUDZrrik 3979 +VPCMPUQZ 3980 +VPCMPUQZrmbi 3981 +VPCMPUQZrmbik 3982 +VPCMPUQZrmi 3983 +VPCMPUQZrmik 3984 +VPCMPUQZrri 3985 +VPCMPUQZrrik 3986 +VPCMPUWZ 3987 +VPCMPUWZrmi 3988 +VPCMPUWZrmik 3989 +VPCMPUWZrri 3990 +VPCMPUWZrrik 3991 +VPCMPWZ 3992 +VPCMPWZrmi 3993 +VPCMPWZrmik 3994 +VPCMPWZrri 3995 +VPCMPWZrrik 3996 +VPCOMBmi 3997 +VPCOMBri 3998 +VPCOMDmi 3999 +VPCOMDri 4000 +VPCOMPRESSBZ 4001 +VPCOMPRESSBZmr 4002 +VPCOMPRESSBZmrk 4003 +VPCOMPRESSBZrr 4004 +VPCOMPRESSBZrrk 4005 +VPCOMPRESSBZrrkz 4006 +VPCOMPRESSDZ 4007 +VPCOMPRESSDZmr 4008 +VPCOMPRESSDZmrk 4009 +VPCOMPRESSDZrr 4010 +VPCOMPRESSDZrrk 4011 +VPCOMPRESSDZrrkz 4012 +VPCOMPRESSQZ 4013 +VPCOMPRESSQZmr 4014 +VPCOMPRESSQZmrk 4015 +VPCOMPRESSQZrr 4016 +VPCOMPRESSQZrrk 4017 +VPCOMPRESSQZrrkz 4018 +VPCOMPRESSWZ 4019 +VPCOMPRESSWZmr 4020 +VPCOMPRESSWZmrk 4021 +VPCOMPRESSWZrr 4022 +VPCOMPRESSWZrrk 4023 +VPCOMPRESSWZrrkz 4024 +VPCOMQmi 4025 +VPCOMQri 4026 +VPCOMUBmi 4027 +VPCOMUBri 4028 +VPCOMUDmi 4029 +VPCOMUDri 4030 +VPCOMUQmi 4031 +VPCOMUQri 4032 +VPCOMUWmi 4033 +VPCOMUWri 4034 +VPCOMWmi 4035 +VPCOMWri 4036 +VPCONFLICTDZ 4037 +VPCONFLICTDZrm 4038 +VPCONFLICTDZrmb 4039 +VPCONFLICTDZrmbk 4040 +VPCONFLICTDZrmbkz 4041 +VPCONFLICTDZrmk 4042 +VPCONFLICTDZrmkz 4043 +VPCONFLICTDZrr 4044 +VPCONFLICTDZrrk 4045 +VPCONFLICTDZrrkz 4046 +VPCONFLICTQZ 4047 +VPCONFLICTQZrm 4048 +VPCONFLICTQZrmb 4049 +VPCONFLICTQZrmbk 4050 +VPCONFLICTQZrmbkz 4051 +VPCONFLICTQZrmk 4052 +VPCONFLICTQZrmkz 4053 +VPCONFLICTQZrr 4054 +VPCONFLICTQZrrk 4055 +VPCONFLICTQZrrkz 4056 
+VPDPBSSDSYrm 4057 +VPDPBSSDSYrr 4058 +VPDPBSSDSZ 4059 +VPDPBSSDSZrm 4060 +VPDPBSSDSZrmb 4061 +VPDPBSSDSZrmbk 4062 +VPDPBSSDSZrmbkz 4063 +VPDPBSSDSZrmk 4064 +VPDPBSSDSZrmkz 4065 +VPDPBSSDSZrr 4066 +VPDPBSSDSZrrk 4067 +VPDPBSSDSZrrkz 4068 +VPDPBSSDSrm 4069 +VPDPBSSDSrr 4070 +VPDPBSSDYrm 4071 +VPDPBSSDYrr 4072 +VPDPBSSDZ 4073 +VPDPBSSDZrm 4074 +VPDPBSSDZrmb 4075 +VPDPBSSDZrmbk 4076 +VPDPBSSDZrmbkz 4077 +VPDPBSSDZrmk 4078 +VPDPBSSDZrmkz 4079 +VPDPBSSDZrr 4080 +VPDPBSSDZrrk 4081 +VPDPBSSDZrrkz 4082 +VPDPBSSDrm 4083 +VPDPBSSDrr 4084 +VPDPBSUDSYrm 4085 +VPDPBSUDSYrr 4086 +VPDPBSUDSZ 4087 +VPDPBSUDSZrm 4088 +VPDPBSUDSZrmb 4089 +VPDPBSUDSZrmbk 4090 +VPDPBSUDSZrmbkz 4091 +VPDPBSUDSZrmk 4092 +VPDPBSUDSZrmkz 4093 +VPDPBSUDSZrr 4094 +VPDPBSUDSZrrk 4095 +VPDPBSUDSZrrkz 4096 +VPDPBSUDSrm 4097 +VPDPBSUDSrr 4098 +VPDPBSUDYrm 4099 +VPDPBSUDYrr 4100 +VPDPBSUDZ 4101 +VPDPBSUDZrm 4102 +VPDPBSUDZrmb 4103 +VPDPBSUDZrmbk 4104 +VPDPBSUDZrmbkz 4105 +VPDPBSUDZrmk 4106 +VPDPBSUDZrmkz 4107 +VPDPBSUDZrr 4108 +VPDPBSUDZrrk 4109 +VPDPBSUDZrrkz 4110 +VPDPBSUDrm 4111 +VPDPBSUDrr 4112 +VPDPBUSDSYrm 4113 +VPDPBUSDSYrr 4114 +VPDPBUSDSZ 4115 +VPDPBUSDSZrm 4116 +VPDPBUSDSZrmb 4117 +VPDPBUSDSZrmbk 4118 +VPDPBUSDSZrmbkz 4119 +VPDPBUSDSZrmk 4120 +VPDPBUSDSZrmkz 4121 +VPDPBUSDSZrr 4122 +VPDPBUSDSZrrk 4123 +VPDPBUSDSZrrkz 4124 +VPDPBUSDSrm 4125 +VPDPBUSDSrr 4126 +VPDPBUSDYrm 4127 +VPDPBUSDYrr 4128 +VPDPBUSDZ 4129 +VPDPBUSDZrm 4130 +VPDPBUSDZrmb 4131 +VPDPBUSDZrmbk 4132 +VPDPBUSDZrmbkz 4133 +VPDPBUSDZrmk 4134 +VPDPBUSDZrmkz 4135 +VPDPBUSDZrr 4136 +VPDPBUSDZrrk 4137 +VPDPBUSDZrrkz 4138 +VPDPBUSDrm 4139 +VPDPBUSDrr 4140 +VPDPBUUDSYrm 4141 +VPDPBUUDSYrr 4142 +VPDPBUUDSZ 4143 +VPDPBUUDSZrm 4144 +VPDPBUUDSZrmb 4145 +VPDPBUUDSZrmbk 4146 +VPDPBUUDSZrmbkz 4147 +VPDPBUUDSZrmk 4148 +VPDPBUUDSZrmkz 4149 +VPDPBUUDSZrr 4150 +VPDPBUUDSZrrk 4151 +VPDPBUUDSZrrkz 4152 +VPDPBUUDSrm 4153 +VPDPBUUDSrr 4154 +VPDPBUUDYrm 4155 +VPDPBUUDYrr 4156 +VPDPBUUDZ 4157 +VPDPBUUDZrm 4158 +VPDPBUUDZrmb 4159 +VPDPBUUDZrmbk 4160 
+VPDPBUUDZrmbkz 4161 +VPDPBUUDZrmk 4162 +VPDPBUUDZrmkz 4163 +VPDPBUUDZrr 4164 +VPDPBUUDZrrk 4165 +VPDPBUUDZrrkz 4166 +VPDPBUUDrm 4167 +VPDPBUUDrr 4168 +VPDPWSSDSYrm 4169 +VPDPWSSDSYrr 4170 +VPDPWSSDSZ 4171 +VPDPWSSDSZrm 4172 +VPDPWSSDSZrmb 4173 +VPDPWSSDSZrmbk 4174 +VPDPWSSDSZrmbkz 4175 +VPDPWSSDSZrmk 4176 +VPDPWSSDSZrmkz 4177 +VPDPWSSDSZrr 4178 +VPDPWSSDSZrrk 4179 +VPDPWSSDSZrrkz 4180 +VPDPWSSDSrm 4181 +VPDPWSSDSrr 4182 +VPDPWSSDYrm 4183 +VPDPWSSDYrr 4184 +VPDPWSSDZ 4185 +VPDPWSSDZrm 4186 +VPDPWSSDZrmb 4187 +VPDPWSSDZrmbk 4188 +VPDPWSSDZrmbkz 4189 +VPDPWSSDZrmk 4190 +VPDPWSSDZrmkz 4191 +VPDPWSSDZrr 4192 +VPDPWSSDZrrk 4193 +VPDPWSSDZrrkz 4194 +VPDPWSSDrm 4195 +VPDPWSSDrr 4196 +VPDPWSUDSYrm 4197 +VPDPWSUDSYrr 4198 +VPDPWSUDSZ 4199 +VPDPWSUDSZrm 4200 +VPDPWSUDSZrmb 4201 +VPDPWSUDSZrmbk 4202 +VPDPWSUDSZrmbkz 4203 +VPDPWSUDSZrmk 4204 +VPDPWSUDSZrmkz 4205 +VPDPWSUDSZrr 4206 +VPDPWSUDSZrrk 4207 +VPDPWSUDSZrrkz 4208 +VPDPWSUDSrm 4209 +VPDPWSUDSrr 4210 +VPDPWSUDYrm 4211 +VPDPWSUDYrr 4212 +VPDPWSUDZ 4213 +VPDPWSUDZrm 4214 +VPDPWSUDZrmb 4215 +VPDPWSUDZrmbk 4216 +VPDPWSUDZrmbkz 4217 +VPDPWSUDZrmk 4218 +VPDPWSUDZrmkz 4219 +VPDPWSUDZrr 4220 +VPDPWSUDZrrk 4221 +VPDPWSUDZrrkz 4222 +VPDPWSUDrm 4223 +VPDPWSUDrr 4224 +VPDPWUSDSYrm 4225 +VPDPWUSDSYrr 4226 +VPDPWUSDSZ 4227 +VPDPWUSDSZrm 4228 +VPDPWUSDSZrmb 4229 +VPDPWUSDSZrmbk 4230 +VPDPWUSDSZrmbkz 4231 +VPDPWUSDSZrmk 4232 +VPDPWUSDSZrmkz 4233 +VPDPWUSDSZrr 4234 +VPDPWUSDSZrrk 4235 +VPDPWUSDSZrrkz 4236 +VPDPWUSDSrm 4237 +VPDPWUSDSrr 4238 +VPDPWUSDYrm 4239 +VPDPWUSDYrr 4240 +VPDPWUSDZ 4241 +VPDPWUSDZrm 4242 +VPDPWUSDZrmb 4243 +VPDPWUSDZrmbk 4244 +VPDPWUSDZrmbkz 4245 +VPDPWUSDZrmk 4246 +VPDPWUSDZrmkz 4247 +VPDPWUSDZrr 4248 +VPDPWUSDZrrk 4249 +VPDPWUSDZrrkz 4250 +VPDPWUSDrm 4251 +VPDPWUSDrr 4252 +VPDPWUUDSYrm 4253 +VPDPWUUDSYrr 4254 +VPDPWUUDSZ 4255 +VPDPWUUDSZrm 4256 +VPDPWUUDSZrmb 4257 +VPDPWUUDSZrmbk 4258 +VPDPWUUDSZrmbkz 4259 +VPDPWUUDSZrmk 4260 +VPDPWUUDSZrmkz 4261 +VPDPWUUDSZrr 4262 +VPDPWUUDSZrrk 4263 +VPDPWUUDSZrrkz 4264 
+VPDPWUUDSrm 4265 +VPDPWUUDSrr 4266 +VPDPWUUDYrm 4267 +VPDPWUUDYrr 4268 +VPDPWUUDZ 4269 +VPDPWUUDZrm 4270 +VPDPWUUDZrmb 4271 +VPDPWUUDZrmbk 4272 +VPDPWUUDZrmbkz 4273 +VPDPWUUDZrmk 4274 +VPDPWUUDZrmkz 4275 +VPDPWUUDZrr 4276 +VPDPWUUDZrrk 4277 +VPDPWUUDZrrkz 4278 +VPDPWUUDrm 4279 +VPDPWUUDrr 4280 +VPERM 4281 +VPERMBZ 4282 +VPERMBZrm 4283 +VPERMBZrmk 4284 +VPERMBZrmkz 4285 +VPERMBZrr 4286 +VPERMBZrrk 4287 +VPERMBZrrkz 4288 +VPERMDYrm 4289 +VPERMDYrr 4290 +VPERMDZ 4291 +VPERMDZrm 4292 +VPERMDZrmb 4293 +VPERMDZrmbk 4294 +VPERMDZrmbkz 4295 +VPERMDZrmk 4296 +VPERMDZrmkz 4297 +VPERMDZrr 4298 +VPERMDZrrk 4299 +VPERMDZrrkz 4300 +VPERMI 4301 +VPERMIL 4302 +VPERMILPDYmi 4303 +VPERMILPDYri 4304 +VPERMILPDYrm 4305 +VPERMILPDYrr 4306 +VPERMILPDZ 4307 +VPERMILPDZmbi 4308 +VPERMILPDZmbik 4309 +VPERMILPDZmbikz 4310 +VPERMILPDZmi 4311 +VPERMILPDZmik 4312 +VPERMILPDZmikz 4313 +VPERMILPDZri 4314 +VPERMILPDZrik 4315 +VPERMILPDZrikz 4316 +VPERMILPDZrm 4317 +VPERMILPDZrmb 4318 +VPERMILPDZrmbk 4319 +VPERMILPDZrmbkz 4320 +VPERMILPDZrmk 4321 +VPERMILPDZrmkz 4322 +VPERMILPDZrr 4323 +VPERMILPDZrrk 4324 +VPERMILPDZrrkz 4325 +VPERMILPDmi 4326 +VPERMILPDri 4327 +VPERMILPDrm 4328 +VPERMILPDrr 4329 +VPERMILPSYmi 4330 +VPERMILPSYri 4331 +VPERMILPSYrm 4332 +VPERMILPSYrr 4333 +VPERMILPSZ 4334 +VPERMILPSZmbi 4335 +VPERMILPSZmbik 4336 +VPERMILPSZmbikz 4337 +VPERMILPSZmi 4338 +VPERMILPSZmik 4339 +VPERMILPSZmikz 4340 +VPERMILPSZri 4341 +VPERMILPSZrik 4342 +VPERMILPSZrikz 4343 +VPERMILPSZrm 4344 +VPERMILPSZrmb 4345 +VPERMILPSZrmbk 4346 +VPERMILPSZrmbkz 4347 +VPERMILPSZrmk 4348 +VPERMILPSZrmkz 4349 +VPERMILPSZrr 4350 +VPERMILPSZrrk 4351 +VPERMILPSZrrkz 4352 +VPERMILPSmi 4353 +VPERMILPSri 4354 +VPERMILPSrm 4355 +VPERMILPSrr 4356 +VPERMPDYmi 4357 +VPERMPDYri 4358 +VPERMPDZ 4359 +VPERMPDZmbi 4360 +VPERMPDZmbik 4361 +VPERMPDZmbikz 4362 +VPERMPDZmi 4363 +VPERMPDZmik 4364 +VPERMPDZmikz 4365 +VPERMPDZri 4366 +VPERMPDZrik 4367 +VPERMPDZrikz 4368 +VPERMPDZrm 4369 +VPERMPDZrmb 4370 +VPERMPDZrmbk 4371 +VPERMPDZrmbkz 
4372 +VPERMPDZrmk 4373 +VPERMPDZrmkz 4374 +VPERMPDZrr 4375 +VPERMPDZrrk 4376 +VPERMPDZrrkz 4377 +VPERMPSYrm 4378 +VPERMPSYrr 4379 +VPERMPSZ 4380 +VPERMPSZrm 4381 +VPERMPSZrmb 4382 +VPERMPSZrmbk 4383 +VPERMPSZrmbkz 4384 +VPERMPSZrmk 4385 +VPERMPSZrmkz 4386 +VPERMPSZrr 4387 +VPERMPSZrrk 4388 +VPERMPSZrrkz 4389 +VPERMQYmi 4390 +VPERMQYri 4391 +VPERMQZ 4392 +VPERMQZmbi 4393 +VPERMQZmbik 4394 +VPERMQZmbikz 4395 +VPERMQZmi 4396 +VPERMQZmik 4397 +VPERMQZmikz 4398 +VPERMQZri 4399 +VPERMQZrik 4400 +VPERMQZrikz 4401 +VPERMQZrm 4402 +VPERMQZrmb 4403 +VPERMQZrmbk 4404 +VPERMQZrmbkz 4405 +VPERMQZrmk 4406 +VPERMQZrmkz 4407 +VPERMQZrr 4408 +VPERMQZrrk 4409 +VPERMQZrrkz 4410 +VPERMT 4411 +VPERMWZ 4412 +VPERMWZrm 4413 +VPERMWZrmk 4414 +VPERMWZrmkz 4415 +VPERMWZrr 4416 +VPERMWZrrk 4417 +VPERMWZrrkz 4418 +VPEXPANDBZ 4419 +VPEXPANDBZrm 4420 +VPEXPANDBZrmk 4421 +VPEXPANDBZrmkz 4422 +VPEXPANDBZrr 4423 +VPEXPANDBZrrk 4424 +VPEXPANDBZrrkz 4425 +VPEXPANDDZ 4426 +VPEXPANDDZrm 4427 +VPEXPANDDZrmk 4428 +VPEXPANDDZrmkz 4429 +VPEXPANDDZrr 4430 +VPEXPANDDZrrk 4431 +VPEXPANDDZrrkz 4432 +VPEXPANDQZ 4433 +VPEXPANDQZrm 4434 +VPEXPANDQZrmk 4435 +VPEXPANDQZrmkz 4436 +VPEXPANDQZrr 4437 +VPEXPANDQZrrk 4438 +VPEXPANDQZrrkz 4439 +VPEXPANDWZ 4440 +VPEXPANDWZrm 4441 +VPEXPANDWZrmk 4442 +VPEXPANDWZrmkz 4443 +VPEXPANDWZrr 4444 +VPEXPANDWZrrk 4445 +VPEXPANDWZrrkz 4446 +VPEXTRBZmri 4447 +VPEXTRBZrri 4448 +VPEXTRBmri 4449 +VPEXTRBrri 4450 +VPEXTRDZmri 4451 +VPEXTRDZrri 4452 +VPEXTRDmri 4453 +VPEXTRDrri 4454 +VPEXTRQZmri 4455 +VPEXTRQZrri 4456 +VPEXTRQmri 4457 +VPEXTRQrri 4458 +VPEXTRWZmri 4459 +VPEXTRWZrri 4460 +VPEXTRWZrri_REV 4461 +VPEXTRWmri 4462 +VPEXTRWrri 4463 +VPEXTRWrri_REV 4464 +VPGATHERDDYrm 4465 +VPGATHERDDZ 4466 +VPGATHERDDZrm 4467 +VPGATHERDDrm 4468 +VPGATHERDQYrm 4469 +VPGATHERDQZ 4470 +VPGATHERDQZrm 4471 +VPGATHERDQrm 4472 +VPGATHERQDYrm 4473 +VPGATHERQDZ 4474 +VPGATHERQDZrm 4475 +VPGATHERQDrm 4476 +VPGATHERQQYrm 4477 +VPGATHERQQZ 4478 +VPGATHERQQZrm 4479 +VPGATHERQQrm 4480 +VPHADDBDrm 4481 
+VPHADDBDrr 4482 +VPHADDBQrm 4483 +VPHADDBQrr 4484 +VPHADDBWrm 4485 +VPHADDBWrr 4486 +VPHADDDQrm 4487 +VPHADDDQrr 4488 +VPHADDDYrm 4489 +VPHADDDYrr 4490 +VPHADDDrm 4491 +VPHADDDrr 4492 +VPHADDSWYrm 4493 +VPHADDSWYrr 4494 +VPHADDSWrm 4495 +VPHADDSWrr 4496 +VPHADDUBDrm 4497 +VPHADDUBDrr 4498 +VPHADDUBQrm 4499 +VPHADDUBQrr 4500 +VPHADDUBWrm 4501 +VPHADDUBWrr 4502 +VPHADDUDQrm 4503 +VPHADDUDQrr 4504 +VPHADDUWDrm 4505 +VPHADDUWDrr 4506 +VPHADDUWQrm 4507 +VPHADDUWQrr 4508 +VPHADDWDrm 4509 +VPHADDWDrr 4510 +VPHADDWQrm 4511 +VPHADDWQrr 4512 +VPHADDWYrm 4513 +VPHADDWYrr 4514 +VPHADDWrm 4515 +VPHADDWrr 4516 +VPHMINPOSUWrm 4517 +VPHMINPOSUWrr 4518 +VPHSUBBWrm 4519 +VPHSUBBWrr 4520 +VPHSUBDQrm 4521 +VPHSUBDQrr 4522 +VPHSUBDYrm 4523 +VPHSUBDYrr 4524 +VPHSUBDrm 4525 +VPHSUBDrr 4526 +VPHSUBSWYrm 4527 +VPHSUBSWYrr 4528 +VPHSUBSWrm 4529 +VPHSUBSWrr 4530 +VPHSUBWDrm 4531 +VPHSUBWDrr 4532 +VPHSUBWYrm 4533 +VPHSUBWYrr 4534 +VPHSUBWrm 4535 +VPHSUBWrr 4536 +VPINSRBZrmi 4537 +VPINSRBZrri 4538 +VPINSRBrmi 4539 +VPINSRBrri 4540 +VPINSRDZrmi 4541 +VPINSRDZrri 4542 +VPINSRDrmi 4543 +VPINSRDrri 4544 +VPINSRQZrmi 4545 +VPINSRQZrri 4546 +VPINSRQrmi 4547 +VPINSRQrri 4548 +VPINSRWZrmi 4549 +VPINSRWZrri 4550 +VPINSRWrmi 4551 +VPINSRWrri 4552 +VPLZCNTDZ 4553 +VPLZCNTDZrm 4554 +VPLZCNTDZrmb 4555 +VPLZCNTDZrmbk 4556 +VPLZCNTDZrmbkz 4557 +VPLZCNTDZrmk 4558 +VPLZCNTDZrmkz 4559 +VPLZCNTDZrr 4560 +VPLZCNTDZrrk 4561 +VPLZCNTDZrrkz 4562 +VPLZCNTQZ 4563 +VPLZCNTQZrm 4564 +VPLZCNTQZrmb 4565 +VPLZCNTQZrmbk 4566 +VPLZCNTQZrmbkz 4567 +VPLZCNTQZrmk 4568 +VPLZCNTQZrmkz 4569 +VPLZCNTQZrr 4570 +VPLZCNTQZrrk 4571 +VPLZCNTQZrrkz 4572 +VPMACSDDrm 4573 +VPMACSDDrr 4574 +VPMACSDQHrm 4575 +VPMACSDQHrr 4576 +VPMACSDQLrm 4577 +VPMACSDQLrr 4578 +VPMACSSDDrm 4579 +VPMACSSDDrr 4580 +VPMACSSDQHrm 4581 +VPMACSSDQHrr 4582 +VPMACSSDQLrm 4583 +VPMACSSDQLrr 4584 +VPMACSSWDrm 4585 +VPMACSSWDrr 4586 +VPMACSSWWrm 4587 +VPMACSSWWrr 4588 +VPMACSWDrm 4589 +VPMACSWDrr 4590 +VPMACSWWrm 4591 +VPMACSWWrr 4592 +VPMADCSSWDrm 4593 +VPMADCSSWDrr 
4594 +VPMADCSWDrm 4595 +VPMADCSWDrr 4596 +VPMADD 4597 +VPMADDUBSWYrm 4598 +VPMADDUBSWYrr 4599 +VPMADDUBSWZ 4600 +VPMADDUBSWZrm 4601 +VPMADDUBSWZrmk 4602 +VPMADDUBSWZrmkz 4603 +VPMADDUBSWZrr 4604 +VPMADDUBSWZrrk 4605 +VPMADDUBSWZrrkz 4606 +VPMADDUBSWrm 4607 +VPMADDUBSWrr 4608 +VPMADDWDYrm 4609 +VPMADDWDYrr 4610 +VPMADDWDZ 4611 +VPMADDWDZrm 4612 +VPMADDWDZrmk 4613 +VPMADDWDZrmkz 4614 +VPMADDWDZrr 4615 +VPMADDWDZrrk 4616 +VPMADDWDZrrkz 4617 +VPMADDWDrm 4618 +VPMADDWDrr 4619 +VPMASKMOVDYmr 4620 +VPMASKMOVDYrm 4621 +VPMASKMOVDmr 4622 +VPMASKMOVDrm 4623 +VPMASKMOVQYmr 4624 +VPMASKMOVQYrm 4625 +VPMASKMOVQmr 4626 +VPMASKMOVQrm 4627 +VPMAXSBYrm 4628 +VPMAXSBYrr 4629 +VPMAXSBZ 4630 +VPMAXSBZrm 4631 +VPMAXSBZrmk 4632 +VPMAXSBZrmkz 4633 +VPMAXSBZrr 4634 +VPMAXSBZrrk 4635 +VPMAXSBZrrkz 4636 +VPMAXSBrm 4637 +VPMAXSBrr 4638 +VPMAXSDYrm 4639 +VPMAXSDYrr 4640 +VPMAXSDZ 4641 +VPMAXSDZrm 4642 +VPMAXSDZrmb 4643 +VPMAXSDZrmbk 4644 +VPMAXSDZrmbkz 4645 +VPMAXSDZrmk 4646 +VPMAXSDZrmkz 4647 +VPMAXSDZrr 4648 +VPMAXSDZrrk 4649 +VPMAXSDZrrkz 4650 +VPMAXSDrm 4651 +VPMAXSDrr 4652 +VPMAXSQZ 4653 +VPMAXSQZrm 4654 +VPMAXSQZrmb 4655 +VPMAXSQZrmbk 4656 +VPMAXSQZrmbkz 4657 +VPMAXSQZrmk 4658 +VPMAXSQZrmkz 4659 +VPMAXSQZrr 4660 +VPMAXSQZrrk 4661 +VPMAXSQZrrkz 4662 +VPMAXSWYrm 4663 +VPMAXSWYrr 4664 +VPMAXSWZ 4665 +VPMAXSWZrm 4666 +VPMAXSWZrmk 4667 +VPMAXSWZrmkz 4668 +VPMAXSWZrr 4669 +VPMAXSWZrrk 4670 +VPMAXSWZrrkz 4671 +VPMAXSWrm 4672 +VPMAXSWrr 4673 +VPMAXUBYrm 4674 +VPMAXUBYrr 4675 +VPMAXUBZ 4676 +VPMAXUBZrm 4677 +VPMAXUBZrmk 4678 +VPMAXUBZrmkz 4679 +VPMAXUBZrr 4680 +VPMAXUBZrrk 4681 +VPMAXUBZrrkz 4682 +VPMAXUBrm 4683 +VPMAXUBrr 4684 +VPMAXUDYrm 4685 +VPMAXUDYrr 4686 +VPMAXUDZ 4687 +VPMAXUDZrm 4688 +VPMAXUDZrmb 4689 +VPMAXUDZrmbk 4690 +VPMAXUDZrmbkz 4691 +VPMAXUDZrmk 4692 +VPMAXUDZrmkz 4693 +VPMAXUDZrr 4694 +VPMAXUDZrrk 4695 +VPMAXUDZrrkz 4696 +VPMAXUDrm 4697 +VPMAXUDrr 4698 +VPMAXUQZ 4699 +VPMAXUQZrm 4700 +VPMAXUQZrmb 4701 +VPMAXUQZrmbk 4702 +VPMAXUQZrmbkz 4703 +VPMAXUQZrmk 4704 +VPMAXUQZrmkz 4705 
+VPMAXUQZrr 4706 +VPMAXUQZrrk 4707 +VPMAXUQZrrkz 4708 +VPMAXUWYrm 4709 +VPMAXUWYrr 4710 +VPMAXUWZ 4711 +VPMAXUWZrm 4712 +VPMAXUWZrmk 4713 +VPMAXUWZrmkz 4714 +VPMAXUWZrr 4715 +VPMAXUWZrrk 4716 +VPMAXUWZrrkz 4717 +VPMAXUWrm 4718 +VPMAXUWrr 4719 +VPMINSBYrm 4720 +VPMINSBYrr 4721 +VPMINSBZ 4722 +VPMINSBZrm 4723 +VPMINSBZrmk 4724 +VPMINSBZrmkz 4725 +VPMINSBZrr 4726 +VPMINSBZrrk 4727 +VPMINSBZrrkz 4728 +VPMINSBrm 4729 +VPMINSBrr 4730 +VPMINSDYrm 4731 +VPMINSDYrr 4732 +VPMINSDZ 4733 +VPMINSDZrm 4734 +VPMINSDZrmb 4735 +VPMINSDZrmbk 4736 +VPMINSDZrmbkz 4737 +VPMINSDZrmk 4738 +VPMINSDZrmkz 4739 +VPMINSDZrr 4740 +VPMINSDZrrk 4741 +VPMINSDZrrkz 4742 +VPMINSDrm 4743 +VPMINSDrr 4744 +VPMINSQZ 4745 +VPMINSQZrm 4746 +VPMINSQZrmb 4747 +VPMINSQZrmbk 4748 +VPMINSQZrmbkz 4749 +VPMINSQZrmk 4750 +VPMINSQZrmkz 4751 +VPMINSQZrr 4752 +VPMINSQZrrk 4753 +VPMINSQZrrkz 4754 +VPMINSWYrm 4755 +VPMINSWYrr 4756 +VPMINSWZ 4757 +VPMINSWZrm 4758 +VPMINSWZrmk 4759 +VPMINSWZrmkz 4760 +VPMINSWZrr 4761 +VPMINSWZrrk 4762 +VPMINSWZrrkz 4763 +VPMINSWrm 4764 +VPMINSWrr 4765 +VPMINUBYrm 4766 +VPMINUBYrr 4767 +VPMINUBZ 4768 +VPMINUBZrm 4769 +VPMINUBZrmk 4770 +VPMINUBZrmkz 4771 +VPMINUBZrr 4772 +VPMINUBZrrk 4773 +VPMINUBZrrkz 4774 +VPMINUBrm 4775 +VPMINUBrr 4776 +VPMINUDYrm 4777 +VPMINUDYrr 4778 +VPMINUDZ 4779 +VPMINUDZrm 4780 +VPMINUDZrmb 4781 +VPMINUDZrmbk 4782 +VPMINUDZrmbkz 4783 +VPMINUDZrmk 4784 +VPMINUDZrmkz 4785 +VPMINUDZrr 4786 +VPMINUDZrrk 4787 +VPMINUDZrrkz 4788 +VPMINUDrm 4789 +VPMINUDrr 4790 +VPMINUQZ 4791 +VPMINUQZrm 4792 +VPMINUQZrmb 4793 +VPMINUQZrmbk 4794 +VPMINUQZrmbkz 4795 +VPMINUQZrmk 4796 +VPMINUQZrmkz 4797 +VPMINUQZrr 4798 +VPMINUQZrrk 4799 +VPMINUQZrrkz 4800 +VPMINUWYrm 4801 +VPMINUWYrr 4802 +VPMINUWZ 4803 +VPMINUWZrm 4804 +VPMINUWZrmk 4805 +VPMINUWZrmkz 4806 +VPMINUWZrr 4807 +VPMINUWZrrk 4808 +VPMINUWZrrkz 4809 +VPMINUWrm 4810 +VPMINUWrr 4811 +VPMOVB 4812 +VPMOVD 4813 +VPMOVDBZ 4814 +VPMOVDBZmr 4815 +VPMOVDBZmrk 4816 +VPMOVDBZrr 4817 +VPMOVDBZrrk 4818 +VPMOVDBZrrkz 4819 +VPMOVDWZ 4820 
+VPMOVDWZmr 4821 +VPMOVDWZmrk 4822 +VPMOVDWZrr 4823 +VPMOVDWZrrk 4824 +VPMOVDWZrrkz 4825 +VPMOVM 4826 +VPMOVMSKBYrr 4827 +VPMOVMSKBrr 4828 +VPMOVQ 4829 +VPMOVQBZ 4830 +VPMOVQBZmr 4831 +VPMOVQBZmrk 4832 +VPMOVQBZrr 4833 +VPMOVQBZrrk 4834 +VPMOVQBZrrkz 4835 +VPMOVQDZ 4836 +VPMOVQDZmr 4837 +VPMOVQDZmrk 4838 +VPMOVQDZrr 4839 +VPMOVQDZrrk 4840 +VPMOVQDZrrkz 4841 +VPMOVQWZ 4842 +VPMOVQWZmr 4843 +VPMOVQWZmrk 4844 +VPMOVQWZrr 4845 +VPMOVQWZrrk 4846 +VPMOVQWZrrkz 4847 +VPMOVSDBZ 4848 +VPMOVSDBZmr 4849 +VPMOVSDBZmrk 4850 +VPMOVSDBZrr 4851 +VPMOVSDBZrrk 4852 +VPMOVSDBZrrkz 4853 +VPMOVSDWZ 4854 +VPMOVSDWZmr 4855 +VPMOVSDWZmrk 4856 +VPMOVSDWZrr 4857 +VPMOVSDWZrrk 4858 +VPMOVSDWZrrkz 4859 +VPMOVSQBZ 4860 +VPMOVSQBZmr 4861 +VPMOVSQBZmrk 4862 +VPMOVSQBZrr 4863 +VPMOVSQBZrrk 4864 +VPMOVSQBZrrkz 4865 +VPMOVSQDZ 4866 +VPMOVSQDZmr 4867 +VPMOVSQDZmrk 4868 +VPMOVSQDZrr 4869 +VPMOVSQDZrrk 4870 +VPMOVSQDZrrkz 4871 +VPMOVSQWZ 4872 +VPMOVSQWZmr 4873 +VPMOVSQWZmrk 4874 +VPMOVSQWZrr 4875 +VPMOVSQWZrrk 4876 +VPMOVSQWZrrkz 4877 +VPMOVSWBZ 4878 +VPMOVSWBZmr 4879 +VPMOVSWBZmrk 4880 +VPMOVSWBZrr 4881 +VPMOVSWBZrrk 4882 +VPMOVSWBZrrkz 4883 +VPMOVSXBDYrm 4884 +VPMOVSXBDYrr 4885 +VPMOVSXBDZ 4886 +VPMOVSXBDZrm 4887 +VPMOVSXBDZrmk 4888 +VPMOVSXBDZrmkz 4889 +VPMOVSXBDZrr 4890 +VPMOVSXBDZrrk 4891 +VPMOVSXBDZrrkz 4892 +VPMOVSXBDrm 4893 +VPMOVSXBDrr 4894 +VPMOVSXBQYrm 4895 +VPMOVSXBQYrr 4896 +VPMOVSXBQZ 4897 +VPMOVSXBQZrm 4898 +VPMOVSXBQZrmk 4899 +VPMOVSXBQZrmkz 4900 +VPMOVSXBQZrr 4901 +VPMOVSXBQZrrk 4902 +VPMOVSXBQZrrkz 4903 +VPMOVSXBQrm 4904 +VPMOVSXBQrr 4905 +VPMOVSXBWYrm 4906 +VPMOVSXBWYrr 4907 +VPMOVSXBWZ 4908 +VPMOVSXBWZrm 4909 +VPMOVSXBWZrmk 4910 +VPMOVSXBWZrmkz 4911 +VPMOVSXBWZrr 4912 +VPMOVSXBWZrrk 4913 +VPMOVSXBWZrrkz 4914 +VPMOVSXBWrm 4915 +VPMOVSXBWrr 4916 +VPMOVSXDQYrm 4917 +VPMOVSXDQYrr 4918 +VPMOVSXDQZ 4919 +VPMOVSXDQZrm 4920 +VPMOVSXDQZrmk 4921 +VPMOVSXDQZrmkz 4922 +VPMOVSXDQZrr 4923 +VPMOVSXDQZrrk 4924 +VPMOVSXDQZrrkz 4925 +VPMOVSXDQrm 4926 +VPMOVSXDQrr 4927 +VPMOVSXWDYrm 4928 
+VPMOVSXWDYrr 4929 +VPMOVSXWDZ 4930 +VPMOVSXWDZrm 4931 +VPMOVSXWDZrmk 4932 +VPMOVSXWDZrmkz 4933 +VPMOVSXWDZrr 4934 +VPMOVSXWDZrrk 4935 +VPMOVSXWDZrrkz 4936 +VPMOVSXWDrm 4937 +VPMOVSXWDrr 4938 +VPMOVSXWQYrm 4939 +VPMOVSXWQYrr 4940 +VPMOVSXWQZ 4941 +VPMOVSXWQZrm 4942 +VPMOVSXWQZrmk 4943 +VPMOVSXWQZrmkz 4944 +VPMOVSXWQZrr 4945 +VPMOVSXWQZrrk 4946 +VPMOVSXWQZrrkz 4947 +VPMOVSXWQrm 4948 +VPMOVSXWQrr 4949 +VPMOVUSDBZ 4950 +VPMOVUSDBZmr 4951 +VPMOVUSDBZmrk 4952 +VPMOVUSDBZrr 4953 +VPMOVUSDBZrrk 4954 +VPMOVUSDBZrrkz 4955 +VPMOVUSDWZ 4956 +VPMOVUSDWZmr 4957 +VPMOVUSDWZmrk 4958 +VPMOVUSDWZrr 4959 +VPMOVUSDWZrrk 4960 +VPMOVUSDWZrrkz 4961 +VPMOVUSQBZ 4962 +VPMOVUSQBZmr 4963 +VPMOVUSQBZmrk 4964 +VPMOVUSQBZrr 4965 +VPMOVUSQBZrrk 4966 +VPMOVUSQBZrrkz 4967 +VPMOVUSQDZ 4968 +VPMOVUSQDZmr 4969 +VPMOVUSQDZmrk 4970 +VPMOVUSQDZrr 4971 +VPMOVUSQDZrrk 4972 +VPMOVUSQDZrrkz 4973 +VPMOVUSQWZ 4974 +VPMOVUSQWZmr 4975 +VPMOVUSQWZmrk 4976 +VPMOVUSQWZrr 4977 +VPMOVUSQWZrrk 4978 +VPMOVUSQWZrrkz 4979 +VPMOVUSWBZ 4980 +VPMOVUSWBZmr 4981 +VPMOVUSWBZmrk 4982 +VPMOVUSWBZrr 4983 +VPMOVUSWBZrrk 4984 +VPMOVUSWBZrrkz 4985 +VPMOVW 4986 +VPMOVWBZ 4987 +VPMOVWBZmr 4988 +VPMOVWBZmrk 4989 +VPMOVWBZrr 4990 +VPMOVWBZrrk 4991 +VPMOVWBZrrkz 4992 +VPMOVZXBDYrm 4993 +VPMOVZXBDYrr 4994 +VPMOVZXBDZ 4995 +VPMOVZXBDZrm 4996 +VPMOVZXBDZrmk 4997 +VPMOVZXBDZrmkz 4998 +VPMOVZXBDZrr 4999 +VPMOVZXBDZrrk 5000 +VPMOVZXBDZrrkz 5001 +VPMOVZXBDrm 5002 +VPMOVZXBDrr 5003 +VPMOVZXBQYrm 5004 +VPMOVZXBQYrr 5005 +VPMOVZXBQZ 5006 +VPMOVZXBQZrm 5007 +VPMOVZXBQZrmk 5008 +VPMOVZXBQZrmkz 5009 +VPMOVZXBQZrr 5010 +VPMOVZXBQZrrk 5011 +VPMOVZXBQZrrkz 5012 +VPMOVZXBQrm 5013 +VPMOVZXBQrr 5014 +VPMOVZXBWYrm 5015 +VPMOVZXBWYrr 5016 +VPMOVZXBWZ 5017 +VPMOVZXBWZrm 5018 +VPMOVZXBWZrmk 5019 +VPMOVZXBWZrmkz 5020 +VPMOVZXBWZrr 5021 +VPMOVZXBWZrrk 5022 +VPMOVZXBWZrrkz 5023 +VPMOVZXBWrm 5024 +VPMOVZXBWrr 5025 +VPMOVZXDQYrm 5026 +VPMOVZXDQYrr 5027 +VPMOVZXDQZ 5028 +VPMOVZXDQZrm 5029 +VPMOVZXDQZrmk 5030 +VPMOVZXDQZrmkz 5031 +VPMOVZXDQZrr 5032 +VPMOVZXDQZrrk 
5033 +VPMOVZXDQZrrkz 5034 +VPMOVZXDQrm 5035 +VPMOVZXDQrr 5036 +VPMOVZXWDYrm 5037 +VPMOVZXWDYrr 5038 +VPMOVZXWDZ 5039 +VPMOVZXWDZrm 5040 +VPMOVZXWDZrmk 5041 +VPMOVZXWDZrmkz 5042 +VPMOVZXWDZrr 5043 +VPMOVZXWDZrrk 5044 +VPMOVZXWDZrrkz 5045 +VPMOVZXWDrm 5046 +VPMOVZXWDrr 5047 +VPMOVZXWQYrm 5048 +VPMOVZXWQYrr 5049 +VPMOVZXWQZ 5050 +VPMOVZXWQZrm 5051 +VPMOVZXWQZrmk 5052 +VPMOVZXWQZrmkz 5053 +VPMOVZXWQZrr 5054 +VPMOVZXWQZrrk 5055 +VPMOVZXWQZrrkz 5056 +VPMOVZXWQrm 5057 +VPMOVZXWQrr 5058 +VPMULDQYrm 5059 +VPMULDQYrr 5060 +VPMULDQZ 5061 +VPMULDQZrm 5062 +VPMULDQZrmb 5063 +VPMULDQZrmbk 5064 +VPMULDQZrmbkz 5065 +VPMULDQZrmk 5066 +VPMULDQZrmkz 5067 +VPMULDQZrr 5068 +VPMULDQZrrk 5069 +VPMULDQZrrkz 5070 +VPMULDQrm 5071 +VPMULDQrr 5072 +VPMULHRSWYrm 5073 +VPMULHRSWYrr 5074 +VPMULHRSWZ 5075 +VPMULHRSWZrm 5076 +VPMULHRSWZrmk 5077 +VPMULHRSWZrmkz 5078 +VPMULHRSWZrr 5079 +VPMULHRSWZrrk 5080 +VPMULHRSWZrrkz 5081 +VPMULHRSWrm 5082 +VPMULHRSWrr 5083 +VPMULHUWYrm 5084 +VPMULHUWYrr 5085 +VPMULHUWZ 5086 +VPMULHUWZrm 5087 +VPMULHUWZrmk 5088 +VPMULHUWZrmkz 5089 +VPMULHUWZrr 5090 +VPMULHUWZrrk 5091 +VPMULHUWZrrkz 5092 +VPMULHUWrm 5093 +VPMULHUWrr 5094 +VPMULHWYrm 5095 +VPMULHWYrr 5096 +VPMULHWZ 5097 +VPMULHWZrm 5098 +VPMULHWZrmk 5099 +VPMULHWZrmkz 5100 +VPMULHWZrr 5101 +VPMULHWZrrk 5102 +VPMULHWZrrkz 5103 +VPMULHWrm 5104 +VPMULHWrr 5105 +VPMULLDYrm 5106 +VPMULLDYrr 5107 +VPMULLDZ 5108 +VPMULLDZrm 5109 +VPMULLDZrmb 5110 +VPMULLDZrmbk 5111 +VPMULLDZrmbkz 5112 +VPMULLDZrmk 5113 +VPMULLDZrmkz 5114 +VPMULLDZrr 5115 +VPMULLDZrrk 5116 +VPMULLDZrrkz 5117 +VPMULLDrm 5118 +VPMULLDrr 5119 +VPMULLQZ 5120 +VPMULLQZrm 5121 +VPMULLQZrmb 5122 +VPMULLQZrmbk 5123 +VPMULLQZrmbkz 5124 +VPMULLQZrmk 5125 +VPMULLQZrmkz 5126 +VPMULLQZrr 5127 +VPMULLQZrrk 5128 +VPMULLQZrrkz 5129 +VPMULLWYrm 5130 +VPMULLWYrr 5131 +VPMULLWZ 5132 +VPMULLWZrm 5133 +VPMULLWZrmk 5134 +VPMULLWZrmkz 5135 +VPMULLWZrr 5136 +VPMULLWZrrk 5137 +VPMULLWZrrkz 5138 +VPMULLWrm 5139 +VPMULLWrr 5140 +VPMULTISHIFTQBZ 5141 +VPMULTISHIFTQBZrm 5142 
+VPMULTISHIFTQBZrmb 5143 +VPMULTISHIFTQBZrmbk 5144 +VPMULTISHIFTQBZrmbkz 5145 +VPMULTISHIFTQBZrmk 5146 +VPMULTISHIFTQBZrmkz 5147 +VPMULTISHIFTQBZrr 5148 +VPMULTISHIFTQBZrrk 5149 +VPMULTISHIFTQBZrrkz 5150 +VPMULUDQYrm 5151 +VPMULUDQYrr 5152 +VPMULUDQZ 5153 +VPMULUDQZrm 5154 +VPMULUDQZrmb 5155 +VPMULUDQZrmbk 5156 +VPMULUDQZrmbkz 5157 +VPMULUDQZrmk 5158 +VPMULUDQZrmkz 5159 +VPMULUDQZrr 5160 +VPMULUDQZrrk 5161 +VPMULUDQZrrkz 5162 +VPMULUDQrm 5163 +VPMULUDQrr 5164 +VPOPCNTBZ 5165 +VPOPCNTBZrm 5166 +VPOPCNTBZrmk 5167 +VPOPCNTBZrmkz 5168 +VPOPCNTBZrr 5169 +VPOPCNTBZrrk 5170 +VPOPCNTBZrrkz 5171 +VPOPCNTDZ 5172 +VPOPCNTDZrm 5173 +VPOPCNTDZrmb 5174 +VPOPCNTDZrmbk 5175 +VPOPCNTDZrmbkz 5176 +VPOPCNTDZrmk 5177 +VPOPCNTDZrmkz 5178 +VPOPCNTDZrr 5179 +VPOPCNTDZrrk 5180 +VPOPCNTDZrrkz 5181 +VPOPCNTQZ 5182 +VPOPCNTQZrm 5183 +VPOPCNTQZrmb 5184 +VPOPCNTQZrmbk 5185 +VPOPCNTQZrmbkz 5186 +VPOPCNTQZrmk 5187 +VPOPCNTQZrmkz 5188 +VPOPCNTQZrr 5189 +VPOPCNTQZrrk 5190 +VPOPCNTQZrrkz 5191 +VPOPCNTWZ 5192 +VPOPCNTWZrm 5193 +VPOPCNTWZrmk 5194 +VPOPCNTWZrmkz 5195 +VPOPCNTWZrr 5196 +VPOPCNTWZrrk 5197 +VPOPCNTWZrrkz 5198 +VPORDZ 5199 +VPORDZrm 5200 +VPORDZrmb 5201 +VPORDZrmbk 5202 +VPORDZrmbkz 5203 +VPORDZrmk 5204 +VPORDZrmkz 5205 +VPORDZrr 5206 +VPORDZrrk 5207 +VPORDZrrkz 5208 +VPORQZ 5209 +VPORQZrm 5210 +VPORQZrmb 5211 +VPORQZrmbk 5212 +VPORQZrmbkz 5213 +VPORQZrmk 5214 +VPORQZrmkz 5215 +VPORQZrr 5216 +VPORQZrrk 5217 +VPORQZrrkz 5218 +VPORYrm 5219 +VPORYrr 5220 +VPORrm 5221 +VPORrr 5222 +VPPERMrmr 5223 +VPPERMrrm 5224 +VPPERMrrr 5225 +VPPERMrrr_REV 5226 +VPROLDZ 5227 +VPROLDZmbi 5228 +VPROLDZmbik 5229 +VPROLDZmbikz 5230 +VPROLDZmi 5231 +VPROLDZmik 5232 +VPROLDZmikz 5233 +VPROLDZri 5234 +VPROLDZrik 5235 +VPROLDZrikz 5236 +VPROLQZ 5237 +VPROLQZmbi 5238 +VPROLQZmbik 5239 +VPROLQZmbikz 5240 +VPROLQZmi 5241 +VPROLQZmik 5242 +VPROLQZmikz 5243 +VPROLQZri 5244 +VPROLQZrik 5245 +VPROLQZrikz 5246 +VPROLVDZ 5247 +VPROLVDZrm 5248 +VPROLVDZrmb 5249 +VPROLVDZrmbk 5250 +VPROLVDZrmbkz 5251 +VPROLVDZrmk 5252 
+VPROLVDZrmkz 5253 +VPROLVDZrr 5254 +VPROLVDZrrk 5255 +VPROLVDZrrkz 5256 +VPROLVQZ 5257 +VPROLVQZrm 5258 +VPROLVQZrmb 5259 +VPROLVQZrmbk 5260 +VPROLVQZrmbkz 5261 +VPROLVQZrmk 5262 +VPROLVQZrmkz 5263 +VPROLVQZrr 5264 +VPROLVQZrrk 5265 +VPROLVQZrrkz 5266 +VPRORDZ 5267 +VPRORDZmbi 5268 +VPRORDZmbik 5269 +VPRORDZmbikz 5270 +VPRORDZmi 5271 +VPRORDZmik 5272 +VPRORDZmikz 5273 +VPRORDZri 5274 +VPRORDZrik 5275 +VPRORDZrikz 5276 +VPRORQZ 5277 +VPRORQZmbi 5278 +VPRORQZmbik 5279 +VPRORQZmbikz 5280 +VPRORQZmi 5281 +VPRORQZmik 5282 +VPRORQZmikz 5283 +VPRORQZri 5284 +VPRORQZrik 5285 +VPRORQZrikz 5286 +VPRORVDZ 5287 +VPRORVDZrm 5288 +VPRORVDZrmb 5289 +VPRORVDZrmbk 5290 +VPRORVDZrmbkz 5291 +VPRORVDZrmk 5292 +VPRORVDZrmkz 5293 +VPRORVDZrr 5294 +VPRORVDZrrk 5295 +VPRORVDZrrkz 5296 +VPRORVQZ 5297 +VPRORVQZrm 5298 +VPRORVQZrmb 5299 +VPRORVQZrmbk 5300 +VPRORVQZrmbkz 5301 +VPRORVQZrmk 5302 +VPRORVQZrmkz 5303 +VPRORVQZrr 5304 +VPRORVQZrrk 5305 +VPRORVQZrrkz 5306 +VPROTBmi 5307 +VPROTBmr 5308 +VPROTBri 5309 +VPROTBrm 5310 +VPROTBrr 5311 +VPROTBrr_REV 5312 +VPROTDmi 5313 +VPROTDmr 5314 +VPROTDri 5315 +VPROTDrm 5316 +VPROTDrr 5317 +VPROTDrr_REV 5318 +VPROTQmi 5319 +VPROTQmr 5320 +VPROTQri 5321 +VPROTQrm 5322 +VPROTQrr 5323 +VPROTQrr_REV 5324 +VPROTWmi 5325 +VPROTWmr 5326 +VPROTWri 5327 +VPROTWrm 5328 +VPROTWrr 5329 +VPROTWrr_REV 5330 +VPSADBWYrm 5331 +VPSADBWYrr 5332 +VPSADBWZ 5333 +VPSADBWZrm 5334 +VPSADBWZrr 5335 +VPSADBWrm 5336 +VPSADBWrr 5337 +VPSCATTERDDZ 5338 +VPSCATTERDDZmr 5339 +VPSCATTERDQZ 5340 +VPSCATTERDQZmr 5341 +VPSCATTERQDZ 5342 +VPSCATTERQDZmr 5343 +VPSCATTERQQZ 5344 +VPSCATTERQQZmr 5345 +VPSHABmr 5346 +VPSHABrm 5347 +VPSHABrr 5348 +VPSHABrr_REV 5349 +VPSHADmr 5350 +VPSHADrm 5351 +VPSHADrr 5352 +VPSHADrr_REV 5353 +VPSHAQmr 5354 +VPSHAQrm 5355 +VPSHAQrr 5356 +VPSHAQrr_REV 5357 +VPSHAWmr 5358 +VPSHAWrm 5359 +VPSHAWrr 5360 +VPSHAWrr_REV 5361 +VPSHLBmr 5362 +VPSHLBrm 5363 +VPSHLBrr 5364 +VPSHLBrr_REV 5365 +VPSHLDDZ 5366 +VPSHLDDZrmbi 5367 +VPSHLDDZrmbik 5368 +VPSHLDDZrmbikz 5369 
+VPSHLDDZrmi 5370 +VPSHLDDZrmik 5371 +VPSHLDDZrmikz 5372 +VPSHLDDZrri 5373 +VPSHLDDZrrik 5374 +VPSHLDDZrrikz 5375 +VPSHLDQZ 5376 +VPSHLDQZrmbi 5377 +VPSHLDQZrmbik 5378 +VPSHLDQZrmbikz 5379 +VPSHLDQZrmi 5380 +VPSHLDQZrmik 5381 +VPSHLDQZrmikz 5382 +VPSHLDQZrri 5383 +VPSHLDQZrrik 5384 +VPSHLDQZrrikz 5385 +VPSHLDVDZ 5386 +VPSHLDVDZm 5387 +VPSHLDVDZmb 5388 +VPSHLDVDZmbk 5389 +VPSHLDVDZmbkz 5390 +VPSHLDVDZmk 5391 +VPSHLDVDZmkz 5392 +VPSHLDVDZr 5393 +VPSHLDVDZrk 5394 +VPSHLDVDZrkz 5395 +VPSHLDVQZ 5396 +VPSHLDVQZm 5397 +VPSHLDVQZmb 5398 +VPSHLDVQZmbk 5399 +VPSHLDVQZmbkz 5400 +VPSHLDVQZmk 5401 +VPSHLDVQZmkz 5402 +VPSHLDVQZr 5403 +VPSHLDVQZrk 5404 +VPSHLDVQZrkz 5405 +VPSHLDVWZ 5406 +VPSHLDVWZm 5407 +VPSHLDVWZmk 5408 +VPSHLDVWZmkz 5409 +VPSHLDVWZr 5410 +VPSHLDVWZrk 5411 +VPSHLDVWZrkz 5412 +VPSHLDWZ 5413 +VPSHLDWZrmi 5414 +VPSHLDWZrmik 5415 +VPSHLDWZrmikz 5416 +VPSHLDWZrri 5417 +VPSHLDWZrrik 5418 +VPSHLDWZrrikz 5419 +VPSHLDmr 5420 +VPSHLDrm 5421 +VPSHLDrr 5422 +VPSHLDrr_REV 5423 +VPSHLQmr 5424 +VPSHLQrm 5425 +VPSHLQrr 5426 +VPSHLQrr_REV 5427 +VPSHLWmr 5428 +VPSHLWrm 5429 +VPSHLWrr 5430 +VPSHLWrr_REV 5431 +VPSHRDDZ 5432 +VPSHRDDZrmbi 5433 +VPSHRDDZrmbik 5434 +VPSHRDDZrmbikz 5435 +VPSHRDDZrmi 5436 +VPSHRDDZrmik 5437 +VPSHRDDZrmikz 5438 +VPSHRDDZrri 5439 +VPSHRDDZrrik 5440 +VPSHRDDZrrikz 5441 +VPSHRDQZ 5442 +VPSHRDQZrmbi 5443 +VPSHRDQZrmbik 5444 +VPSHRDQZrmbikz 5445 +VPSHRDQZrmi 5446 +VPSHRDQZrmik 5447 +VPSHRDQZrmikz 5448 +VPSHRDQZrri 5449 +VPSHRDQZrrik 5450 +VPSHRDQZrrikz 5451 +VPSHRDVDZ 5452 +VPSHRDVDZm 5453 +VPSHRDVDZmb 5454 +VPSHRDVDZmbk 5455 +VPSHRDVDZmbkz 5456 +VPSHRDVDZmk 5457 +VPSHRDVDZmkz 5458 +VPSHRDVDZr 5459 +VPSHRDVDZrk 5460 +VPSHRDVDZrkz 5461 +VPSHRDVQZ 5462 +VPSHRDVQZm 5463 +VPSHRDVQZmb 5464 +VPSHRDVQZmbk 5465 +VPSHRDVQZmbkz 5466 +VPSHRDVQZmk 5467 +VPSHRDVQZmkz 5468 +VPSHRDVQZr 5469 +VPSHRDVQZrk 5470 +VPSHRDVQZrkz 5471 +VPSHRDVWZ 5472 +VPSHRDVWZm 5473 +VPSHRDVWZmk 5474 +VPSHRDVWZmkz 5475 +VPSHRDVWZr 5476 +VPSHRDVWZrk 5477 +VPSHRDVWZrkz 5478 +VPSHRDWZ 5479 
+VPSHRDWZrmi 5480 +VPSHRDWZrmik 5481 +VPSHRDWZrmikz 5482 +VPSHRDWZrri 5483 +VPSHRDWZrrik 5484 +VPSHRDWZrrikz 5485 +VPSHUFBITQMBZ 5486 +VPSHUFBITQMBZrm 5487 +VPSHUFBITQMBZrmk 5488 +VPSHUFBITQMBZrr 5489 +VPSHUFBITQMBZrrk 5490 +VPSHUFBYrm 5491 +VPSHUFBYrr 5492 +VPSHUFBZ 5493 +VPSHUFBZrm 5494 +VPSHUFBZrmk 5495 +VPSHUFBZrmkz 5496 +VPSHUFBZrr 5497 +VPSHUFBZrrk 5498 +VPSHUFBZrrkz 5499 +VPSHUFBrm 5500 +VPSHUFBrr 5501 +VPSHUFDYmi 5502 +VPSHUFDYri 5503 +VPSHUFDZ 5504 +VPSHUFDZmbi 5505 +VPSHUFDZmbik 5506 +VPSHUFDZmbikz 5507 +VPSHUFDZmi 5508 +VPSHUFDZmik 5509 +VPSHUFDZmikz 5510 +VPSHUFDZri 5511 +VPSHUFDZrik 5512 +VPSHUFDZrikz 5513 +VPSHUFDmi 5514 +VPSHUFDri 5515 +VPSHUFHWYmi 5516 +VPSHUFHWYri 5517 +VPSHUFHWZ 5518 +VPSHUFHWZmi 5519 +VPSHUFHWZmik 5520 +VPSHUFHWZmikz 5521 +VPSHUFHWZri 5522 +VPSHUFHWZrik 5523 +VPSHUFHWZrikz 5524 +VPSHUFHWmi 5525 +VPSHUFHWri 5526 +VPSHUFLWYmi 5527 +VPSHUFLWYri 5528 +VPSHUFLWZ 5529 +VPSHUFLWZmi 5530 +VPSHUFLWZmik 5531 +VPSHUFLWZmikz 5532 +VPSHUFLWZri 5533 +VPSHUFLWZrik 5534 +VPSHUFLWZrikz 5535 +VPSHUFLWmi 5536 +VPSHUFLWri 5537 +VPSIGNBYrm 5538 +VPSIGNBYrr 5539 +VPSIGNBrm 5540 +VPSIGNBrr 5541 +VPSIGNDYrm 5542 +VPSIGNDYrr 5543 +VPSIGNDrm 5544 +VPSIGNDrr 5545 +VPSIGNWYrm 5546 +VPSIGNWYrr 5547 +VPSIGNWrm 5548 +VPSIGNWrr 5549 +VPSLLDQYri 5550 +VPSLLDQZ 5551 +VPSLLDQZmi 5552 +VPSLLDQZri 5553 +VPSLLDQri 5554 +VPSLLDYri 5555 +VPSLLDYrm 5556 +VPSLLDYrr 5557 +VPSLLDZ 5558 +VPSLLDZmbi 5559 +VPSLLDZmbik 5560 +VPSLLDZmbikz 5561 +VPSLLDZmi 5562 +VPSLLDZmik 5563 +VPSLLDZmikz 5564 +VPSLLDZri 5565 +VPSLLDZrik 5566 +VPSLLDZrikz 5567 +VPSLLDZrm 5568 +VPSLLDZrmk 5569 +VPSLLDZrmkz 5570 +VPSLLDZrr 5571 +VPSLLDZrrk 5572 +VPSLLDZrrkz 5573 +VPSLLDri 5574 +VPSLLDrm 5575 +VPSLLDrr 5576 +VPSLLQYri 5577 +VPSLLQYrm 5578 +VPSLLQYrr 5579 +VPSLLQZ 5580 +VPSLLQZmbi 5581 +VPSLLQZmbik 5582 +VPSLLQZmbikz 5583 +VPSLLQZmi 5584 +VPSLLQZmik 5585 +VPSLLQZmikz 5586 +VPSLLQZri 5587 +VPSLLQZrik 5588 +VPSLLQZrikz 5589 +VPSLLQZrm 5590 +VPSLLQZrmk 5591 +VPSLLQZrmkz 5592 +VPSLLQZrr 5593 
+VPSLLQZrrk 5594 +VPSLLQZrrkz 5595 +VPSLLQri 5596 +VPSLLQrm 5597 +VPSLLQrr 5598 +VPSLLVDYrm 5599 +VPSLLVDYrr 5600 +VPSLLVDZ 5601 +VPSLLVDZrm 5602 +VPSLLVDZrmb 5603 +VPSLLVDZrmbk 5604 +VPSLLVDZrmbkz 5605 +VPSLLVDZrmk 5606 +VPSLLVDZrmkz 5607 +VPSLLVDZrr 5608 +VPSLLVDZrrk 5609 +VPSLLVDZrrkz 5610 +VPSLLVDrm 5611 +VPSLLVDrr 5612 +VPSLLVQYrm 5613 +VPSLLVQYrr 5614 +VPSLLVQZ 5615 +VPSLLVQZrm 5616 +VPSLLVQZrmb 5617 +VPSLLVQZrmbk 5618 +VPSLLVQZrmbkz 5619 +VPSLLVQZrmk 5620 +VPSLLVQZrmkz 5621 +VPSLLVQZrr 5622 +VPSLLVQZrrk 5623 +VPSLLVQZrrkz 5624 +VPSLLVQrm 5625 +VPSLLVQrr 5626 +VPSLLVWZ 5627 +VPSLLVWZrm 5628 +VPSLLVWZrmk 5629 +VPSLLVWZrmkz 5630 +VPSLLVWZrr 5631 +VPSLLVWZrrk 5632 +VPSLLVWZrrkz 5633 +VPSLLWYri 5634 +VPSLLWYrm 5635 +VPSLLWYrr 5636 +VPSLLWZ 5637 +VPSLLWZmi 5638 +VPSLLWZmik 5639 +VPSLLWZmikz 5640 +VPSLLWZri 5641 +VPSLLWZrik 5642 +VPSLLWZrikz 5643 +VPSLLWZrm 5644 +VPSLLWZrmk 5645 +VPSLLWZrmkz 5646 +VPSLLWZrr 5647 +VPSLLWZrrk 5648 +VPSLLWZrrkz 5649 +VPSLLWri 5650 +VPSLLWrm 5651 +VPSLLWrr 5652 +VPSRADYri 5653 +VPSRADYrm 5654 +VPSRADYrr 5655 +VPSRADZ 5656 +VPSRADZmbi 5657 +VPSRADZmbik 5658 +VPSRADZmbikz 5659 +VPSRADZmi 5660 +VPSRADZmik 5661 +VPSRADZmikz 5662 +VPSRADZri 5663 +VPSRADZrik 5664 +VPSRADZrikz 5665 +VPSRADZrm 5666 +VPSRADZrmk 5667 +VPSRADZrmkz 5668 +VPSRADZrr 5669 +VPSRADZrrk 5670 +VPSRADZrrkz 5671 +VPSRADri 5672 +VPSRADrm 5673 +VPSRADrr 5674 +VPSRAQZ 5675 +VPSRAQZmbi 5676 +VPSRAQZmbik 5677 +VPSRAQZmbikz 5678 +VPSRAQZmi 5679 +VPSRAQZmik 5680 +VPSRAQZmikz 5681 +VPSRAQZri 5682 +VPSRAQZrik 5683 +VPSRAQZrikz 5684 +VPSRAQZrm 5685 +VPSRAQZrmk 5686 +VPSRAQZrmkz 5687 +VPSRAQZrr 5688 +VPSRAQZrrk 5689 +VPSRAQZrrkz 5690 +VPSRAVDYrm 5691 +VPSRAVDYrr 5692 +VPSRAVDZ 5693 +VPSRAVDZrm 5694 +VPSRAVDZrmb 5695 +VPSRAVDZrmbk 5696 +VPSRAVDZrmbkz 5697 +VPSRAVDZrmk 5698 +VPSRAVDZrmkz 5699 +VPSRAVDZrr 5700 +VPSRAVDZrrk 5701 +VPSRAVDZrrkz 5702 +VPSRAVDrm 5703 +VPSRAVDrr 5704 +VPSRAVQZ 5705 +VPSRAVQZrm 5706 +VPSRAVQZrmb 5707 +VPSRAVQZrmbk 5708 +VPSRAVQZrmbkz 5709 +VPSRAVQZrmk 5710 
+VPSRAVQZrmkz 5711 +VPSRAVQZrr 5712 +VPSRAVQZrrk 5713 +VPSRAVQZrrkz 5714 +VPSRAVWZ 5715 +VPSRAVWZrm 5716 +VPSRAVWZrmk 5717 +VPSRAVWZrmkz 5718 +VPSRAVWZrr 5719 +VPSRAVWZrrk 5720 +VPSRAVWZrrkz 5721 +VPSRAWYri 5722 +VPSRAWYrm 5723 +VPSRAWYrr 5724 +VPSRAWZ 5725 +VPSRAWZmi 5726 +VPSRAWZmik 5727 +VPSRAWZmikz 5728 +VPSRAWZri 5729 +VPSRAWZrik 5730 +VPSRAWZrikz 5731 +VPSRAWZrm 5732 +VPSRAWZrmk 5733 +VPSRAWZrmkz 5734 +VPSRAWZrr 5735 +VPSRAWZrrk 5736 +VPSRAWZrrkz 5737 +VPSRAWri 5738 +VPSRAWrm 5739 +VPSRAWrr 5740 +VPSRLDQYri 5741 +VPSRLDQZ 5742 +VPSRLDQZmi 5743 +VPSRLDQZri 5744 +VPSRLDQri 5745 +VPSRLDYri 5746 +VPSRLDYrm 5747 +VPSRLDYrr 5748 +VPSRLDZ 5749 +VPSRLDZmbi 5750 +VPSRLDZmbik 5751 +VPSRLDZmbikz 5752 +VPSRLDZmi 5753 +VPSRLDZmik 5754 +VPSRLDZmikz 5755 +VPSRLDZri 5756 +VPSRLDZrik 5757 +VPSRLDZrikz 5758 +VPSRLDZrm 5759 +VPSRLDZrmk 5760 +VPSRLDZrmkz 5761 +VPSRLDZrr 5762 +VPSRLDZrrk 5763 +VPSRLDZrrkz 5764 +VPSRLDri 5765 +VPSRLDrm 5766 +VPSRLDrr 5767 +VPSRLQYri 5768 +VPSRLQYrm 5769 +VPSRLQYrr 5770 +VPSRLQZ 5771 +VPSRLQZmbi 5772 +VPSRLQZmbik 5773 +VPSRLQZmbikz 5774 +VPSRLQZmi 5775 +VPSRLQZmik 5776 +VPSRLQZmikz 5777 +VPSRLQZri 5778 +VPSRLQZrik 5779 +VPSRLQZrikz 5780 +VPSRLQZrm 5781 +VPSRLQZrmk 5782 +VPSRLQZrmkz 5783 +VPSRLQZrr 5784 +VPSRLQZrrk 5785 +VPSRLQZrrkz 5786 +VPSRLQri 5787 +VPSRLQrm 5788 +VPSRLQrr 5789 +VPSRLVDYrm 5790 +VPSRLVDYrr 5791 +VPSRLVDZ 5792 +VPSRLVDZrm 5793 +VPSRLVDZrmb 5794 +VPSRLVDZrmbk 5795 +VPSRLVDZrmbkz 5796 +VPSRLVDZrmk 5797 +VPSRLVDZrmkz 5798 +VPSRLVDZrr 5799 +VPSRLVDZrrk 5800 +VPSRLVDZrrkz 5801 +VPSRLVDrm 5802 +VPSRLVDrr 5803 +VPSRLVQYrm 5804 +VPSRLVQYrr 5805 +VPSRLVQZ 5806 +VPSRLVQZrm 5807 +VPSRLVQZrmb 5808 +VPSRLVQZrmbk 5809 +VPSRLVQZrmbkz 5810 +VPSRLVQZrmk 5811 +VPSRLVQZrmkz 5812 +VPSRLVQZrr 5813 +VPSRLVQZrrk 5814 +VPSRLVQZrrkz 5815 +VPSRLVQrm 5816 +VPSRLVQrr 5817 +VPSRLVWZ 5818 +VPSRLVWZrm 5819 +VPSRLVWZrmk 5820 +VPSRLVWZrmkz 5821 +VPSRLVWZrr 5822 +VPSRLVWZrrk 5823 +VPSRLVWZrrkz 5824 +VPSRLWYri 5825 +VPSRLWYrm 5826 +VPSRLWYrr 5827 +VPSRLWZ 5828 
+VPSRLWZmi 5829 +VPSRLWZmik 5830 +VPSRLWZmikz 5831 +VPSRLWZri 5832 +VPSRLWZrik 5833 +VPSRLWZrikz 5834 +VPSRLWZrm 5835 +VPSRLWZrmk 5836 +VPSRLWZrmkz 5837 +VPSRLWZrr 5838 +VPSRLWZrrk 5839 +VPSRLWZrrkz 5840 +VPSRLWri 5841 +VPSRLWrm 5842 +VPSRLWrr 5843 +VPSUBBYrm 5844 +VPSUBBYrr 5845 +VPSUBBZ 5846 +VPSUBBZrm 5847 +VPSUBBZrmk 5848 +VPSUBBZrmkz 5849 +VPSUBBZrr 5850 +VPSUBBZrrk 5851 +VPSUBBZrrkz 5852 +VPSUBBrm 5853 +VPSUBBrr 5854 +VPSUBDYrm 5855 +VPSUBDYrr 5856 +VPSUBDZ 5857 +VPSUBDZrm 5858 +VPSUBDZrmb 5859 +VPSUBDZrmbk 5860 +VPSUBDZrmbkz 5861 +VPSUBDZrmk 5862 +VPSUBDZrmkz 5863 +VPSUBDZrr 5864 +VPSUBDZrrk 5865 +VPSUBDZrrkz 5866 +VPSUBDrm 5867 +VPSUBDrr 5868 +VPSUBQYrm 5869 +VPSUBQYrr 5870 +VPSUBQZ 5871 +VPSUBQZrm 5872 +VPSUBQZrmb 5873 +VPSUBQZrmbk 5874 +VPSUBQZrmbkz 5875 +VPSUBQZrmk 5876 +VPSUBQZrmkz 5877 +VPSUBQZrr 5878 +VPSUBQZrrk 5879 +VPSUBQZrrkz 5880 +VPSUBQrm 5881 +VPSUBQrr 5882 +VPSUBSBYrm 5883 +VPSUBSBYrr 5884 +VPSUBSBZ 5885 +VPSUBSBZrm 5886 +VPSUBSBZrmk 5887 +VPSUBSBZrmkz 5888 +VPSUBSBZrr 5889 +VPSUBSBZrrk 5890 +VPSUBSBZrrkz 5891 +VPSUBSBrm 5892 +VPSUBSBrr 5893 +VPSUBSWYrm 5894 +VPSUBSWYrr 5895 +VPSUBSWZ 5896 +VPSUBSWZrm 5897 +VPSUBSWZrmk 5898 +VPSUBSWZrmkz 5899 +VPSUBSWZrr 5900 +VPSUBSWZrrk 5901 +VPSUBSWZrrkz 5902 +VPSUBSWrm 5903 +VPSUBSWrr 5904 +VPSUBUSBYrm 5905 +VPSUBUSBYrr 5906 +VPSUBUSBZ 5907 +VPSUBUSBZrm 5908 +VPSUBUSBZrmk 5909 +VPSUBUSBZrmkz 5910 +VPSUBUSBZrr 5911 +VPSUBUSBZrrk 5912 +VPSUBUSBZrrkz 5913 +VPSUBUSBrm 5914 +VPSUBUSBrr 5915 +VPSUBUSWYrm 5916 +VPSUBUSWYrr 5917 +VPSUBUSWZ 5918 +VPSUBUSWZrm 5919 +VPSUBUSWZrmk 5920 +VPSUBUSWZrmkz 5921 +VPSUBUSWZrr 5922 +VPSUBUSWZrrk 5923 +VPSUBUSWZrrkz 5924 +VPSUBUSWrm 5925 +VPSUBUSWrr 5926 +VPSUBWYrm 5927 +VPSUBWYrr 5928 +VPSUBWZ 5929 +VPSUBWZrm 5930 +VPSUBWZrmk 5931 +VPSUBWZrmkz 5932 +VPSUBWZrr 5933 +VPSUBWZrrk 5934 +VPSUBWZrrkz 5935 +VPSUBWrm 5936 +VPSUBWrr 5937 +VPTERNLOGDZ 5938 +VPTERNLOGDZrmbi 5939 +VPTERNLOGDZrmbik 5940 +VPTERNLOGDZrmbikz 5941 +VPTERNLOGDZrmi 5942 +VPTERNLOGDZrmik 5943 +VPTERNLOGDZrmikz 5944 
+VPTERNLOGDZrri 5945 +VPTERNLOGDZrrik 5946 +VPTERNLOGDZrrikz 5947 +VPTERNLOGQZ 5948 +VPTERNLOGQZrmbi 5949 +VPTERNLOGQZrmbik 5950 +VPTERNLOGQZrmbikz 5951 +VPTERNLOGQZrmi 5952 +VPTERNLOGQZrmik 5953 +VPTERNLOGQZrmikz 5954 +VPTERNLOGQZrri 5955 +VPTERNLOGQZrrik 5956 +VPTERNLOGQZrrikz 5957 +VPTESTMBZ 5958 +VPTESTMBZrm 5959 +VPTESTMBZrmk 5960 +VPTESTMBZrr 5961 +VPTESTMBZrrk 5962 +VPTESTMDZ 5963 +VPTESTMDZrm 5964 +VPTESTMDZrmb 5965 +VPTESTMDZrmbk 5966 +VPTESTMDZrmk 5967 +VPTESTMDZrr 5968 +VPTESTMDZrrk 5969 +VPTESTMQZ 5970 +VPTESTMQZrm 5971 +VPTESTMQZrmb 5972 +VPTESTMQZrmbk 5973 +VPTESTMQZrmk 5974 +VPTESTMQZrr 5975 +VPTESTMQZrrk 5976 +VPTESTMWZ 5977 +VPTESTMWZrm 5978 +VPTESTMWZrmk 5979 +VPTESTMWZrr 5980 +VPTESTMWZrrk 5981 +VPTESTNMBZ 5982 +VPTESTNMBZrm 5983 +VPTESTNMBZrmk 5984 +VPTESTNMBZrr 5985 +VPTESTNMBZrrk 5986 +VPTESTNMDZ 5987 +VPTESTNMDZrm 5988 +VPTESTNMDZrmb 5989 +VPTESTNMDZrmbk 5990 +VPTESTNMDZrmk 5991 +VPTESTNMDZrr 5992 +VPTESTNMDZrrk 5993 +VPTESTNMQZ 5994 +VPTESTNMQZrm 5995 +VPTESTNMQZrmb 5996 +VPTESTNMQZrmbk 5997 +VPTESTNMQZrmk 5998 +VPTESTNMQZrr 5999 +VPTESTNMQZrrk 6000 +VPTESTNMWZ 6001 +VPTESTNMWZrm 6002 +VPTESTNMWZrmk 6003 +VPTESTNMWZrr 6004 +VPTESTNMWZrrk 6005 +VPTESTYrm 6006 +VPTESTYrr 6007 +VPTESTrm 6008 +VPTESTrr 6009 +VPUNPCKHBWYrm 6010 +VPUNPCKHBWYrr 6011 +VPUNPCKHBWZ 6012 +VPUNPCKHBWZrm 6013 +VPUNPCKHBWZrmk 6014 +VPUNPCKHBWZrmkz 6015 +VPUNPCKHBWZrr 6016 +VPUNPCKHBWZrrk 6017 +VPUNPCKHBWZrrkz 6018 +VPUNPCKHBWrm 6019 +VPUNPCKHBWrr 6020 +VPUNPCKHDQYrm 6021 +VPUNPCKHDQYrr 6022 +VPUNPCKHDQZ 6023 +VPUNPCKHDQZrm 6024 +VPUNPCKHDQZrmb 6025 +VPUNPCKHDQZrmbk 6026 +VPUNPCKHDQZrmbkz 6027 +VPUNPCKHDQZrmk 6028 +VPUNPCKHDQZrmkz 6029 +VPUNPCKHDQZrr 6030 +VPUNPCKHDQZrrk 6031 +VPUNPCKHDQZrrkz 6032 +VPUNPCKHDQrm 6033 +VPUNPCKHDQrr 6034 +VPUNPCKHQDQYrm 6035 +VPUNPCKHQDQYrr 6036 +VPUNPCKHQDQZ 6037 +VPUNPCKHQDQZrm 6038 +VPUNPCKHQDQZrmb 6039 +VPUNPCKHQDQZrmbk 6040 +VPUNPCKHQDQZrmbkz 6041 +VPUNPCKHQDQZrmk 6042 +VPUNPCKHQDQZrmkz 6043 +VPUNPCKHQDQZrr 6044 +VPUNPCKHQDQZrrk 6045 
+VPUNPCKHQDQZrrkz 6046 +VPUNPCKHQDQrm 6047 +VPUNPCKHQDQrr 6048 +VPUNPCKHWDYrm 6049 +VPUNPCKHWDYrr 6050 +VPUNPCKHWDZ 6051 +VPUNPCKHWDZrm 6052 +VPUNPCKHWDZrmk 6053 +VPUNPCKHWDZrmkz 6054 +VPUNPCKHWDZrr 6055 +VPUNPCKHWDZrrk 6056 +VPUNPCKHWDZrrkz 6057 +VPUNPCKHWDrm 6058 +VPUNPCKHWDrr 6059 +VPUNPCKLBWYrm 6060 +VPUNPCKLBWYrr 6061 +VPUNPCKLBWZ 6062 +VPUNPCKLBWZrm 6063 +VPUNPCKLBWZrmk 6064 +VPUNPCKLBWZrmkz 6065 +VPUNPCKLBWZrr 6066 +VPUNPCKLBWZrrk 6067 +VPUNPCKLBWZrrkz 6068 +VPUNPCKLBWrm 6069 +VPUNPCKLBWrr 6070 +VPUNPCKLDQYrm 6071 +VPUNPCKLDQYrr 6072 +VPUNPCKLDQZ 6073 +VPUNPCKLDQZrm 6074 +VPUNPCKLDQZrmb 6075 +VPUNPCKLDQZrmbk 6076 +VPUNPCKLDQZrmbkz 6077 +VPUNPCKLDQZrmk 6078 +VPUNPCKLDQZrmkz 6079 +VPUNPCKLDQZrr 6080 +VPUNPCKLDQZrrk 6081 +VPUNPCKLDQZrrkz 6082 +VPUNPCKLDQrm 6083 +VPUNPCKLDQrr 6084 +VPUNPCKLQDQYrm 6085 +VPUNPCKLQDQYrr 6086 +VPUNPCKLQDQZ 6087 +VPUNPCKLQDQZrm 6088 +VPUNPCKLQDQZrmb 6089 +VPUNPCKLQDQZrmbk 6090 +VPUNPCKLQDQZrmbkz 6091 +VPUNPCKLQDQZrmk 6092 +VPUNPCKLQDQZrmkz 6093 +VPUNPCKLQDQZrr 6094 +VPUNPCKLQDQZrrk 6095 +VPUNPCKLQDQZrrkz 6096 +VPUNPCKLQDQrm 6097 +VPUNPCKLQDQrr 6098 +VPUNPCKLWDYrm 6099 +VPUNPCKLWDYrr 6100 +VPUNPCKLWDZ 6101 +VPUNPCKLWDZrm 6102 +VPUNPCKLWDZrmk 6103 +VPUNPCKLWDZrmkz 6104 +VPUNPCKLWDZrr 6105 +VPUNPCKLWDZrrk 6106 +VPUNPCKLWDZrrkz 6107 +VPUNPCKLWDrm 6108 +VPUNPCKLWDrr 6109 +VPXORDZ 6110 +VPXORDZrm 6111 +VPXORDZrmb 6112 +VPXORDZrmbk 6113 +VPXORDZrmbkz 6114 +VPXORDZrmk 6115 +VPXORDZrmkz 6116 +VPXORDZrr 6117 +VPXORDZrrk 6118 +VPXORDZrrkz 6119 +VPXORQZ 6120 +VPXORQZrm 6121 +VPXORQZrmb 6122 +VPXORQZrmbk 6123 +VPXORQZrmbkz 6124 +VPXORQZrmk 6125 +VPXORQZrmkz 6126 +VPXORQZrr 6127 +VPXORQZrrk 6128 +VPXORQZrrkz 6129 +VPXORYrm 6130 +VPXORYrr 6131 +VPXORrm 6132 +VPXORrr 6133 +VRANGEPDZ 6134 +VRANGEPDZrmbi 6135 +VRANGEPDZrmbik 6136 +VRANGEPDZrmbikz 6137 +VRANGEPDZrmi 6138 +VRANGEPDZrmik 6139 +VRANGEPDZrmikz 6140 +VRANGEPDZrri 6141 +VRANGEPDZrrib 6142 +VRANGEPDZrribk 6143 +VRANGEPDZrribkz 6144 +VRANGEPDZrrik 6145 +VRANGEPDZrrikz 6146 +VRANGEPSZ 6147 
+VRANGEPSZrmbi 6148 +VRANGEPSZrmbik 6149 +VRANGEPSZrmbikz 6150 +VRANGEPSZrmi 6151 +VRANGEPSZrmik 6152 +VRANGEPSZrmikz 6153 +VRANGEPSZrri 6154 +VRANGEPSZrrib 6155 +VRANGEPSZrribk 6156 +VRANGEPSZrribkz 6157 +VRANGEPSZrrik 6158 +VRANGEPSZrrikz 6159 +VRANGESDZrmi 6160 +VRANGESDZrmik 6161 +VRANGESDZrmikz 6162 +VRANGESDZrri 6163 +VRANGESDZrrib 6164 +VRANGESDZrribk 6165 +VRANGESDZrribkz 6166 +VRANGESDZrrik 6167 +VRANGESDZrrikz 6168 +VRANGESSZrmi 6169 +VRANGESSZrmik 6170 +VRANGESSZrmikz 6171 +VRANGESSZrri 6172 +VRANGESSZrrib 6173 +VRANGESSZrribk 6174 +VRANGESSZrribkz 6175 +VRANGESSZrrik 6176 +VRANGESSZrrikz 6177 +VRCP 6178 +VRCPBF 6179 +VRCPPHZ 6180 +VRCPPHZm 6181 +VRCPPHZmb 6182 +VRCPPHZmbk 6183 +VRCPPHZmbkz 6184 +VRCPPHZmk 6185 +VRCPPHZmkz 6186 +VRCPPHZr 6187 +VRCPPHZrk 6188 +VRCPPHZrkz 6189 +VRCPPSYm 6190 +VRCPPSYr 6191 +VRCPPSm 6192 +VRCPPSr 6193 +VRCPSHZrm 6194 +VRCPSHZrmk 6195 +VRCPSHZrmkz 6196 +VRCPSHZrr 6197 +VRCPSHZrrk 6198 +VRCPSHZrrkz 6199 +VRCPSSm 6200 +VRCPSSm_Int 6201 +VRCPSSr 6202 +VRCPSSr_Int 6203 +VREDUCEBF 6204 +VREDUCEPDZ 6205 +VREDUCEPDZrmbi 6206 +VREDUCEPDZrmbik 6207 +VREDUCEPDZrmbikz 6208 +VREDUCEPDZrmi 6209 +VREDUCEPDZrmik 6210 +VREDUCEPDZrmikz 6211 +VREDUCEPDZrri 6212 +VREDUCEPDZrrib 6213 +VREDUCEPDZrribk 6214 +VREDUCEPDZrribkz 6215 +VREDUCEPDZrrik 6216 +VREDUCEPDZrrikz 6217 +VREDUCEPHZ 6218 +VREDUCEPHZrmbi 6219 +VREDUCEPHZrmbik 6220 +VREDUCEPHZrmbikz 6221 +VREDUCEPHZrmi 6222 +VREDUCEPHZrmik 6223 +VREDUCEPHZrmikz 6224 +VREDUCEPHZrri 6225 +VREDUCEPHZrrib 6226 +VREDUCEPHZrribk 6227 +VREDUCEPHZrribkz 6228 +VREDUCEPHZrrik 6229 +VREDUCEPHZrrikz 6230 +VREDUCEPSZ 6231 +VREDUCEPSZrmbi 6232 +VREDUCEPSZrmbik 6233 +VREDUCEPSZrmbikz 6234 +VREDUCEPSZrmi 6235 +VREDUCEPSZrmik 6236 +VREDUCEPSZrmikz 6237 +VREDUCEPSZrri 6238 +VREDUCEPSZrrib 6239 +VREDUCEPSZrribk 6240 +VREDUCEPSZrribkz 6241 +VREDUCEPSZrrik 6242 +VREDUCEPSZrrikz 6243 +VREDUCESDZrmi 6244 +VREDUCESDZrmik 6245 +VREDUCESDZrmikz 6246 +VREDUCESDZrri 6247 +VREDUCESDZrrib 6248 +VREDUCESDZrribk 6249 
+VREDUCESDZrribkz 6250 +VREDUCESDZrrik 6251 +VREDUCESDZrrikz 6252 +VREDUCESHZrmi 6253 +VREDUCESHZrmik 6254 +VREDUCESHZrmikz 6255 +VREDUCESHZrri 6256 +VREDUCESHZrrib 6257 +VREDUCESHZrribk 6258 +VREDUCESHZrribkz 6259 +VREDUCESHZrrik 6260 +VREDUCESHZrrikz 6261 +VREDUCESSZrmi 6262 +VREDUCESSZrmik 6263 +VREDUCESSZrmikz 6264 +VREDUCESSZrri 6265 +VREDUCESSZrrib 6266 +VREDUCESSZrribk 6267 +VREDUCESSZrribkz 6268 +VREDUCESSZrrik 6269 +VREDUCESSZrrikz 6270 +VRNDSCALEBF 6271 +VRNDSCALEPDZ 6272 +VRNDSCALEPDZrmbi 6273 +VRNDSCALEPDZrmbik 6274 +VRNDSCALEPDZrmbikz 6275 +VRNDSCALEPDZrmi 6276 +VRNDSCALEPDZrmik 6277 +VRNDSCALEPDZrmikz 6278 +VRNDSCALEPDZrri 6279 +VRNDSCALEPDZrrib 6280 +VRNDSCALEPDZrribk 6281 +VRNDSCALEPDZrribkz 6282 +VRNDSCALEPDZrrik 6283 +VRNDSCALEPDZrrikz 6284 +VRNDSCALEPHZ 6285 +VRNDSCALEPHZrmbi 6286 +VRNDSCALEPHZrmbik 6287 +VRNDSCALEPHZrmbikz 6288 +VRNDSCALEPHZrmi 6289 +VRNDSCALEPHZrmik 6290 +VRNDSCALEPHZrmikz 6291 +VRNDSCALEPHZrri 6292 +VRNDSCALEPHZrrib 6293 +VRNDSCALEPHZrribk 6294 +VRNDSCALEPHZrribkz 6295 +VRNDSCALEPHZrrik 6296 +VRNDSCALEPHZrrikz 6297 +VRNDSCALEPSZ 6298 +VRNDSCALEPSZrmbi 6299 +VRNDSCALEPSZrmbik 6300 +VRNDSCALEPSZrmbikz 6301 +VRNDSCALEPSZrmi 6302 +VRNDSCALEPSZrmik 6303 +VRNDSCALEPSZrmikz 6304 +VRNDSCALEPSZrri 6305 +VRNDSCALEPSZrrib 6306 +VRNDSCALEPSZrribk 6307 +VRNDSCALEPSZrribkz 6308 +VRNDSCALEPSZrrik 6309 +VRNDSCALEPSZrrikz 6310 +VRNDSCALESDZrmi 6311 +VRNDSCALESDZrmi_Int 6312 +VRNDSCALESDZrmik_Int 6313 +VRNDSCALESDZrmikz_Int 6314 +VRNDSCALESDZrri 6315 +VRNDSCALESDZrri_Int 6316 +VRNDSCALESDZrrib_Int 6317 +VRNDSCALESDZrribk_Int 6318 +VRNDSCALESDZrribkz_Int 6319 +VRNDSCALESDZrrik_Int 6320 +VRNDSCALESDZrrikz_Int 6321 +VRNDSCALESHZrmi 6322 +VRNDSCALESHZrmi_Int 6323 +VRNDSCALESHZrmik_Int 6324 +VRNDSCALESHZrmikz_Int 6325 +VRNDSCALESHZrri 6326 +VRNDSCALESHZrri_Int 6327 +VRNDSCALESHZrrib_Int 6328 +VRNDSCALESHZrribk_Int 6329 +VRNDSCALESHZrribkz_Int 6330 +VRNDSCALESHZrrik_Int 6331 +VRNDSCALESHZrrikz_Int 6332 +VRNDSCALESSZrmi 6333 +VRNDSCALESSZrmi_Int 6334 
+VRNDSCALESSZrmik_Int 6335 +VRNDSCALESSZrmikz_Int 6336 +VRNDSCALESSZrri 6337 +VRNDSCALESSZrri_Int 6338 +VRNDSCALESSZrrib_Int 6339 +VRNDSCALESSZrribk_Int 6340 +VRNDSCALESSZrribkz_Int 6341 +VRNDSCALESSZrrik_Int 6342 +VRNDSCALESSZrrikz_Int 6343 +VROUNDPDYmi 6344 +VROUNDPDYri 6345 +VROUNDPDmi 6346 +VROUNDPDri 6347 +VROUNDPSYmi 6348 +VROUNDPSYri 6349 +VROUNDPSmi 6350 +VROUNDPSri 6351 +VROUNDSDmi 6352 +VROUNDSDmi_Int 6353 +VROUNDSDri 6354 +VROUNDSDri_Int 6355 +VROUNDSSmi 6356 +VROUNDSSmi_Int 6357 +VROUNDSSri 6358 +VROUNDSSri_Int 6359 +VRSQRT 6360 +VRSQRTBF 6361 +VRSQRTPHZ 6362 +VRSQRTPHZm 6363 +VRSQRTPHZmb 6364 +VRSQRTPHZmbk 6365 +VRSQRTPHZmbkz 6366 +VRSQRTPHZmk 6367 +VRSQRTPHZmkz 6368 +VRSQRTPHZr 6369 +VRSQRTPHZrk 6370 +VRSQRTPHZrkz 6371 +VRSQRTPSYm 6372 +VRSQRTPSYr 6373 +VRSQRTPSm 6374 +VRSQRTPSr 6375 +VRSQRTSHZrm 6376 +VRSQRTSHZrmk 6377 +VRSQRTSHZrmkz 6378 +VRSQRTSHZrr 6379 +VRSQRTSHZrrk 6380 +VRSQRTSHZrrkz 6381 +VRSQRTSSm 6382 +VRSQRTSSm_Int 6383 +VRSQRTSSr 6384 +VRSQRTSSr_Int 6385 +VSCALEFBF 6386 +VSCALEFPDZ 6387 +VSCALEFPDZrm 6388 +VSCALEFPDZrmb 6389 +VSCALEFPDZrmbk 6390 +VSCALEFPDZrmbkz 6391 +VSCALEFPDZrmk 6392 +VSCALEFPDZrmkz 6393 +VSCALEFPDZrr 6394 +VSCALEFPDZrrb 6395 +VSCALEFPDZrrbk 6396 +VSCALEFPDZrrbkz 6397 +VSCALEFPDZrrk 6398 +VSCALEFPDZrrkz 6399 +VSCALEFPHZ 6400 +VSCALEFPHZrm 6401 +VSCALEFPHZrmb 6402 +VSCALEFPHZrmbk 6403 +VSCALEFPHZrmbkz 6404 +VSCALEFPHZrmk 6405 +VSCALEFPHZrmkz 6406 +VSCALEFPHZrr 6407 +VSCALEFPHZrrb 6408 +VSCALEFPHZrrbk 6409 +VSCALEFPHZrrbkz 6410 +VSCALEFPHZrrk 6411 +VSCALEFPHZrrkz 6412 +VSCALEFPSZ 6413 +VSCALEFPSZrm 6414 +VSCALEFPSZrmb 6415 +VSCALEFPSZrmbk 6416 +VSCALEFPSZrmbkz 6417 +VSCALEFPSZrmk 6418 +VSCALEFPSZrmkz 6419 +VSCALEFPSZrr 6420 +VSCALEFPSZrrb 6421 +VSCALEFPSZrrbk 6422 +VSCALEFPSZrrbkz 6423 +VSCALEFPSZrrk 6424 +VSCALEFPSZrrkz 6425 +VSCALEFSDZrm 6426 +VSCALEFSDZrmk 6427 +VSCALEFSDZrmkz 6428 +VSCALEFSDZrr 6429 +VSCALEFSDZrrb_Int 6430 +VSCALEFSDZrrbk_Int 6431 +VSCALEFSDZrrbkz_Int 6432 +VSCALEFSDZrrk 6433 +VSCALEFSDZrrkz 6434 
+VSCALEFSHZrm 6435 +VSCALEFSHZrmk 6436 +VSCALEFSHZrmkz 6437 +VSCALEFSHZrr 6438 +VSCALEFSHZrrb_Int 6439 +VSCALEFSHZrrbk_Int 6440 +VSCALEFSHZrrbkz_Int 6441 +VSCALEFSHZrrk 6442 +VSCALEFSHZrrkz 6443 +VSCALEFSSZrm 6444 +VSCALEFSSZrmk 6445 +VSCALEFSSZrmkz 6446 +VSCALEFSSZrr 6447 +VSCALEFSSZrrb_Int 6448 +VSCALEFSSZrrbk_Int 6449 +VSCALEFSSZrrbkz_Int 6450 +VSCALEFSSZrrk 6451 +VSCALEFSSZrrkz 6452 +VSCATTERDPDZ 6453 +VSCATTERDPDZmr 6454 +VSCATTERDPSZ 6455 +VSCATTERDPSZmr 6456 +VSCATTERPF 6457 +VSCATTERQPDZ 6458 +VSCATTERQPDZmr 6459 +VSCATTERQPSZ 6460 +VSCATTERQPSZmr 6461 +VSHA 6462 +VSHUFF 6463 +VSHUFI 6464 +VSHUFPDYrmi 6465 +VSHUFPDYrri 6466 +VSHUFPDZ 6467 +VSHUFPDZrmbi 6468 +VSHUFPDZrmbik 6469 +VSHUFPDZrmbikz 6470 +VSHUFPDZrmi 6471 +VSHUFPDZrmik 6472 +VSHUFPDZrmikz 6473 +VSHUFPDZrri 6474 +VSHUFPDZrrik 6475 +VSHUFPDZrrikz 6476 +VSHUFPDrmi 6477 +VSHUFPDrri 6478 +VSHUFPSYrmi 6479 +VSHUFPSYrri 6480 +VSHUFPSZ 6481 +VSHUFPSZrmbi 6482 +VSHUFPSZrmbik 6483 +VSHUFPSZrmbikz 6484 +VSHUFPSZrmi 6485 +VSHUFPSZrmik 6486 +VSHUFPSZrmikz 6487 +VSHUFPSZrri 6488 +VSHUFPSZrrik 6489 +VSHUFPSZrrikz 6490 +VSHUFPSrmi 6491 +VSHUFPSrri 6492 +VSM 6493 +VSQRTBF 6494 +VSQRTPDYm 6495 +VSQRTPDYr 6496 +VSQRTPDZ 6497 +VSQRTPDZm 6498 +VSQRTPDZmb 6499 +VSQRTPDZmbk 6500 +VSQRTPDZmbkz 6501 +VSQRTPDZmk 6502 +VSQRTPDZmkz 6503 +VSQRTPDZr 6504 +VSQRTPDZrb 6505 +VSQRTPDZrbk 6506 +VSQRTPDZrbkz 6507 +VSQRTPDZrk 6508 +VSQRTPDZrkz 6509 +VSQRTPDm 6510 +VSQRTPDr 6511 +VSQRTPHZ 6512 +VSQRTPHZm 6513 +VSQRTPHZmb 6514 +VSQRTPHZmbk 6515 +VSQRTPHZmbkz 6516 +VSQRTPHZmk 6517 +VSQRTPHZmkz 6518 +VSQRTPHZr 6519 +VSQRTPHZrb 6520 +VSQRTPHZrbk 6521 +VSQRTPHZrbkz 6522 +VSQRTPHZrk 6523 +VSQRTPHZrkz 6524 +VSQRTPSYm 6525 +VSQRTPSYr 6526 +VSQRTPSZ 6527 +VSQRTPSZm 6528 +VSQRTPSZmb 6529 +VSQRTPSZmbk 6530 +VSQRTPSZmbkz 6531 +VSQRTPSZmk 6532 +VSQRTPSZmkz 6533 +VSQRTPSZr 6534 +VSQRTPSZrb 6535 +VSQRTPSZrbk 6536 +VSQRTPSZrbkz 6537 +VSQRTPSZrk 6538 +VSQRTPSZrkz 6539 +VSQRTPSm 6540 +VSQRTPSr 6541 +VSQRTSDZm 6542 +VSQRTSDZm_Int 6543 +VSQRTSDZmk_Int 
6544 +VSQRTSDZmkz_Int 6545 +VSQRTSDZr 6546 +VSQRTSDZr_Int 6547 +VSQRTSDZrb_Int 6548 +VSQRTSDZrbk_Int 6549 +VSQRTSDZrbkz_Int 6550 +VSQRTSDZrk_Int 6551 +VSQRTSDZrkz_Int 6552 +VSQRTSDm 6553 +VSQRTSDm_Int 6554 +VSQRTSDr 6555 +VSQRTSDr_Int 6556 +VSQRTSHZm 6557 +VSQRTSHZm_Int 6558 +VSQRTSHZmk_Int 6559 +VSQRTSHZmkz_Int 6560 +VSQRTSHZr 6561 +VSQRTSHZr_Int 6562 +VSQRTSHZrb_Int 6563 +VSQRTSHZrbk_Int 6564 +VSQRTSHZrbkz_Int 6565 +VSQRTSHZrk_Int 6566 +VSQRTSHZrkz_Int 6567 +VSQRTSSZm 6568 +VSQRTSSZm_Int 6569 +VSQRTSSZmk_Int 6570 +VSQRTSSZmkz_Int 6571 +VSQRTSSZr 6572 +VSQRTSSZr_Int 6573 +VSQRTSSZrb_Int 6574 +VSQRTSSZrbk_Int 6575 +VSQRTSSZrbkz_Int 6576 +VSQRTSSZrk_Int 6577 +VSQRTSSZrkz_Int 6578 +VSQRTSSm 6579 +VSQRTSSm_Int 6580 +VSQRTSSr 6581 +VSQRTSSr_Int 6582 +VSTMXCSR 6583 +VSUBBF 6584 +VSUBPDYrm 6585 +VSUBPDYrr 6586 +VSUBPDZ 6587 +VSUBPDZrm 6588 +VSUBPDZrmb 6589 +VSUBPDZrmbk 6590 +VSUBPDZrmbkz 6591 +VSUBPDZrmk 6592 +VSUBPDZrmkz 6593 +VSUBPDZrr 6594 +VSUBPDZrrb 6595 +VSUBPDZrrbk 6596 +VSUBPDZrrbkz 6597 +VSUBPDZrrk 6598 +VSUBPDZrrkz 6599 +VSUBPDrm 6600 +VSUBPDrr 6601 +VSUBPHZ 6602 +VSUBPHZrm 6603 +VSUBPHZrmb 6604 +VSUBPHZrmbk 6605 +VSUBPHZrmbkz 6606 +VSUBPHZrmk 6607 +VSUBPHZrmkz 6608 +VSUBPHZrr 6609 +VSUBPHZrrb 6610 +VSUBPHZrrbk 6611 +VSUBPHZrrbkz 6612 +VSUBPHZrrk 6613 +VSUBPHZrrkz 6614 +VSUBPSYrm 6615 +VSUBPSYrr 6616 +VSUBPSZ 6617 +VSUBPSZrm 6618 +VSUBPSZrmb 6619 +VSUBPSZrmbk 6620 +VSUBPSZrmbkz 6621 +VSUBPSZrmk 6622 +VSUBPSZrmkz 6623 +VSUBPSZrr 6624 +VSUBPSZrrb 6625 +VSUBPSZrrbk 6626 +VSUBPSZrrbkz 6627 +VSUBPSZrrk 6628 +VSUBPSZrrkz 6629 +VSUBPSrm 6630 +VSUBPSrr 6631 +VSUBSDZrm 6632 +VSUBSDZrm_Int 6633 +VSUBSDZrmk_Int 6634 +VSUBSDZrmkz_Int 6635 +VSUBSDZrr 6636 +VSUBSDZrr_Int 6637 +VSUBSDZrrb_Int 6638 +VSUBSDZrrbk_Int 6639 +VSUBSDZrrbkz_Int 6640 +VSUBSDZrrk_Int 6641 +VSUBSDZrrkz_Int 6642 +VSUBSDrm 6643 +VSUBSDrm_Int 6644 +VSUBSDrr 6645 +VSUBSDrr_Int 6646 +VSUBSHZrm 6647 +VSUBSHZrm_Int 6648 +VSUBSHZrmk_Int 6649 +VSUBSHZrmkz_Int 6650 +VSUBSHZrr 6651 +VSUBSHZrr_Int 6652 
+VSUBSHZrrb_Int 6653 +VSUBSHZrrbk_Int 6654 +VSUBSHZrrbkz_Int 6655 +VSUBSHZrrk_Int 6656 +VSUBSHZrrkz_Int 6657 +VSUBSSZrm 6658 +VSUBSSZrm_Int 6659 +VSUBSSZrmk_Int 6660 +VSUBSSZrmkz_Int 6661 +VSUBSSZrr 6662 +VSUBSSZrr_Int 6663 +VSUBSSZrrb_Int 6664 +VSUBSSZrrbk_Int 6665 +VSUBSSZrrbkz_Int 6666 +VSUBSSZrrk_Int 6667 +VSUBSSZrrkz_Int 6668 +VSUBSSrm 6669 +VSUBSSrm_Int 6670 +VSUBSSrr 6671 +VSUBSSrr_Int 6672 +VTESTPDYrm 6673 +VTESTPDYrr 6674 +VTESTPDrm 6675 +VTESTPDrr 6676 +VTESTPSYrm 6677 +VTESTPSYrr 6678 +VTESTPSrm 6679 +VTESTPSrr 6680 +VUCOMISDZrm 6681 +VUCOMISDZrm_Int 6682 +VUCOMISDZrr 6683 +VUCOMISDZrr_Int 6684 +VUCOMISDZrrb 6685 +VUCOMISDrm 6686 +VUCOMISDrm_Int 6687 +VUCOMISDrr 6688 +VUCOMISDrr_Int 6689 +VUCOMISHZrm 6690 +VUCOMISHZrm_Int 6691 +VUCOMISHZrr 6692 +VUCOMISHZrr_Int 6693 +VUCOMISHZrrb 6694 +VUCOMISSZrm 6695 +VUCOMISSZrm_Int 6696 +VUCOMISSZrr 6697 +VUCOMISSZrr_Int 6698 +VUCOMISSZrrb 6699 +VUCOMISSrm 6700 +VUCOMISSrm_Int 6701 +VUCOMISSrr 6702 +VUCOMISSrr_Int 6703 +VUCOMXSDZrm 6704 +VUCOMXSDZrm_Int 6705 +VUCOMXSDZrr 6706 +VUCOMXSDZrr_Int 6707 +VUCOMXSDZrrb_Int 6708 +VUCOMXSHZrm 6709 +VUCOMXSHZrm_Int 6710 +VUCOMXSHZrr 6711 +VUCOMXSHZrr_Int 6712 +VUCOMXSHZrrb_Int 6713 +VUCOMXSSZrm 6714 +VUCOMXSSZrm_Int 6715 +VUCOMXSSZrr 6716 +VUCOMXSSZrr_Int 6717 +VUCOMXSSZrrb_Int 6718 +VUNPCKHPDYrm 6719 +VUNPCKHPDYrr 6720 +VUNPCKHPDZ 6721 +VUNPCKHPDZrm 6722 +VUNPCKHPDZrmb 6723 +VUNPCKHPDZrmbk 6724 +VUNPCKHPDZrmbkz 6725 +VUNPCKHPDZrmk 6726 +VUNPCKHPDZrmkz 6727 +VUNPCKHPDZrr 6728 +VUNPCKHPDZrrk 6729 +VUNPCKHPDZrrkz 6730 +VUNPCKHPDrm 6731 +VUNPCKHPDrr 6732 +VUNPCKHPSYrm 6733 +VUNPCKHPSYrr 6734 +VUNPCKHPSZ 6735 +VUNPCKHPSZrm 6736 +VUNPCKHPSZrmb 6737 +VUNPCKHPSZrmbk 6738 +VUNPCKHPSZrmbkz 6739 +VUNPCKHPSZrmk 6740 +VUNPCKHPSZrmkz 6741 +VUNPCKHPSZrr 6742 +VUNPCKHPSZrrk 6743 +VUNPCKHPSZrrkz 6744 +VUNPCKHPSrm 6745 +VUNPCKHPSrr 6746 +VUNPCKLPDYrm 6747 +VUNPCKLPDYrr 6748 +VUNPCKLPDZ 6749 +VUNPCKLPDZrm 6750 +VUNPCKLPDZrmb 6751 +VUNPCKLPDZrmbk 6752 +VUNPCKLPDZrmbkz 6753 +VUNPCKLPDZrmk 6754 
+VUNPCKLPDZrmkz 6755 +VUNPCKLPDZrr 6756 +VUNPCKLPDZrrk 6757 +VUNPCKLPDZrrkz 6758 +VUNPCKLPDrm 6759 +VUNPCKLPDrr 6760 +VUNPCKLPSYrm 6761 +VUNPCKLPSYrr 6762 +VUNPCKLPSZ 6763 +VUNPCKLPSZrm 6764 +VUNPCKLPSZrmb 6765 +VUNPCKLPSZrmbk 6766 +VUNPCKLPSZrmbkz 6767 +VUNPCKLPSZrmk 6768 +VUNPCKLPSZrmkz 6769 +VUNPCKLPSZrr 6770 +VUNPCKLPSZrrk 6771 +VUNPCKLPSZrrkz 6772 +VUNPCKLPSrm 6773 +VUNPCKLPSrr 6774 +VXORPDYrm 6775 +VXORPDYrr 6776 +VXORPDZ 6777 +VXORPDZrm 6778 +VXORPDZrmb 6779 +VXORPDZrmbk 6780 +VXORPDZrmbkz 6781 +VXORPDZrmk 6782 +VXORPDZrmkz 6783 +VXORPDZrr 6784 +VXORPDZrrk 6785 +VXORPDZrrkz 6786 +VXORPDrm 6787 +VXORPDrr 6788 +VXORPSYrm 6789 +VXORPSYrr 6790 +VXORPSZ 6791 +VXORPSZrm 6792 +VXORPSZrmb 6793 +VXORPSZrmbk 6794 +VXORPSZrmbkz 6795 +VXORPSZrmk 6796 +VXORPSZrmkz 6797 +VXORPSZrr 6798 +VXORPSZrrk 6799 +VXORPSZrrkz 6800 +VXORPSrm 6801 +VXORPSrr 6802 +VZEROALL 6803 +VZEROUPPER 6804 +V_SET 6805 +V_SETALLONES 6806 +WAIT 6807 +WBINVD 6808 +WBNOINVD 6809 +WRFLAGS 6810 +WRFSBASE 6811 +WRGSBASE 6812 +WRMSR 6813 +WRMSRLIST 6814 +WRMSRNS 6815 +WRMSRNSir 6816 +WRMSRNSir_EVEX 6817 +WRPKRUr 6818 +WRSSD 6819 +WRSSD_EVEX 6820 +WRSSQ 6821 +WRSSQ_EVEX 6822 +WRUSSD 6823 +WRUSSD_EVEX 6824 +WRUSSQ 6825 +WRUSSQ_EVEX 6826 +XABORT 6827 +XABORT_DEF 6828 +XACQUIRE_PREFIX 6829 +XADD 6830 +XAM_F 6831 +XAM_Fp 6832 +XBEGIN 6833 +XCHG 6834 +XCH_F 6835 +XCRYPTCBC 6836 +XCRYPTCFB 6837 +XCRYPTCTR 6838 +XCRYPTECB 6839 +XCRYPTOFB 6840 +XEND 6841 +XGETBV 6842 +XLAT 6843 +XOR 6844 +XORPDrm 6845 +XORPDrr 6846 +XORPSrm 6847 +XORPSrr 6848 +XRELEASE_PREFIX 6849 +XRESLDTRK 6850 +XRSTOR 6851 +XRSTORS 6852 +XSAVE 6853 +XSAVEC 6854 +XSAVEOPT 6855 +XSAVES 6856 +XSETBV 6857 +XSHA 6858 +XSTORE 6859 +XSUSLDTRK 6860 +XTEST 6861 +Immediate 6862 +CImmediate 6863 +FPImmediate 6864 +MBB 6865 +FrameIndex 6866 +ConstantPoolIndex 6867 +TargetIndex 6868 +JumpTableIndex 6869 +ExternalSymbol 6870 +GlobalAddress 6871 +BlockAddress 6872 +RegisterMask 6873 +RegisterLiveOut 6874 +Metadata 6875 +MCSymbol 6876 +CFIIndex 6877 
+IntrinsicID 6878 +Predicate 6879 +ShuffleMask 6880 +PhyReg_GR8 6881 +PhyReg_GRH8 6882 +PhyReg_GR8_NOREX2 6883 +PhyReg_GR8_NOREX 6884 +PhyReg_GR8_ABCD_H 6885 +PhyReg_GR8_ABCD_L 6886 +PhyReg_GRH16 6887 +PhyReg_GR16 6888 +PhyReg_GR16_NOREX2 6889 +PhyReg_GR16_NOREX 6890 +PhyReg_VK1 6891 +PhyReg_VK16 6892 +PhyReg_VK2 6893 +PhyReg_VK4 6894 +PhyReg_VK8 6895 +PhyReg_VK16WM 6896 +PhyReg_VK1WM 6897 +PhyReg_VK2WM 6898 +PhyReg_VK4WM 6899 +PhyReg_VK8WM 6900 +PhyReg_SEGMENT_REG 6901 +PhyReg_GR16_ABCD 6902 +PhyReg_FPCCR 6903 +PhyReg_FR16X 6904 +PhyReg_FR16 6905 +PhyReg_VK16PAIR 6906 +PhyReg_VK1PAIR 6907 +PhyReg_VK2PAIR 6908 +PhyReg_VK4PAIR 6909 +PhyReg_VK8PAIR 6910 +PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM 6911 +PhyReg_LOW32_ADDR_ACCESS_RBP 6912 +PhyReg_LOW32_ADDR_ACCESS 6913 +PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 6914 +PhyReg_FR32X 6915 +PhyReg_GR32 6916 +PhyReg_GR32_NOSP 6917 +PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 6918 +PhyReg_DEBUG_REG 6919 +PhyReg_FR32 6920 +PhyReg_GR32_NOREX2 6921 +PhyReg_GR32_NOREX2_NOSP 6922 +PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 6923 +PhyReg_GR32_NOREX 6924 +PhyReg_VK32 6925 +PhyReg_GR32_NOREX_NOSP 6926 +PhyReg_RFP32 6927 +PhyReg_VK32WM 6928 +PhyReg_GR32_ABCD 6929 +PhyReg_GR32_TC 6930 +PhyReg_GR32_ABCD_and_GR32_TC 6931 +PhyReg_GR32_AD 6932 +PhyReg_GR32_ArgRef 6933 +PhyReg_GR32_BPSP 6934 +PhyReg_GR32_BSI 6935 +PhyReg_GR32_CB 6936 +PhyReg_GR32_DC 6937 +PhyReg_GR32_DIBP 6938 +PhyReg_GR32_SIDI 6939 +PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 6940 +PhyReg_CCR 6941 +PhyReg_DFCCR 6942 +PhyReg_GR32_ABCD_and_GR32_BSI 6943 +PhyReg_GR32_AD_and_GR32_ArgRef 6944 +PhyReg_GR32_ArgRef_and_GR32_CB 6945 +PhyReg_GR32_BPSP_and_GR32_DIBP 6946 +PhyReg_GR32_BPSP_and_GR32_TC 6947 +PhyReg_GR32_BSI_and_GR32_SIDI 6948 +PhyReg_GR32_DIBP_and_GR32_SIDI 6949 +PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 6950 +PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit 6951 +PhyReg_RFP64 6952 +PhyReg_GR64 6953 +PhyReg_FR64X 6954 
+PhyReg_GR64_with_sub_8bit 6955 +PhyReg_GR64_NOSP 6956 +PhyReg_GR64_NOREX2 6957 +PhyReg_CONTROL_REG 6958 +PhyReg_FR64 6959 +PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2 6960 +PhyReg_GR64_NOREX2_NOSP 6961 +PhyReg_GR64PLTSafe 6962 +PhyReg_GR64_TC 6963 +PhyReg_GR64_NOREX 6964 +PhyReg_GR64_TCW64 6965 +PhyReg_GR64_TC_with_sub_8bit 6966 +PhyReg_GR64_NOREX2_NOSP_and_GR64_TC 6967 +PhyReg_GR64_TCW64_with_sub_8bit 6968 +PhyReg_GR64_TC_and_GR64_TCW64 6969 +PhyReg_GR64_with_sub_16bit_in_GR16_NOREX 6970 +PhyReg_VK64 6971 +PhyReg_VR64 6972 +PhyReg_GR64PLTSafe_and_GR64_TC 6973 +PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64 6974 +PhyReg_GR64_NOREX_NOSP 6975 +PhyReg_GR64_NOREX_and_GR64_TC 6976 +PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 6977 +PhyReg_VK64WM 6978 +PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 6979 +PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 6980 +PhyReg_GR64PLTSafe_and_GR64_TCW64 6981 +PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 6982 +PhyReg_GR64_NOREX_and_GR64_TCW64 6983 +PhyReg_GR64_ABCD 6984 +PhyReg_GR64_with_sub_32bit_in_GR32_TC 6985 +PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 6986 +PhyReg_GR64_AD 6987 +PhyReg_GR64_ArgRef 6988 +PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP 6989 +PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef 6990 +PhyReg_GR64_with_sub_32bit_in_GR32_BPSP 6991 +PhyReg_GR64_with_sub_32bit_in_GR32_BSI 6992 +PhyReg_GR64_with_sub_32bit_in_GR32_CB 6993 +PhyReg_GR64_with_sub_32bit_in_GR32_DIBP 6994 +PhyReg_GR64_with_sub_32bit_in_GR32_SIDI 6995 +PhyReg_GR64_A 6996 +PhyReg_GR64_ArgRef_and_GR64_TC 6997 +PhyReg_GR64_and_LOW32_ADDR_ACCESS 6998 +PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 6999 +PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7000 +PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7001 +PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7002 +PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7003 +PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7004 +PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7005 
+PhyReg_RST 7006 +PhyReg_RFP80 7007 +PhyReg_RFP80_7 7008 +PhyReg_VR128X 7009 +PhyReg_VR128 7010 +PhyReg_VR256X 7011 +PhyReg_VR256 7012 +PhyReg_VR512 7013 +PhyReg_VR512_0_15 7014 +PhyReg_TILE 7015 +VirtReg_GR8 7016 +VirtReg_GRH8 7017 +VirtReg_GR8_NOREX2 7018 +VirtReg_GR8_NOREX 7019 +VirtReg_GR8_ABCD_H 7020 +VirtReg_GR8_ABCD_L 7021 +VirtReg_GRH16 7022 +VirtReg_GR16 7023 +VirtReg_GR16_NOREX2 7024 +VirtReg_GR16_NOREX 7025 +VirtReg_VK1 7026 +VirtReg_VK16 7027 +VirtReg_VK2 7028 +VirtReg_VK4 7029 +VirtReg_VK8 7030 +VirtReg_VK16WM 7031 +VirtReg_VK1WM 7032 +VirtReg_VK2WM 7033 +VirtReg_VK4WM 7034 +VirtReg_VK8WM 7035 +VirtReg_SEGMENT_REG 7036 +VirtReg_GR16_ABCD 7037 +VirtReg_FPCCR 7038 +VirtReg_FR16X 7039 +VirtReg_FR16 7040 +VirtReg_VK16PAIR 7041 +VirtReg_VK1PAIR 7042 +VirtReg_VK2PAIR 7043 +VirtReg_VK4PAIR 7044 +VirtReg_VK8PAIR 7045 +VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM 7046 +VirtReg_LOW32_ADDR_ACCESS_RBP 7047 +VirtReg_LOW32_ADDR_ACCESS 7048 +VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 7049 +VirtReg_FR32X 7050 +VirtReg_GR32 7051 +VirtReg_GR32_NOSP 7052 +VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 7053 +VirtReg_DEBUG_REG 7054 +VirtReg_FR32 7055 +VirtReg_GR32_NOREX2 7056 +VirtReg_GR32_NOREX2_NOSP 7057 +VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 7058 +VirtReg_GR32_NOREX 7059 +VirtReg_VK32 7060 +VirtReg_GR32_NOREX_NOSP 7061 +VirtReg_RFP32 7062 +VirtReg_VK32WM 7063 +VirtReg_GR32_ABCD 7064 +VirtReg_GR32_TC 7065 +VirtReg_GR32_ABCD_and_GR32_TC 7066 +VirtReg_GR32_AD 7067 +VirtReg_GR32_ArgRef 7068 +VirtReg_GR32_BPSP 7069 +VirtReg_GR32_BSI 7070 +VirtReg_GR32_CB 7071 +VirtReg_GR32_DC 7072 +VirtReg_GR32_DIBP 7073 +VirtReg_GR32_SIDI 7074 +VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 7075 +VirtReg_CCR 7076 +VirtReg_DFCCR 7077 +VirtReg_GR32_ABCD_and_GR32_BSI 7078 +VirtReg_GR32_AD_and_GR32_ArgRef 7079 +VirtReg_GR32_ArgRef_and_GR32_CB 7080 +VirtReg_GR32_BPSP_and_GR32_DIBP 7081 +VirtReg_GR32_BPSP_and_GR32_TC 7082 +VirtReg_GR32_BSI_and_GR32_SIDI 7083 
+VirtReg_GR32_DIBP_and_GR32_SIDI 7084 +VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 7085 +VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit 7086 +VirtReg_RFP64 7087 +VirtReg_GR64 7088 +VirtReg_FR64X 7089 +VirtReg_GR64_with_sub_8bit 7090 +VirtReg_GR64_NOSP 7091 +VirtReg_GR64_NOREX2 7092 +VirtReg_CONTROL_REG 7093 +VirtReg_FR64 7094 +VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2 7095 +VirtReg_GR64_NOREX2_NOSP 7096 +VirtReg_GR64PLTSafe 7097 +VirtReg_GR64_TC 7098 +VirtReg_GR64_NOREX 7099 +VirtReg_GR64_TCW64 7100 +VirtReg_GR64_TC_with_sub_8bit 7101 +VirtReg_GR64_NOREX2_NOSP_and_GR64_TC 7102 +VirtReg_GR64_TCW64_with_sub_8bit 7103 +VirtReg_GR64_TC_and_GR64_TCW64 7104 +VirtReg_GR64_with_sub_16bit_in_GR16_NOREX 7105 +VirtReg_VK64 7106 +VirtReg_VR64 7107 +VirtReg_GR64PLTSafe_and_GR64_TC 7108 +VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64 7109 +VirtReg_GR64_NOREX_NOSP 7110 +VirtReg_GR64_NOREX_and_GR64_TC 7111 +VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 7112 +VirtReg_VK64WM 7113 +VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 7114 +VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 7115 +VirtReg_GR64PLTSafe_and_GR64_TCW64 7116 +VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 7117 +VirtReg_GR64_NOREX_and_GR64_TCW64 7118 +VirtReg_GR64_ABCD 7119 +VirtReg_GR64_with_sub_32bit_in_GR32_TC 7120 +VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 7121 +VirtReg_GR64_AD 7122 +VirtReg_GR64_ArgRef 7123 +VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP 7124 +VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef 7125 +VirtReg_GR64_with_sub_32bit_in_GR32_BPSP 7126 +VirtReg_GR64_with_sub_32bit_in_GR32_BSI 7127 +VirtReg_GR64_with_sub_32bit_in_GR32_CB 7128 +VirtReg_GR64_with_sub_32bit_in_GR32_DIBP 7129 +VirtReg_GR64_with_sub_32bit_in_GR32_SIDI 7130 +VirtReg_GR64_A 7131 +VirtReg_GR64_ArgRef_and_GR64_TC 7132 +VirtReg_GR64_and_LOW32_ADDR_ACCESS 7133 +VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 7134 +VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7135 
+VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7136 +VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7137 +VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7138 +VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7139 +VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7140 +VirtReg_RST 7141 +VirtReg_RFP80 7142 +VirtReg_RFP80_7 7143 +VirtReg_VR128X 7144 +VirtReg_VR128 7145 +VirtReg_VR256X 7146 +VirtReg_VR256 7147 +VirtReg_VR512 7148 +VirtReg_VR512_0_15 7149 +VirtReg_TILE 7150 diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp index ff87e7b6a1018..235a53dcc156e 100644 --- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp +++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp @@ -1113,7 +1113,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondSpills) { // Create a stack location and ensure it's tracked. SpillLoc SL = {getRegByName("RSP"), StackOffset::getFixed(-8)}; SpillLocationNo SpillNo = *MTracker->getOrTrackSpillLoc(SL); - ASSERT_EQ(MTracker->getNumLocs(), 13u); // Tracks all possible stack locs. + ASSERT_EQ(MTracker->getNumLocs(), 11u); // Tracks all possible stack locs. // Locations are: RSP, stack slots from 2^3 bits wide up to 2^9 for zmm regs, // then slots for sub_8bit_hi and sub_16bit_hi ({8, 8} and {16, 16}). // Finally, one for spilt fp80 registers. 
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp index a006888a2352c..44b76ae7e8487 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.cpp +++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp @@ -1141,7 +1141,6 @@ OperandType RecognizableInstr::typeFromString(StringRef Str, bool hasREX_W, .Case("vz64mem", TYPE_MVSIBZ) .Case("BNDR", TYPE_BNDR) .Case("TILE", TYPE_TMM) - .Case("TILEPair", TYPE_TMM_PAIR) .Default(TYPE_NONE); // clang-format on From ba545840aefe8edd89d3d559111f922e5b11f387 Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Fri, 31 Oct 2025 17:20:49 +0530 Subject: [PATCH 331/539] [MLIR][NVVM] Update mbarrier.init/inval Ops to use AnyTypeOf[] (#165558) This patch updates the mbarrier.init/inval Ops to use the AnyTypeOf[] construct for their `addr` argument. This enables us to have a single Op that can take a pointer in either generic or shared memory space and generate the right intrinsics during the lowering. * Updated existing tests accordingly. * Verified locally that there are no new regressions in `integration` tests. * TODO: Additional updates for the remaining mbarrier Ops are in progress. These will be refactored in subsequent patches. 
Signed-off-by: Durgadoss R --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 4 +- flang/test/Lower/CUDA/cuda-device-proc.cuf | 2 +- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 71 ++++++++----------- .../Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp | 9 +-- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 43 +++++++++++ .../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir | 8 +-- .../Conversion/NVVMToLLVM/nvvm-to-llvm.mlir | 2 +- mlir/test/Dialect/LLVMIR/nvvm.mlir | 8 +-- 8 files changed, 85 insertions(+), 62 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index ca3e1cd46db7d..15ea84565dd75 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -3359,8 +3359,8 @@ void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef args) { assert(args.size() == 2); mlir::Value barrier = convertPtrToNVVMSpace( builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); - mlir::NVVM::MBarrierInitSharedOp::create(builder, loc, barrier, - fir::getBase(args[1]), {}); + mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier, + fir::getBase(args[1]), {}); auto kind = mlir::NVVM::ProxyKindAttr::get( builder.getContext(), mlir::NVVM::ProxyKind::async_shared); auto space = mlir::NVVM::SharedSpaceAttr::get( diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index e5d3c437d7152..09b4302446ee7 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -431,7 +431,7 @@ end subroutine ! CHECK: %[[COUNT:.*]] = arith.constant 256 : i32 ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: nvvm.mbarrier.init.shared %[[SHARED_PTR]], %[[COUNT]] : !llvm.ptr<3>, i32 +! CHECK: nvvm.mbarrier.init %[[SHARED_PTR]], %[[COUNT]] : !llvm.ptr<3>, i32 ! 
CHECK: nvvm.fence.proxy {kind = #nvvm.proxy_kind, space = #nvvm.shared_space} ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref) -> !llvm.ptr diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 4f483859ac18d..b572ef9c1d07b 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -579,7 +579,8 @@ def NVVM_PMEventOp : NVVM_PTXBuilder_Op<"pmevent">, /// mbarrier.init instruction with generic pointer type def NVVM_MBarrierInitOp : NVVM_PTXBuilder_Op<"mbarrier.init">, - Arguments<(ins LLVM_AnyPointer:$addr, I32:$count, PtxPredicate:$predicate)> { + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr, + I32:$count, PtxPredicate:$predicate)> { let summary = "MBarrier Initialization Op"; let description = [{ The `nvvm.mbarrier.init` operation initializes an *mbarrier object* at the specified @@ -592,48 +593,35 @@ def NVVM_MBarrierInitOp : NVVM_PTXBuilder_Op<"mbarrier.init">, - Transaction count (tx-count): 0 The operation takes the following operands: - - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic - addressing, but the address must still be in the shared memory space. + - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` + must be a pointer to generic or shared::cta memory. When it is generic, the + underlying address must be within the shared::cta memory space; otherwise + the behavior is undefined. - `count`: Integer specifying the number of threads that will participate in barrier synchronization. Must be in the range [1, 2²⁰ - 1]. - `predicate`: Optional predicate for conditional execution. 
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init) }]; - string llvmBuilder = [{ - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_init, {$addr, $count}); - }]; let assemblyFormat = "$addr `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)"; + let extraClassDeclaration = [{ bool hasIntrinsic() { if(getPredicate()) return false; return true; } - }]; - let extraClassDefinition = [{ - std::string $cppClass::getPtx() { return std::string("mbarrier.init.b64 [%0], %1;"); } - }]; -} -/// mbarrier.init instruction with shared pointer type -def NVVM_MBarrierInitSharedOp : NVVM_PTXBuilder_Op<"mbarrier.init.shared", [NVVMRequiresSM<80>, DeclareOpInterfaceMethods]>, - Arguments<(ins LLVM_PointerShared:$addr, I32:$count, PtxPredicate:$predicate)> { - let summary = "Shared MBarrier Initialization Op"; - let description = [{ - This Op is the same as `nvvm.mbarrier.init` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. - - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init) + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; + string llvmBuilder = [{ - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_init_shared, {$addr, $count}); - }]; - let assemblyFormat = "$addr `,` $count (`,` `predicate` `=` $predicate^)? 
attr-dict `:` type(operands)"; - let extraClassDeclaration = "bool hasIntrinsic() { return !getPredicate(); }"; - let extraClassDefinition = [{ - std::string $cppClass::getPtx() { return std::string("mbarrier.init.shared.b64 [%0], %1;"); } + auto [id, args] = NVVM::MBarrierInitOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); }]; } def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">, - Arguments<(ins LLVM_AnyPointer:$addr)> { + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr)> { let summary = "MBarrier Invalidation Operation"; let description = [{ The `nvvm.mbarrier.inval` operation invalidates an *mbarrier object* at the @@ -644,30 +632,27 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">, It is undefined behavior if the *mbarrier object* is already invalid. The operation takes the following operand: - - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic - addressing, but the address must still be in the shared memory space. + - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` + must be a pointer to generic or shared::cta memory. When it is generic, the + underlying address must be within the shared::cta memory space; otherwise + the behavior is undefined. 
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval) }]; - string llvmBuilder = [{ - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_inval, {$addr}); - }]; - let assemblyFormat = "$addr attr-dict `:` type(operands)"; -} -def NVVM_MBarrierInvalSharedOp : NVVM_Op<"mbarrier.inval.shared">, - Arguments<(ins LLVM_PointerShared:$addr)> { - let summary = "Shared MBarrier Invalidation Operation"; - let description = [{ - This Op is the same as `nvvm.mbarrier.inval` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. + let assemblyFormat = "$addr attr-dict `:` type(operands)"; - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval) + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; + string llvmBuilder = [{ - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_inval_shared, {$addr}); + auto [id, args] = NVVM::MBarrierInvalOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); }]; - let assemblyFormat = "$addr attr-dict `:` type(operands)"; } def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">, diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index a9efada28a320..ec182f1db48ac 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -846,13 +846,8 @@ struct NVGPUMBarrierInitLowering Value barrier = getMbarrierPtr(b, mbarrierType, adaptor.getBarriers(), adaptor.getMbarId(), rewriter); Value count = truncToI32(b, adaptor.getCount()); - if 
(isMbarrierShared(mbarrierType)) { - rewriter.replaceOpWithNewOp( - op, barrier, count, adaptor.getPredicate()); - } else { - rewriter.replaceOpWithNewOp(op, barrier, count, - adaptor.getPredicate()); - } + rewriter.replaceOpWithNewOp(op, barrier, count, + adaptor.getPredicate()); return success(); } }; diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index f0de4dbcc1d4b..53a6f43c0bbcf 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -1607,10 +1607,53 @@ void Tcgen05MmaSmemDescOp::createSmemDescriptor(Operation &op, mt.mapValue(thisOp.getRes()) = smemDesc; } +//===----------------------------------------------------------------------===// +// getPtx methods +//===----------------------------------------------------------------------===// + +std::string NVVM::MBarrierInitOp::getPtx() { + unsigned addressSpace = + llvm::cast(getAddr().getType()).getAddressSpace(); + return (addressSpace == NVVMMemorySpace::Shared) + ? std::string("mbarrier.init.shared.b64 [%0], %1;") + : std::string("mbarrier.init.b64 [%0], %1;"); +} + //===----------------------------------------------------------------------===// // getIntrinsicID/getIntrinsicIDAndArgs methods //===----------------------------------------------------------------------===// +mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast(op); + unsigned addressSpace = + llvm::cast(thisOp.getAddr().getType()) + .getAddressSpace(); + llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared) + ? 
llvm::Intrinsic::nvvm_mbarrier_init_shared + : llvm::Intrinsic::nvvm_mbarrier_init; + + // Fill the Intrinsic Args + llvm::SmallVector args; + args.push_back(mt.lookupValue(thisOp.getAddr())); + args.push_back(mt.lookupValue(thisOp.getCount())); + + return {id, std::move(args)}; +} + +mlir::NVVM::IDArgPair MBarrierInvalOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast(op); + unsigned addressSpace = + llvm::cast(thisOp.getAddr().getType()) + .getAddressSpace(); + llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared) + ? llvm::Intrinsic::nvvm_mbarrier_inval_shared + : llvm::Intrinsic::nvvm_mbarrier_inval; + + return {id, {mt.lookupValue(thisOp.getAddr())}}; +} + #define CP_ASYNC_ID_IMPL(mod, size, suffix) \ llvm::Intrinsic::nvvm_cp_async_##mod##_shared_global_##size##suffix diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 5755ca9258283..8cce6308018e2 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -486,7 +486,7 @@ func.func @mbarrier() { // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.init.shared %[[barPtr]] + // CHECK: nvvm.mbarrier.init %[[barPtr]] nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> @@ -516,7 +516,7 @@ func.func @mbarrier_nocomplete() { // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast 
%[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.init.shared %[[barPtr]] + // CHECK: nvvm.mbarrier.init %[[barPtr]] nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> @@ -592,7 +592,7 @@ func.func @mbarrier_txcount() { // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.init.shared %[[barPtr]] + // CHECK: nvvm.mbarrier.init %[[barPtr]] nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType %tidxreg = nvvm.read.ptx.sreg.tid.x : i32 @@ -643,7 +643,7 @@ func.func @mbarrier_txcount_pred() { // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.init.shared %[[barPtr]], {{.*}}, predicate = %[[P]] + // CHECK: nvvm.mbarrier.init %[[barPtr]], {{.*}}, predicate = %[[P]] nvgpu.mbarrier.init %barrier[%c0], %mine, predicate = %pred : !barrierType %txcount = arith.constant 256 : 
index diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index 6960e83be3573..fbc4c0af60360 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -8,7 +8,7 @@ // CHECK-LABEL: @init_mbarrier llvm.func @init_mbarrier(%barrier_gen : !llvm.ptr, %barrier : !llvm.ptr<3>, %count : i32, %pred : i1) { //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.shared.b64 [$0], $1;", "r,r,b" - nvvm.mbarrier.init.shared %barrier, %count, predicate = %pred : !llvm.ptr<3>, i32, i1 + nvvm.mbarrier.init %barrier, %count, predicate = %pred : !llvm.ptr<3>, i32, i1 //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.b64 [$0], $1;", "l,r,b" nvvm.mbarrier.init %barrier_gen, %count, predicate = %pred : !llvm.ptr, i32, i1 llvm.return diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir index 0243f5eb8c862..2505e56407c2b 100644 --- a/mlir/test/Dialect/LLVMIR/nvvm.mlir +++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir @@ -419,8 +419,8 @@ llvm.func private @mbarrier_init_generic(%barrier: !llvm.ptr) { llvm.func private @mbarrier_init_shared(%barrier: !llvm.ptr<3>) { %count = nvvm.read.ptx.sreg.ntid.x : i32 - // CHECK: nvvm.mbarrier.init.shared %{{.*}}, %{{.*}} : !llvm.ptr<3>, i32 - nvvm.mbarrier.init.shared %barrier, %count : !llvm.ptr<3>, i32 + // CHECK: nvvm.mbarrier.init %{{.*}}, %{{.*}} : !llvm.ptr<3>, i32 + nvvm.mbarrier.init %barrier, %count : !llvm.ptr<3>, i32 llvm.return } @@ -433,8 +433,8 @@ llvm.func private @mbarrier_inval_generic(%barrier: !llvm.ptr) { llvm.func private @mbarrier_inval_shared(%barrier: !llvm.ptr<3>) { - // CHECK: nvvm.mbarrier.inval.shared %{{.*}} : !llvm.ptr<3> - nvvm.mbarrier.inval.shared %barrier : !llvm.ptr<3> + // CHECK: nvvm.mbarrier.inval %{{.*}} : !llvm.ptr<3> + nvvm.mbarrier.inval %barrier : !llvm.ptr<3> llvm.return } From 
91360382698cbe084164b45a3b3e4f2b17129d8a Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Fri, 31 Oct 2025 12:56:37 +0100 Subject: [PATCH 332/539] [libc++] Simplify the implementation of destroy_at a bit (#165392) --- libcxx/include/__memory/construct_at.h | 31 +++++++++----------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/libcxx/include/__memory/construct_at.h b/libcxx/include/__memory/construct_at.h index 658269158d945..5378c03abab3a 100644 --- a/libcxx/include/__memory/construct_at.h +++ b/libcxx/include/__memory/construct_at.h @@ -14,7 +14,6 @@ #include <__config> #include <__memory/addressof.h> #include <__new/placement_new_delete.h> -#include <__type_traits/enable_if.h> #include <__type_traits/is_array.h> #include <__utility/declval.h> #include <__utility/forward.h> @@ -55,35 +54,25 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp* __construct_at(_Tp* __l // The internal functions are available regardless of the language version (with the exception of the `__destroy_at` // taking an array). 
-template ::value, int> = 0> +template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __destroy_at(_Tp* __loc) { _LIBCPP_ASSERT_NON_NULL(__loc != nullptr, "null pointer given to destroy_at"); - __loc->~_Tp(); -} - #if _LIBCPP_STD_VER >= 20 -template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI constexpr void __destroy_at(_Tp* __loc) { - _LIBCPP_ASSERT_NON_NULL(__loc != nullptr, "null pointer given to destroy_at"); - for (auto&& __val : *__loc) - std::__destroy_at(std::addressof(__val)); -} + if constexpr (is_array_v<_Tp>) { + for (auto&& __val : *__loc) + std::__destroy_at(std::addressof(__val)); + } else #endif + { + __loc->~_Tp(); + } +} #if _LIBCPP_STD_VER >= 17 - -template , int> = 0> +template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void destroy_at(_Tp* _LIBCPP_DIAGNOSE_NULLPTR __loc) { std::__destroy_at(__loc); } - -# if _LIBCPP_STD_VER >= 20 -template , int> = 0> -_LIBCPP_HIDE_FROM_ABI constexpr void destroy_at(_Tp* _LIBCPP_DIAGNOSE_NULLPTR __loc) { - std::__destroy_at(__loc); -} -# endif - #endif // _LIBCPP_STD_VER >= 17 _LIBCPP_END_NAMESPACE_STD From e78b085bbe66e9a2651cf99576c97dfb8de039f9 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 31 Oct 2025 12:03:56 +0000 Subject: [PATCH 333/539] [gn build] Port 5322fb626820 --- llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index 3c523aeada6cb..03e5294b03860 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -132,18 +132,12 @@ copy("Headers") { "amdgpuintrin.h", "ammintrin.h", "amxavx512intrin.h", - "amxbf16transposeintrin.h", "amxcomplexintrin.h", - "amxcomplextransposeintrin.h", "amxfp16intrin.h", - "amxfp16transposeintrin.h", "amxfp8intrin.h", "amxintrin.h", "amxmovrsintrin.h", - "amxmovrstransposeintrin.h", "amxtf32intrin.h", - 
"amxtf32transposeintrin.h", - "amxtransposeintrin.h", "andes_vector.h", "arm64intr.h", "arm_acle.h", From ff7741c5a08775c65a3aac38557bc0aad9dbc739 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 31 Oct 2025 12:12:56 +0000 Subject: [PATCH 334/539] [X86] narrowBitOpRMW - add handling for single bit insertion patterns (#165742) Insertion of a single bit into a large integer is typically canonicalized to "(X & ~(1 << ShAmt)) | (InsertBit << ShAmt)", which can be simplified to modify the i32 block as a BTR followed by an OR((i32)InsertBit << (ShAmt % 32). We must ensure that the InsertBit is zero apart from the LSB so we can cheaply truncate it to work with the i32 block like the simpler BT patterns. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 43 +- llvm/test/CodeGen/X86/bittest-big-integer.ll | 941 ++----------------- 2 files changed, 116 insertions(+), 868 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1ce419ba00824..fd01363bed709 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53345,7 +53345,8 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, } // Look for a RMW operation that only touches one bit of a larger than legal -// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value. +// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single +// i32 sub value. 
static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -53371,14 +53372,20 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, // BTR: X & ~(1 << ShAmt) // BTS: X | (1 << ShAmt) // BTC: X ^ (1 << ShAmt) - SDValue ShAmt; + // + // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) + SDValue InsertBit, ShAmt; if (!StoredVal.hasOneUse() || !(sd_match(StoredVal, m_And(m_Specific(LoadVal), m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || sd_match(StoredVal, m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || sd_match(StoredVal, - m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))))) + m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match(StoredVal, + m_Or(m_And(m_Specific(LoadVal), + m_Not(m_Shl(m_One(), m_Value(ShAmt)))), + m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) return SDValue(); // Ensure the shift amount is in bounds. @@ -53386,6 +53393,13 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) return SDValue(); + // If we're inserting a bit then it must be the LSB. + if (InsertBit) { + KnownBits KnownInsert = DAG.computeKnownBits(InsertBit); + if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1)) + return SDValue(); + } + // Split the shift into an alignment shift that moves the active i32 block to // the bottom bits for truncation and a modulo shift that can act on the i32. EVT AmtVT = ShAmt.getValueType(); @@ -53393,6 +53407,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, DAG.getSignedConstant(-32LL, DL, AmtVT)); SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); + ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8); // Compute the byte offset for the i32 block that is changed by the RMW. // combineTruncate will adjust the load for us in a similar way. 
@@ -53407,13 +53422,23 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); - SDValue Mask = - DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), - DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); - if (StoredVal.getOpcode() == ISD::AND) - Mask = DAG.getNOT(DL, Mask, MVT::i32); + SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(1, DL, MVT::i32), ModuloAmt); + + SDValue Res; + if (InsertBit) { + SDValue BitMask = + DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt); + Res = + DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32)); + Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask); + } else { + if (StoredVal.getOpcode() == ISD::AND) + Mask = DAG.getNOT(DL, Mask, MVT::i32); + Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); + } - SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), Align(), St->getMemOperand()->getFlags()); } diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 06e7d4773c58d..5776c6c82bcc3 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE -; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX +; RUN: llc < %s 
-mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX ; bt/btc/btr/bts patterns + 'init' to set single bit value in large integers @@ -356,41 +356,20 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB9_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl $0, %edx -; X86-NEXT: .LBB9_2: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: notl %esi -; X86-NEXT: notl %edx -; X86-NEXT: je .LBB9_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB9_4: -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: andl (%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl $32, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl (%ebx,%eax), %eax -; X86-NEXT: btl %ecx, %eax +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: movl %esi, 4(%ebx) -; X86-NEXT: movl %edx, (%ebx) +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -600,201 +579,55 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i128: ; X86: 
# %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $96, %esp -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movzbl 16(%ebp), %ebx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %edi -; X86-NEXT: movl 72(%esp,%edi), %edx -; X86-NEXT: movl 76(%esp,%edi), %esi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 64(%esp,%edi), %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: movl 68(%esp,%edi), %ebx -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shldl %cl, %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: notl %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl 40(%esp,%eax), %edi -; X86-NEXT: movl 44(%esp,%eax), %esi -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %edi, %esi 
-; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 12(%ecx), %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl 36(%esp,%esi), %esi -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: andl 8(%edx), %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: notl %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl 32(%esp,%eax), %eax -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl 8(%ebp), %edi -; X86-NEXT: andl 4(%edi), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: andl (%edi), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl $96, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl (%edi,%eax), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 8(%edi) -; X86-NEXT: movl %ebx, 4(%edi) -; X86-NEXT: movl %edx, (%edi) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, 
%ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %esi -; SSE-NEXT: xorl %r8d, %r8d -; SSE-NEXT: shldq %cl, %rsi, %r8 -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %r9d, %r9d -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rsi, %r8 -; SSE-NEXT: cmovneq %r9, %rsi -; SSE-NEXT: notq %r8 -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %r9, %rax -; SSE-NEXT: notq %rsi -; SSE-NEXT: andq 8(%rdi), %r8 -; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq (%rdi), %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: movl %ecx, %eax -; SSE-NEXT: andl $96, %eax -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: movl (%rdi,%rax), %eax -; SSE-NEXT: btl %ecx, %eax +; SSE-NEXT: andl $96, %esi +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d ; SSE-NEXT: setae %al -; SSE-NEXT: movq %r8, 8(%rdi) -; SSE-NEXT: movq %rsi, (%rdi) +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX2-LABEL: init_eq_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %eax -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %rax, %rsi -; AVX2-NEXT: movl %edx, %edx -; AVX2-NEXT: xorl %r8d, %r8d -; AVX2-NEXT: shldq %cl, %rdx, %r8 -; AVX2-NEXT: xorl %r9d, %r9d -; AVX2-NEXT: shlxq %rcx, %rax, %rax -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rax, %rsi -; AVX2-NEXT: cmovneq %r9, %rax -; AVX2-NEXT: shlxq %rcx, %rdx, %rdx -; AVX2-NEXT: cmovneq %rdx, %r8 -; AVX2-NEXT: cmovneq %r9, %rdx -; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi -; AVX2-NEXT: orq %r8, %rsi -; AVX2-NEXT: andnq (%rdi), %rax, %r8 -; AVX2-NEXT: orq %rdx, %r8 -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: 
andl $96, %eax -; AVX2-NEXT: shrl $3, %eax -; AVX2-NEXT: movl (%rdi,%rax), %eax -; AVX2-NEXT: btl %ecx, %eax -; AVX2-NEXT: setae %al -; AVX2-NEXT: movq %rsi, 8(%rdi) -; AVX2-NEXT: movq %r8, (%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %eax -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %rax, %rsi -; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: shlxq %rcx, %rax, %rax -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: xorl %r9d, %r9d -; AVX512-NEXT: shldq %cl, %rdx, %r9 -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rax, %rsi -; AVX512-NEXT: cmovneq %r8, %rax -; AVX512-NEXT: shlxq %rcx, %rdx, %rdx -; AVX512-NEXT: cmovneq %rdx, %r9 -; AVX512-NEXT: cmovneq %r8, %rdx -; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi -; AVX512-NEXT: orq %r9, %rsi -; AVX512-NEXT: andnq (%rdi), %rax, %r8 -; AVX512-NEXT: orq %rdx, %r8 -; AVX512-NEXT: movl %ecx, %eax -; AVX512-NEXT: andl $96, %eax -; AVX512-NEXT: shrl $3, %eax -; AVX512-NEXT: movl (%rdi,%rax), %eax -; AVX512-NEXT: btl %ecx, %eax -; AVX512-NEXT: setae %al -; AVX512-NEXT: movq %rsi, 8(%rdi) -; AVX512-NEXT: movq %r8, (%rdi) -; AVX512-NEXT: retq +; AVX-LABEL: init_eq_i128: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: andl $96, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -970,665 +803,55 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: 
andl $-16, %esp -; X86-NEXT: subl $352, %esp # imm = 0x160 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrl $3, %edx -; X86-NEXT: andl $60, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%eax), %esi -; X86-NEXT: movl 60(%eax), %ebx -; X86-NEXT: movl 52(%eax), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%eax), %edi -; X86-NEXT: movl 44(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%eax), %edx -; X86-NEXT: movl 
%edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl 16(%ebp), %eax -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, 
%eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: 
movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%eax), %esi -; X86-NEXT: movl 60(%eax), %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: andl 60(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 52(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 56(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 48(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 52(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 44(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 48(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 40(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 44(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 36(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 40(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 32(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 36(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 28(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 32(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 24(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 28(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 20(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 24(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 16(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 20(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 12(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 16(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 8(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 12(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 4(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 8(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: movl (%eax), %eax -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: andl 4(%edx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: andl (%edx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%edx,%eax), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 60(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 56(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 52(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 48(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 44(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 36(%edx) -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 32(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 24(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl %ebx, 8(%edx) -; X86-NEXT: movl %edi, 4(%edx) -; X86-NEXT: movl %esi, (%edx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i512: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $184, %rsp -; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movl %esi, 
%ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: movl %esi, %eax -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: # kill: def $eax killed $eax killed $rax -; SSE-NEXT: andl $56, %eax -; SSE-NEXT: negl %eax -; SSE-NEXT: movslq %eax, %r12 -; SSE-NEXT: movq 160(%rsp,%r12), %rax -; SSE-NEXT: movq 168(%rsp,%r12), %r10 -; SSE-NEXT: shldq %cl, %rax, %r10 -; SSE-NEXT: movq 152(%rsp,%r12), %rsi -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 144(%rsp,%r12), %r11 -; SSE-NEXT: shldq %cl, %r11, %rsi -; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 136(%rsp,%r12), %rbx -; SSE-NEXT: shldq %cl, %rbx, %r11 -; SSE-NEXT: movq 128(%rsp,%r12), %r14 -; SSE-NEXT: shldq %cl, %r14, %rbx -; SSE-NEXT: movq 120(%rsp,%r12), %r15 -; SSE-NEXT: shldq %cl, %r15, %r14 -; SSE-NEXT: movq 112(%rsp,%r12), %r13 -; SSE-NEXT: shldq %cl, %r13, %r15 -; SSE-NEXT: shlq %cl, %r13 -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq 32(%rsp,%r12), %rax -; SSE-NEXT: movq 40(%rsp,%r12), %rdx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 24(%rsp,%r12), %rdx -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq 16(%rsp,%r12), %rsi -; SSE-NEXT: shldq %cl, %rsi, %rdx -; SSE-NEXT: movq 8(%rsp,%r12), %r8 -; SSE-NEXT: shldq %cl, %r8, %rsi -; SSE-NEXT: movq (%rsp,%r12), %rbp -; SSE-NEXT: shldq %cl, %rbp, %r8 -; SSE-NEXT: movq -8(%rsp,%r12), %r9 -; SSE-NEXT: shldq %cl, %r9, %rbp -; SSE-NEXT: notq %r10 -; SSE-NEXT: andq 
56(%rdi), %r10 -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE-NEXT: notq %r10 -; SSE-NEXT: andq 48(%rdi), %r10 -; SSE-NEXT: orq %rax, %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: notq %rax -; SSE-NEXT: andq 40(%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: notq %r11 -; SSE-NEXT: andq 32(%rdi), %r11 -; SSE-NEXT: orq %rsi, %r11 -; SSE-NEXT: notq %rbx -; SSE-NEXT: andq 24(%rdi), %rbx -; SSE-NEXT: orq %r8, %rbx -; SSE-NEXT: notq %r14 -; SSE-NEXT: andq 16(%rdi), %r14 -; SSE-NEXT: orq %rbp, %r14 -; SSE-NEXT: notq %r15 -; SSE-NEXT: movq -16(%rsp,%r12), %rax -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: andq 8(%rdi), %r15 -; SSE-NEXT: orq %r9, %r15 -; SSE-NEXT: notq %r13 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: andq (%rdi), %r13 -; SSE-NEXT: orq %rax, %r13 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andl $60, %eax -; SSE-NEXT: movl (%rdi,%rax), %eax -; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; SSE-NEXT: btl %ecx, %eax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: movq %rax, 56(%rdi) -; SSE-NEXT: movq %r10, 48(%rdi) -; SSE-NEXT: movq %rdx, 40(%rdi) -; SSE-NEXT: movq %r11, 32(%rdi) -; SSE-NEXT: movq %rbx, 24(%rdi) -; SSE-NEXT: movq %r14, 16(%rdi) -; SSE-NEXT: movq %r15, 8(%rdi) -; SSE-NEXT: movq %r13, (%rdi) +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $60, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d ; SSE-NEXT: setae %al -; SSE-NEXT: addq $184, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: 
movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX2-LABEL: init_eq_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $168, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: movl %esi, %r11d -; AVX2-NEXT: shrl $3, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: andl $56, %eax -; AVX2-NEXT: negl %eax -; AVX2-NEXT: movslq %eax, %r10 -; AVX2-NEXT: movq 104(%rsp,%r10), %r15 -; AVX2-NEXT: movq 112(%rsp,%r10), %rax -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: shldq %cl, %r15, %rsi -; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 120(%rsp,%r10), %rsi -; AVX2-NEXT: movq %rsi, %r8 -; AVX2-NEXT: shldq %cl, %rax, %r8 -; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 128(%rsp,%r10), %rax -; AVX2-NEXT: movq %rax, %rbx -; AVX2-NEXT: shldq %cl, %rsi, %rbx -; AVX2-NEXT: movq 136(%rsp,%r10), %rsi -; AVX2-NEXT: movq %rsi, %r14 -; AVX2-NEXT: shldq %cl, %rax, %r14 -; AVX2-NEXT: movq 144(%rsp,%r10), %rax -; AVX2-NEXT: movq %rax, %r12 -; AVX2-NEXT: shldq %cl, %rsi, %r12 -; AVX2-NEXT: movq 96(%rsp,%r10), %rsi -; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 152(%rsp,%r10), %r13 -; AVX2-NEXT: shldq %cl, %rax, %r13 -; AVX2-NEXT: shldq %cl, %rsi, %r15 -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, 
-{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 16(%rsp,%r10), %rbp -; AVX2-NEXT: movq 24(%rsp,%r10), %r9 -; AVX2-NEXT: shldq %cl, %rbp, %r9 -; AVX2-NEXT: movq 8(%rsp,%r10), %rdx -; AVX2-NEXT: shldq %cl, %rdx, %rbp -; AVX2-NEXT: movq (%rsp,%r10), %rax -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: movq -8(%rsp,%r10), %r8 -; AVX2-NEXT: shldq %cl, %r8, %rax -; AVX2-NEXT: movq -16(%rsp,%r10), %rsi -; AVX2-NEXT: shldq %cl, %rsi, %r8 -; AVX2-NEXT: andnq 56(%rdi), %r13, %r13 -; AVX2-NEXT: orq %r9, %r13 -; AVX2-NEXT: movq -24(%rsp,%r10), %r9 -; AVX2-NEXT: shldq %cl, %r9, %rsi -; AVX2-NEXT: andnq 48(%rdi), %r12, %r12 -; AVX2-NEXT: andnq 40(%rdi), %r14, %r14 -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: orq %rdx, %r14 -; AVX2-NEXT: andnq 32(%rdi), %rbx, %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; AVX2-NEXT: movq -32(%rsp,%r10), %r10 -; AVX2-NEXT: shlxq %rcx, %r10, %rbx -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %r10, %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: andnq 24(%rdi), %rcx, %rcx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andnq 16(%rdi), %r10, %r10 -; AVX2-NEXT: orq %r8, %rcx -; AVX2-NEXT: orq %rsi, %r10 -; AVX2-NEXT: andnq 8(%rdi), %r15, %rsi -; AVX2-NEXT: orq %r9, %rsi -; AVX2-NEXT: andnq (%rdi), %rax, %rax -; AVX2-NEXT: orq %rbx, %rax -; AVX2-NEXT: andl $60, %r11d -; AVX2-NEXT: movl (%rdi,%r11), %r8d -; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload -; AVX2-NEXT: btl %r9d, %r8d -; AVX2-NEXT: movq %r13, 56(%rdi) -; AVX2-NEXT: movq %r12, 48(%rdi) -; AVX2-NEXT: movq %r14, 40(%rdi) -; AVX2-NEXT: movq %rdx, 32(%rdi) -; AVX2-NEXT: movq %rcx, 24(%rdi) -; AVX2-NEXT: movq %r10, 16(%rdi) -; AVX2-NEXT: movq %rsi, 8(%rdi) -; AVX2-NEXT: movq %rax, (%rdi) -; AVX2-NEXT: setae %al -; AVX2-NEXT: addq $168, %rsp 
-; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $152, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: movl %esi, %r8d -; AVX512-NEXT: shrl $3, %r8d -; AVX512-NEXT: movl %r8d, %eax -; AVX512-NEXT: andl $56, %eax -; AVX512-NEXT: negl %eax -; AVX512-NEXT: movslq %eax, %r9 -; AVX512-NEXT: movq 88(%rsp,%r9), %r10 -; AVX512-NEXT: movq 96(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %rsi -; AVX512-NEXT: shldq %cl, %r10, %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 104(%rsp,%r9), %rsi -; AVX512-NEXT: movq %rsi, %r11 -; AVX512-NEXT: shldq %cl, %rax, %r11 -; AVX512-NEXT: movq 112(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %rbx -; AVX512-NEXT: shldq %cl, %rsi, %rbx -; AVX512-NEXT: movq 120(%rsp,%r9), %rsi -; AVX512-NEXT: movq %rsi, %r14 -; AVX512-NEXT: shldq %cl, %rax, %r14 -; AVX512-NEXT: movq 128(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %r12 -; AVX512-NEXT: shldq %cl, %rsi, %r12 -; AVX512-NEXT: movq 136(%rsp,%r9), %r13 -; AVX512-NEXT: shldq %cl, %rax, %r13 -; AVX512-NEXT: movq 80(%rsp,%r9), %r15 -; AVX512-NEXT: shldq %cl, %r15, %r10 -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: 
vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rsp,%r9), %rbp -; AVX512-NEXT: movq 8(%rsp,%r9), %rsi -; AVX512-NEXT: shldq %cl, %rbp, %rsi -; AVX512-NEXT: movq -8(%rsp,%r9), %rdx -; AVX512-NEXT: shldq %cl, %rdx, %rbp -; AVX512-NEXT: movq -16(%rsp,%r9), %rax -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: andnq 56(%rdi), %r13, %r13 -; AVX512-NEXT: andnq 48(%rdi), %r12, %r12 -; AVX512-NEXT: orq %rsi, %r13 -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: andnq 40(%rdi), %r14, %r14 -; AVX512-NEXT: orq %rdx, %r14 -; AVX512-NEXT: movq -24(%rsp,%r9), %rsi -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: andnq 32(%rdi), %rbx, %rdx -; AVX512-NEXT: orq %rax, %rdx -; AVX512-NEXT: movq -32(%rsp,%r9), %rax -; AVX512-NEXT: shldq %cl, %rax, %rsi -; AVX512-NEXT: shlxq %rcx, %r15, %rbx -; AVX512-NEXT: andnq 24(%rdi), %r11, %r11 -; AVX512-NEXT: orq %rsi, %r11 -; AVX512-NEXT: movq -48(%rsp,%r9), %rsi -; AVX512-NEXT: movq -40(%rsp,%r9), %r9 -; AVX512-NEXT: shldq %cl, %r9, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: andnq 16(%rdi), %r15, %r15 -; AVX512-NEXT: orq %rax, %r15 -; AVX512-NEXT: shlxq %rcx, %rsi, %rax -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rsi, %r9 -; AVX512-NEXT: andnq 8(%rdi), %r10, %rcx -; AVX512-NEXT: orq %r9, %rcx -; AVX512-NEXT: andnq (%rdi), %rbx, %rsi -; AVX512-NEXT: orq %rax, %rsi -; AVX512-NEXT: andl $60, %r8d -; AVX512-NEXT: movl (%rdi,%r8), %eax -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload -; AVX512-NEXT: btl %r8d, %eax -; AVX512-NEXT: movq %r13, 56(%rdi) -; AVX512-NEXT: movq %r12, 48(%rdi) -; AVX512-NEXT: movq %r14, 40(%rdi) -; AVX512-NEXT: movq %rdx, 32(%rdi) -; AVX512-NEXT: movq %r11, 24(%rdi) -; AVX512-NEXT: movq %r15, 16(%rdi) -; AVX512-NEXT: movq 
%rcx, 8(%rdi) -; AVX512-NEXT: movq %rsi, (%rdi) -; AVX512-NEXT: setae %al -; AVX512-NEXT: addq $152, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: init_eq_i512: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: andl $60, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) +; AVX-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs From 18ee67dc9219582318c3bb6c0dd7d21e08259759 Mon Sep 17 00:00:00 2001 From: Kunqiu Chen Date: Fri, 31 Oct 2025 21:04:35 +0800 Subject: [PATCH 335/539] [UTC] Support to test annotated IR (#165419) Some analysis/transformation, e.g., predicate info/ mem ssa, insert instruction annotations as comments, referring to https://github.com/llvm/llvm-project/pull/165249#discussion_r2466200672. This PR makes UTC support checking these instruction annotations with an extra UTC option `-check-inst-comments`. E.g., Before: ```LLVM ; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 ``` After: ```LLVM ; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label %nope], RenamedOp: [[Z]] } ; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 ``` This PR also regenerates all UTC-generated tests for PredicateInfo; No MemSSA test is updated, as there are no UTC-generated tests designated for `print`. 
--- .../Util/PredicateInfo/branch-on-same-cond.ll | 36 +- .../Transforms/Util/PredicateInfo/condprop.ll | 103 +++-- .../Transforms/Util/PredicateInfo/diamond.ll | 20 +- .../Transforms/Util/PredicateInfo/edge.ll | 67 ++-- .../Transforms/Util/PredicateInfo/pr33456.ll | 19 +- .../Transforms/Util/PredicateInfo/pr33457.ll | 27 +- .../Util/PredicateInfo/testandor.ll | 354 ++++++++++++------ llvm/utils/UpdateTestChecks/common.py | 25 +- llvm/utils/update_test_checks.py | 8 + 9 files changed, 419 insertions(+), 240 deletions(-) diff --git a/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll b/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll index 0be13ee76bece..f024106b7299a 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments ; RUN: opt -S -passes=print-predicateinfo < %s 2>&1 >/dev/null | FileCheck %s ; FIXME: RenamedOp should be %cmp or %x in all cases here, @@ -9,25 +9,25 @@ define i32 @test(i32 %x) { ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 -; CHECK: RenamedOp: [[CMP]] -; CHECK: [[CMP_0:%.*]] = bitcast i1 [[CMP]] to i1 -; CHECK: RenamedOp: [[X]] -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[BB2:%.*]], label [[EXIT1:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB1]],label [[BB2:%.*]]], RenamedOp: [[CMP]] } +; CHECK-NEXT: [[CMP_0:%.*]] = bitcast i1 [[CMP]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB1]],label [[BB2]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 
[[CMP]], label [[BB2]], label [[EXIT1:%.*]] ; CHECK: bb2: -; CHECK: RenamedOp: [[CMP_0]] -; CHECK: [[CMP_0_1:%.*]] = bitcast i1 [[CMP_0]] to i1 -; CHECK: RenamedOp: [[X]] -; CHECK: [[X_0_1:%.*]] = bitcast i32 [[X_0]] to i32 -; CHECK: RenamedOp: [[X_0]] -; CHECK: [[X_0_4:%.*]] = bitcast i32 [[X_0]] to i32 -; CHECK-NEXT: br i1 [[CMP_0]], label [[BB3:%.*]], label [[EXIT2:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB2]],label [[BB3:%.*]]], RenamedOp: [[CMP_0]] } +; CHECK-NEXT: [[CMP_0_1:%.*]] = bitcast i1 [[CMP_0]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB2]],label [[BB3]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0_1:%.*]] = bitcast i32 [[X_0]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB2]],label [[EXIT2:%.*]]], RenamedOp: [[X_0]] } +; CHECK-NEXT: [[X_0_4:%.*]] = bitcast i32 [[X_0]] to i32 +; CHECK-NEXT: br i1 [[CMP_0]], label [[BB3]], label [[EXIT2]] ; CHECK: bb3: -; CHECK: RenamedOp: [[X]] -; CHECK: [[X_0_1_2:%.*]] = bitcast i32 [[X_0_1]] to i32 -; CHECK: RenamedOp: [[X_0_1]] -; CHECK: [[X_0_1_3:%.*]] = bitcast i32 [[X_0_1]] to i32 -; CHECK-NEXT: br i1 [[CMP_0_1]], label [[EXIT3:%.*]], label [[EXIT4:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB3]],label [[EXIT3:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0_1_2:%.*]] = bitcast i32 [[X_0_1]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB3]],label [[EXIT4:%.*]]], RenamedOp: [[X_0_1]] } +; CHECK-NEXT: [[X_0_1_3:%.*]] = bitcast i32 [[X_0_1]] to i32 +; CHECK-NEXT: br i1 [[CMP_0_1]], label [[EXIT3]], label [[EXIT4]] ; CHECK: exit1: ; CHECK-NEXT: ret i32 0 ; CHECK: exit2: diff --git a/llvm/test/Transforms/Util/PredicateInfo/condprop.ll 
b/llvm/test/Transforms/Util/PredicateInfo/condprop.ll index 256d0d908ec1e..42e8ccb760b3f 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/condprop.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/condprop.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments ; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s @a = external global i32 ; [#uses=7] @@ -98,12 +98,17 @@ define void @test3(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[Z]], label [[BOTH_ZERO:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH_ZERO:%.*]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH_ZERO]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH_ZERO]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 
0 Edge: [label [[TMP0]],label [[BOTH_ZERO]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[Z]], label [[BOTH_ZERO]], label [[NOPE]] ; CHECK: both_zero: ; CHECK-NEXT: call void @foo(i1 [[XZ_0]]) ; CHECK-NEXT: call void @foo(i1 [[YZ_0]]) @@ -133,10 +138,11 @@ define void @test4(i1 %b, i32 %x) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: br i1 [[B:%.*]], label [[SW:%.*]], label [[CASE3:%.*]] ; CHECK: sw: -; CHECK: [[X_0:%.*]] = bitcast i32 [[X:%.*]] to i32 +; CHECK-NEXT: ; switch predicate info { CaseValue: i32 1 Edge: [label [[SW]],label [[CASE1:%.*]]], RenamedOp: [[X:%.*]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 ; CHECK-NEXT: switch i32 [[X]], label [[DEFAULT:%.*]] [ ; CHECK-NEXT: i32 0, label [[CASE0:%.*]] -; CHECK-NEXT: i32 1, label [[CASE1:%.*]] +; CHECK-NEXT: i32 1, label [[CASE1]] ; CHECK-NEXT: i32 2, label [[CASE0]] ; CHECK-NEXT: i32 3, label [[CASE3]] ; CHECK-NEXT: i32 4, label [[DEFAULT]] @@ -180,11 +186,15 @@ case3: define i1 @test5(i32 %x, i32 %y) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]] -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[X_1:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK: [[Y_1:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], [[Y]] Edge: [label [[TMP0:%.*]],label [[SAME:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_1:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[SAME]]], RenamedOp: [[Y]] } +; 
CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_1:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[SAME]], label [[DIFFERENT]] ; CHECK: same: ; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i32 [[X_0]], [[Y_0]] ; CHECK-NEXT: ret i1 [[CMP2]] @@ -253,11 +263,15 @@ different: define i1 @test7(i32 %x, i32 %y) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]] -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[X_1:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK: [[Y_1:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp sgt i32 [[X]], [[Y]] Edge: [label [[TMP0:%.*]],label [[SAME:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp sgt i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_1:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp sgt i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[SAME]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp sgt i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_1:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[SAME]], label [[DIFFERENT]] ; CHECK: same: ; CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[X_0]], [[Y_0]] ; CHECK-NEXT: ret i1 [[CMP2]] @@ -280,11 +294,15 @@ different: define i1 @test7_fp(float %x, float %y) { ; CHECK-LABEL: 
@test7_fp( ; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]] -; CHECK: [[X_0:%.*]] = bitcast float [[X]] to float -; CHECK: [[X_1:%.*]] = bitcast float [[X]] to float -; CHECK: [[Y_0:%.*]] = bitcast float [[Y]] to float -; CHECK: [[Y_1:%.*]] = bitcast float [[Y]] to float -; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp ogt float [[X]], [[Y]] Edge: [label [[TMP0:%.*]],label [[SAME:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast float [[X]] to float +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp ogt float [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_1:%.*]] = bitcast float [[X]] to float +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp ogt float [[X]], [[Y]] Edge: [label [[TMP0]],label [[SAME]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast float [[Y]] to float +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp ogt float [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_1:%.*]] = bitcast float [[Y]] to float +; CHECK-NEXT: br i1 [[CMP]], label [[SAME]], label [[DIFFERENT]] ; CHECK: same: ; CHECK-NEXT: [[CMP2:%.*]] = fcmp ule float [[X_0]], [[Y_0]] ; CHECK-NEXT: ret i1 [[CMP2]] @@ -353,9 +371,11 @@ different: define i32 @test9(i32 %i, i32 %j) { ; CHECK-LABEL: @test9( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]] -; CHECK: [[I_0:%.*]] = bitcast i32 [[I]] to i32 -; CHECK: [[J_0:%.*]] = bitcast i32 [[J]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[I]], [[J]] Edge: [label [[TMP0:%.*]],label [[COND_TRUE:%.*]]], RenamedOp: [[I]] } +; CHECK-NEXT: [[I_0:%.*]] = bitcast i32 [[I]] to i32 +; CHECK-NEXT: ; 
branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[I]], [[J]] Edge: [label [[TMP0]],label [[COND_TRUE]]], RenamedOp: [[J]] } +; CHECK-NEXT: [[J_0:%.*]] = bitcast i32 [[J]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE]], label [[RET:%.*]] ; CHECK: cond_true: ; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]] ; CHECK-NEXT: ret i32 [[DIFF]] @@ -376,9 +396,11 @@ ret: define i32 @test10(i32 %j, i32 %i) { ; CHECK-LABEL: @test10( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]] -; CHECK: [[I_0:%.*]] = bitcast i32 [[I]] to i32 -; CHECK: [[J_0:%.*]] = bitcast i32 [[J]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[I]], [[J]] Edge: [label [[TMP0:%.*]],label [[COND_TRUE:%.*]]], RenamedOp: [[I]] } +; CHECK-NEXT: [[I_0:%.*]] = bitcast i32 [[I]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[I]], [[J]] Edge: [label [[TMP0]],label [[COND_TRUE]]], RenamedOp: [[J]] } +; CHECK-NEXT: [[J_0:%.*]] = bitcast i32 [[J]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE]], label [[RET:%.*]] ; CHECK: cond_true: ; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]] ; CHECK-NEXT: ret i32 [[DIFF]] @@ -403,15 +425,18 @@ define i32 @test11(i32 %x) { ; CHECK-NEXT: [[V0:%.*]] = call i32 @yogibar() ; CHECK-NEXT: [[V1:%.*]] = call i32 @yogibar() ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V0]], [[V1]] -; CHECK: [[V0_0:%.*]] = bitcast i32 [[V0]] to i32 -; CHECK: [[V1_0:%.*]] = bitcast i32 [[V1]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[NEXT:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[V0]], [[V1]] Edge: [label [[TMP0:%.*]],label [[NEXT:%.*]]], RenamedOp: [[V0]] } +; CHECK-NEXT: [[V0_0:%.*]] = bitcast i32 [[V0]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 
[[V0]], [[V1]] Edge: [label [[TMP0]],label [[COND_TRUE:%.*]]], RenamedOp: [[V1]] } +; CHECK-NEXT: [[V1_0:%.*]] = bitcast i32 [[V1]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE]], label [[NEXT]] ; CHECK: cond_true: ; CHECK-NEXT: ret i32 [[V1_0]] ; CHECK: next: ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[X:%.*]], [[V0_0]] -; CHECK: [[V0_0_1:%.*]] = bitcast i32 [[V0_0]] to i32 -; CHECK-NEXT: br i1 [[CMP2]], label [[COND_TRUE2:%.*]], label [[NEXT2:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP2]] = icmp eq i32 [[X]], [[V0_0]] Edge: [label [[NEXT]],label [[COND_TRUE2:%.*]]], RenamedOp: [[V0_0]] } +; CHECK-NEXT: [[V0_0_1:%.*]] = bitcast i32 [[V0_0]] to i32 +; CHECK-NEXT: br i1 [[CMP2]], label [[COND_TRUE2]], label [[NEXT2:%.*]] ; CHECK: cond_true2: ; CHECK-NEXT: ret i32 [[V0_0_1]] ; CHECK: next2: @@ -439,9 +464,11 @@ next2: define i32 @test12(i32 %x) { ; CHECK-LABEL: @test12( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[X_1:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0:%.*]],label [[COND_TRUE:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[COND_FALSE:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_1:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE]], label [[COND_FALSE]] ; CHECK: cond_true: ; CHECK-NEXT: br label [[RET:%.*]] ; CHECK: cond_false: diff --git a/llvm/test/Transforms/Util/PredicateInfo/diamond.ll b/llvm/test/Transforms/Util/PredicateInfo/diamond.ll index ac2c9a1026e76..06c02d699c511 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/diamond.ll +++ 
b/llvm/test/Transforms/Util/PredicateInfo/diamond.ll @@ -1,16 +1,18 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments +; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s define i1 @f(i32 %x, i1 %y) { ; CHECK-LABEL: @f( ; CHECK-NEXT: br i1 [[Y:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] ; CHECK: bb0: ; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[X:%.*]], 0 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp sge i32 [[X]], 0 Edge: [label [[BB0]],label [[BB2:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB3:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[X2:%.*]] = add nuw nsw i32 [[X]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 [[X2]], 2 -; CHECK: [[X2_0:%.*]] = bitcast i32 [[X2]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP2]] = icmp sge i32 [[X2]], 2 Edge: [label [[BB1]],label [[BB2]]], RenamedOp: [[X2]] } +; CHECK-NEXT: [[X2_0:%.*]] = bitcast i32 [[X2]] to i32 ; CHECK-NEXT: br i1 [[CMP2]], label [[BB2]], label [[BB3]] ; CHECK: bb2: ; CHECK-NEXT: [[X3:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ [[X2_0]], [[BB1]] ] @@ -38,12 +40,14 @@ define i1 @g(i32 %x, i1 %y) { ; CHECK-NEXT: br i1 [[Y:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] ; CHECK: bb0: ; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[X:%.*]], 0 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[BB3:%.*]], label [[BB2:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp sge i32 [[X]], 0 Edge: [label [[BB0]],label [[BB2:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = 
bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[BB3:%.*]], label [[BB2]] ; CHECK: bb1: ; CHECK-NEXT: [[X2:%.*]] = add nuw nsw i32 [[X]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 [[X2]], 2 -; CHECK: [[X2_0:%.*]] = bitcast i32 [[X2]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP2]] = icmp sge i32 [[X2]], 2 Edge: [label [[BB1]],label [[BB2]]], RenamedOp: [[X2]] } +; CHECK-NEXT: [[X2_0:%.*]] = bitcast i32 [[X2]] to i32 ; CHECK-NEXT: br i1 [[CMP2]], label [[BB3]], label [[BB2]] ; CHECK: bb2: ; CHECK-NEXT: [[X3:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ [[X2_0]], [[BB1]] ] diff --git a/llvm/test/Transforms/Util/PredicateInfo/edge.ll b/llvm/test/Transforms/Util/PredicateInfo/edge.ll index ef757f323921a..913832696215e 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/edge.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/edge.ll @@ -1,16 +1,17 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments +; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s define i32 @f1(i32 %x) { ; CHECK-LABEL: @f1( ; CHECK-NEXT: bb0: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[BB2:%.*]], label [[BB1:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB0:%.*]],label [[BB2:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB1:%.*]] ; CHECK: bb1: ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb2: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ 0, [[BB1]] ] ; CHECK-NEXT: [[FOO:%.*]] = add i32 [[COND]], 
[[X]] ; CHECK-NEXT: ret i32 [[FOO]] ; @@ -29,12 +30,13 @@ define i32 @f2(i32 %x) { ; CHECK-LABEL: @f2( ; CHECK-NEXT: bb0: ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[X:%.*]], 0 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp ne i32 [[X]], 0 Edge: [label [[BB0:%.*]],label [[BB2:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[BB1:%.*]], label [[BB2]] ; CHECK: bb1: ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb2: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ 0, [[BB1]] ] ; CHECK-NEXT: [[FOO:%.*]] = add i32 [[COND]], [[X]] ; CHECK-NEXT: ret i32 [[FOO]] ; @@ -52,14 +54,15 @@ bb2: define i32 @f3(i32 %x) { ; CHECK-LABEL: @f3( ; CHECK-NEXT: bb0: -; CHECK: [[X_0:%.*]] = bitcast i32 [[X:%.*]] to i32 +; CHECK-NEXT: ; switch predicate info { CaseValue: i32 0 Edge: [label [[BB0:%.*]],label [[BB2:%.*]]], RenamedOp: [[X:%.*]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 ; CHECK-NEXT: switch i32 [[X]], label [[BB1:%.*]] [ -; CHECK-NEXT: i32 0, label [[BB2:%.*]] +; CHECK-NEXT: i32 0, label [[BB2]] ; CHECK-NEXT: ] ; CHECK: bb1: ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb2: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ 0, [[BB1]] ] ; CHECK-NEXT: [[FOO:%.*]] = add i32 [[COND]], [[X]] ; CHECK-NEXT: ret i32 [[FOO]] ; @@ -78,13 +81,14 @@ define double @fcmp_oeq_not_zero(double %x, double %y) { ; CHECK-LABEL: @fcmp_oeq_not_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], 2.000000e+00 -; CHECK: [[Y_0:%.*]] = bitcast double [[Y]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]] +; CHECK-NEXT: ; branch predicate info { 
TrueEdge: 1 Comparison: [[CMP]] = fcmp oeq double [[Y]], 2.000000e+00 Edge: [label [[ENTRY:%.*]],label [[IF:%.*]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast double [[Y]] to double +; CHECK-NEXT: br i1 [[CMP]], label [[IF]], label [[RETURN:%.*]] ; CHECK: if: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: @@ -105,13 +109,14 @@ define double @fcmp_une_not_zero(double %x, double %y) { ; CHECK-LABEL: @fcmp_une_not_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[Y:%.*]], 2.000000e+00 -; CHECK: [[Y_0:%.*]] = bitcast double [[Y]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp une double [[Y]], 2.000000e+00 Edge: [label [[ENTRY:%.*]],label [[ELSE:%.*]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast double [[Y]] to double +; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE]] ; CHECK: else: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: @@ -132,13 +137,14 @@ define double @fcmp_oeq_zero(double %x, double %y) { ; CHECK-LABEL: @fcmp_oeq_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], 0.000000e+00 -; CHECK: [[Y_0:%.*]] = bitcast double [[Y]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp oeq double 
[[Y]], 0.000000e+00 Edge: [label [[ENTRY:%.*]],label [[IF:%.*]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast double [[Y]] to double +; CHECK-NEXT: br i1 [[CMP]], label [[IF]], label [[RETURN:%.*]] ; CHECK: if: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: @@ -159,13 +165,14 @@ define double @fcmp_une_zero(double %x, double %y) { ; CHECK-LABEL: @fcmp_une_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[Y:%.*]], -0.000000e+00 -; CHECK: [[Y_0:%.*]] = bitcast double [[Y]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp une double [[Y]], -0.000000e+00 Edge: [label [[ENTRY:%.*]],label [[ELSE:%.*]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast double [[Y]] to double +; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE]] ; CHECK: else: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: @@ -188,13 +195,14 @@ define double @fcmp_oeq_maybe_zero(double %x, double %y, double %z1, double %z2) ; CHECK-NEXT: entry: ; CHECK-NEXT: [[Z:%.*]] = fadd double [[Z1:%.*]], [[Z2:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], [[Z]] -; CHECK: [[Z_0:%.*]] = bitcast double [[Z]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp oeq double [[Y]], 
[[Z]] Edge: [label [[ENTRY:%.*]],label [[IF:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast double [[Z]] to double +; CHECK-NEXT: br i1 [[CMP]], label [[IF]], label [[RETURN:%.*]] ; CHECK: if: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Z_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: @@ -217,13 +225,14 @@ define double @fcmp_une_maybe_zero(double %x, double %y, double %z1, double %z2) ; CHECK-NEXT: entry: ; CHECK-NEXT: [[Z:%.*]] = fadd double [[Z1:%.*]], [[Z2:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[Y:%.*]], [[Z]] -; CHECK: [[Z_0:%.*]] = bitcast double [[Z]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp une double [[Y]], [[Z]] Edge: [label [[ENTRY:%.*]],label [[ELSE:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast double [[Z]] to double +; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE]] ; CHECK: else: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Z_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: diff --git a/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll b/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll index 36eaf6e66578d..4762d376ef5aa 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s +; NOTE: 
Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments +; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s ; Don't insert predicate info for conditions with a single target. @a = global i32 1, align 4 @d = common global i32 0, align 4 @@ -12,22 +12,27 @@ define i32 @main() { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @d, align 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP13:%.*]] -; CHECK: [[TMP4:%.*]] = load i32, ptr @a, align 4 +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @a, align 4 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr @c, align 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp slt i32 [[TMP5]], 1 ; CHECK-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] -; CHECK: [[TMP8:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[TMP8]], label [[TMP9]], label [[TMP9]] -; CHECK: [[DOT0:%.*]] = phi i32 [ [[TMP4]], [[TMP7]] ], [ [[TMP4]], [[TMP7]] ], [ [[DOT1:%.*]], [[TMP13]] ], [ [[TMP4]], [[TMP3]] ] +; CHECK: 9: +; CHECK-NEXT: [[DOT0:%.*]] = phi i32 [ [[TMP4]], [[TMP7]] ], [ [[TMP4]], [[TMP7]] ], [ [[DOT1:%.*]], [[TMP13]] ], [ [[TMP4]], [[TMP3]] ] ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr @b, align 4 ; CHECK-NEXT: [[TMP11:%.*]] = sdiv i32 [[TMP10]], [[DOT0]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 0 ; CHECK-NEXT: br i1 [[TMP12]], label [[TMP13]], label [[TMP13]] -; CHECK: [[DOT1]] = phi i32 [ [[DOT0]], [[TMP9]] ], [ [[DOT0]], [[TMP9]] ], [ undef, [[TMP0:%.*]] ] +; CHECK: 13: +; CHECK-NEXT: [[DOT1]] = phi i32 [ [[DOT0]], [[TMP9]] ], [ [[DOT0]], [[TMP9]] ], [ undef, [[TMP0:%.*]] ] ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr @e, align 4 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 0 ; CHECK-NEXT: br i1 [[TMP15]], label [[TMP16:%.*]], label [[TMP9]] -; CHECK: ret i32 0 +; CHECK: 16: +; CHECK-NEXT: ret i32 0 ; %1 = load i32, ptr 
@d, align 4 %2 = icmp eq i32 %1, 0 diff --git a/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll b/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll index bc1d39f371515..e4fd4cc6dd8a2 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments +; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s ; Don't insert predicate info for conditions with a single target. @a = global i32 6, align 4 @c = global i32 -1, align 4 @@ -13,26 +13,32 @@ define i32 @main() { ; CHECK-LABEL: @main( ; CHECK-NEXT: store i32 6, ptr @e, align 4 ; CHECK-NEXT: br label [[TMP1:%.*]] -; CHECK: [[TMP2:%.*]] = load i32, ptr @d, align 4 +; CHECK: 1: +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @d, align 4 ; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[TMP2]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [6 x i32], ptr @b, i64 0, i64 [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 (ptr, ...) 
@printf(ptr @.str, i32 [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr @a, align 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[TMP8]], label %thread-pre-split, label [[TMP9:%.*]] -; CHECK: [[TMP10:%.*]] = load i32, ptr @e, align 4 +; CHECK-NEXT: br i1 [[TMP8]], label [[THREAD_PRE_SPLIT:%.*]], label [[TMP9:%.*]] +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr @e, align 4 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0 ; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP12]] ; CHECK: thread-pre-split: ; CHECK-NEXT: [[DOTPR:%.*]] = load i32, ptr @e, align 4 ; CHECK-NEXT: br label [[TMP12]] -; CHECK: [[TMP13:%.*]] = phi i32 [ [[DOTPR]], %thread-pre-split ], [ [[TMP10]], [[TMP9]] ], [ [[TMP10]], [[TMP9]] ] +; CHECK: 12: +; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ [[DOTPR]], [[THREAD_PRE_SPLIT]] ], [ [[TMP10]], [[TMP9]] ], [ [[TMP10]], [[TMP9]] ] ; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 ; CHECK-NEXT: br i1 [[TMP14]], label [[TMP15:%.*]], label [[TMP15]] -; CHECK: br i1 [[TMP14]], label [[TMP16:%.*]], label [[TMP17:%.*]] -; CHECK: br label [[TMP17]] -; CHECK: [[DOT0:%.*]] = phi i32 [ 1, [[TMP16]] ], [ -1, [[TMP15]] ] +; CHECK: 15: +; CHECK-NEXT: br i1 [[TMP14]], label [[TMP16:%.*]], label [[TMP17:%.*]] +; CHECK: 16: +; CHECK-NEXT: br label [[TMP17]] +; CHECK: 17: +; CHECK-NEXT: [[DOT0:%.*]] = phi i32 [ 1, [[TMP16]] ], [ -1, [[TMP15]] ] ; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[DOT0]], 8693 ; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr @c, align 4 ; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]] @@ -40,7 +46,8 @@ define i32 @main() { ; CHECK-NEXT: store i32 [[TMP21]], ptr @d, align 4 ; CHECK-NEXT: [[TMP22:%.*]] = icmp slt i32 [[TMP20]], -2 ; CHECK-NEXT: br i1 [[TMP22]], label [[TMP1]], label [[TMP23:%.*]] -; CHECK: ret i32 0 +; CHECK: 23: +; CHECK-NEXT: ret i32 0 ; store i32 6, ptr @e, align 4 br label %1 diff --git a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll 
b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll index cc1dc4e6989a1..d29aadd54128c 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments ; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s declare void @foo(i1) @@ -10,12 +10,17 @@ define void @test_or(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = or i1 [[XZ]], [[YZ]] -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = or i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NEITHER:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label 
[[TMP0]],label [[NEITHER]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER]] ; CHECK: oneof: ; CHECK-NEXT: call void @foo(i1 [[XZ]]) ; CHECK-NEXT: call void @foo(i1 [[YZ]]) @@ -55,12 +60,17 @@ define void @test_or_logical(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = select i1 [[XZ]], i1 true, i1 [[YZ]] -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = select i1 [[XZ]], i1 true, i1 [[YZ]] Edge: [label [[TMP0:%.*]],label [[NEITHER:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER]] ; CHECK: oneof: ; CHECK-NEXT: call void 
@foo(i1 [[XZ]]) ; CHECK-NEXT: call void @foo(i1 [[YZ]]) @@ -100,12 +110,17 @@ define void @test_and(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH:%.*]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[Z]], label [[BOTH]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @foo(i1 [[XZ_0]]) ; CHECK-NEXT: call void @foo(i1 [[YZ_0]]) @@ -145,12 +160,17 @@ define void @test_and_logical(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = select i1 
[[XZ]], i1 [[YZ]], i1 false -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = select i1 [[XZ]], i1 [[YZ]], i1 false Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH:%.*]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[Z]], label [[BOTH]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @foo(i1 [[XZ_0]]) ; CHECK-NEXT: call void @foo(i1 [[YZ_0]]) @@ -190,12 +210,17 @@ define void @testandsame(i32 %x, i32 %y) { ; CHECK-NEXT: [[XGT:%.*]] = icmp sgt i32 [[X:%.*]], 0 ; CHECK-NEXT: [[XLT:%.*]] = icmp slt i32 [[X]], 100 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XGT]], [[XLT]] -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XGT_0:%.*]] = bitcast i1 [[XGT]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[X_0_1:%.*]] = bitcast i32 [[X_0]] to i32 -; CHECK: 
[[XLT_0:%.*]] = bitcast i1 [[XLT]] to i1 -; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XGT]], [[XLT]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XGT]] = icmp sgt i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH:%.*]]], RenamedOp: [[XGT]] } +; CHECK-NEXT: [[XGT_0:%.*]] = bitcast i1 [[XGT]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XGT]] = icmp sgt i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XLT]] = icmp slt i32 [[X]], 100 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0_1:%.*]] = bitcast i32 [[X_0]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XLT]] = icmp slt i32 [[X]], 100 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[XLT]] } +; CHECK-NEXT: [[XLT_0:%.*]] = bitcast i1 [[XLT]] to i1 +; CHECK-NEXT: br i1 [[Z]], label [[BOTH]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @foo(i1 [[XGT_0]]) ; CHECK-NEXT: call void @foo(i1 [[XLT_0]]) @@ -229,17 +254,27 @@ define void @testandassume(i32 %x, i32 %y) { ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] ; CHECK-NEXT: call void @llvm.assume(i1 [[Z]]) -; CHECK: [[TMP1:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK: [[TMP2:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[TMP3:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[TMP4:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[TMP5:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[DOT0:%.*]] = bitcast i1 [[TMP5]] to i1 -; CHECK: [[DOT01:%.*]] = bitcast i1 [[TMP4]] to i1 -; CHECK: [[DOT02:%.*]] = bitcast i32 [[TMP3]] to i32 -; CHECK: [[DOT03:%.*]] = bitcast i1 
[[TMP2]] to i1 -; CHECK: [[DOT04:%.*]] = bitcast i32 [[TMP1]] to i32 -; CHECK-NEXT: br i1 [[TMP5]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; assume predicate info { Comparison: [[YZ]] = icmp eq i32 [[Y]], 0, RenamedOp: [[Y]] } +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: ; assume predicate info { Comparison: [[YZ]] = icmp eq i32 [[Y]], 0, RenamedOp: [[YZ]] } +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[XZ]] = icmp eq i32 [[X]], 0, RenamedOp: [[X]] } +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; assume predicate info { Comparison: [[XZ]] = icmp eq i32 [[X]], 0, RenamedOp: [[XZ]] } +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[Z]] = and i1 [[XZ]], [[YZ]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[TMP5]] } +; CHECK-NEXT: [[DOT0:%.*]] = bitcast i1 [[TMP5]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH:%.*]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[DOT01:%.*]] = bitcast i1 [[TMP4]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[DOT02:%.*]] = bitcast i32 [[TMP3]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[DOT03:%.*]] = bitcast i1 [[TMP2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[DOT04:%.*]] = bitcast i32 [[TMP1]] to i32 +; 
CHECK-NEXT: br i1 [[TMP5]], label [[BOTH]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @foo(i1 [[DOT01]]) ; CHECK-NEXT: call void @foo(i1 [[DOT03]]) @@ -274,9 +309,11 @@ define void @testorassume(i32 %x, i32 %y) { ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = or i1 [[XZ]], [[YZ]] ; CHECK-NEXT: call void @llvm.assume(i1 [[Z]]) -; CHECK: [[TMP1:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[DOT0:%.*]] = bitcast i1 [[TMP1]] to i1 -; CHECK-NEXT: br i1 [[TMP1]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; assume predicate info { Comparison: [[Z]] = or i1 [[XZ]], [[YZ]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = or i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[TMP1]] } +; CHECK-NEXT: [[DOT0:%.*]] = bitcast i1 [[TMP1]] to i1 +; CHECK-NEXT: br i1 [[TMP1]], label [[BOTH:%.*]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @foo(i1 [[XZ]]) ; CHECK-NEXT: call void @foo(i1 [[YZ]]) @@ -307,12 +344,17 @@ define void @test_and_one_unknown_cond(i32 %x, i1 %c1) { ; CHECK-LABEL: @test_and_one_unknown_cond( ; CHECK-NEXT: [[C2:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[A:%.*]] = and i1 [[C1:%.*]], [[C2]] -; CHECK: [[A_0:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[A_1:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[C1_0:%.*]] = bitcast i1 [[C1]] to i1 -; CHECK: [[C2_0:%.*]] = bitcast i1 [[C2]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[A]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A]] = and i1 [[C1]], [[C2]] Edge: [label [[TMP0:%.*]],label [[BOTH:%.*]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_0:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A]] = and i1 [[C1]], [[C2]] Edge: [label [[TMP0]],label [[NOPE:%.*]]], RenamedOp: [[A]] } +; CHECK-NEXT: 
[[A_1:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison:i1 [[C1]] Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[C1]] } +; CHECK-NEXT: [[C1_0:%.*]] = bitcast i1 [[C1]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[C2]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[C2]] } +; CHECK-NEXT: [[C2_0:%.*]] = bitcast i1 [[C2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[C2]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[A]], label [[BOTH]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @bar(i32 [[X_0]]) ; CHECK-NEXT: call void @foo(i1 [[C1_0]]) @@ -349,12 +391,17 @@ define void @test_or_one_unknown_cond(i32 %x, i1 %c1) { ; CHECK-LABEL: @test_or_one_unknown_cond( ; CHECK-NEXT: [[C2:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[A:%.*]] = or i1 [[C1:%.*]], [[C2]] -; CHECK: [[A_0:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[A_1:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[C1_0:%.*]] = bitcast i1 [[C1]] to i1 -; CHECK: [[C2_0:%.*]] = bitcast i1 [[C2]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[A]], label [[NOPE:%.*]], label [[BOTH_INVERTED:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A]] = or i1 [[C1]], [[C2]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_0:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A]] = or i1 [[C1]], [[C2]] Edge: [label [[TMP0]],label [[BOTH_INVERTED:%.*]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_1:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison:i1 [[C1]] Edge: [label [[TMP0]],label [[BOTH_INVERTED]]], RenamedOp: [[C1]] } +; CHECK-NEXT: [[C1_0:%.*]] = bitcast i1 [[C1]] to i1 +; CHECK-NEXT: ; branch predicate 
info { TrueEdge: 0 Comparison: [[C2]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH_INVERTED]]], RenamedOp: [[C2]] } +; CHECK-NEXT: [[C2_0:%.*]] = bitcast i1 [[C2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[C2]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH_INVERTED]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[A]], label [[NOPE]], label [[BOTH_INVERTED]] ; CHECK: both_inverted: ; CHECK-NEXT: call void @bar(i32 [[X_0]]) ; CHECK-NEXT: call void @foo(i1 [[C1_0]]) @@ -391,13 +438,19 @@ define void @test_and_chain(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @test_and_chain( ; CHECK-NEXT: [[AND1:%.*]] = and i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[AND2:%.*]] = and i1 [[AND1]], [[C:%.*]] -; CHECK: [[AND2_0:%.*]] = bitcast i1 [[AND2]] to i1 -; CHECK: [[AND2_1:%.*]] = bitcast i1 [[AND2]] to i1 -; CHECK: [[AND1_0:%.*]] = bitcast i1 [[AND1]] to i1 -; CHECK: [[A_0:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[B_0:%.*]] = bitcast i1 [[B]] to i1 -; CHECK: [[C_0:%.*]] = bitcast i1 [[C]] to i1 -; CHECK-NEXT: br i1 [[AND2]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[AND2]] = and i1 [[AND1]], [[C]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[AND2]] } +; CHECK-NEXT: [[AND2_0:%.*]] = bitcast i1 [[AND2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[AND2]] = and i1 [[AND1]], [[C]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[AND2]] } +; CHECK-NEXT: [[AND2_1:%.*]] = bitcast i1 [[AND2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[AND1]] = and i1 [[A]], [[B]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[AND1]] } +; CHECK-NEXT: [[AND1_0:%.*]] = bitcast i1 [[AND1]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison:i1 [[A]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_0:%.*]] = bitcast i1 [[A]] to 
i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison:i1 [[B]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[B]] } +; CHECK-NEXT: [[B_0:%.*]] = bitcast i1 [[B]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison:i1 [[C]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[C]] } +; CHECK-NEXT: [[C_0:%.*]] = bitcast i1 [[C]] to i1 +; CHECK-NEXT: br i1 [[AND2]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A_0]]) ; CHECK-NEXT: call void @foo(i1 [[B_0]]) @@ -438,13 +491,19 @@ define void @test_or_chain(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @test_or_chain( ; CHECK-NEXT: [[OR1:%.*]] = or i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[OR2:%.*]] = or i1 [[OR1]], [[C:%.*]] -; CHECK: [[OR2_0:%.*]] = bitcast i1 [[OR2]] to i1 -; CHECK: [[OR2_1:%.*]] = bitcast i1 [[OR2]] to i1 -; CHECK: [[OR1_0:%.*]] = bitcast i1 [[OR1]] to i1 -; CHECK: [[A_0:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[B_0:%.*]] = bitcast i1 [[B]] to i1 -; CHECK: [[C_0:%.*]] = bitcast i1 [[C]] to i1 -; CHECK-NEXT: br i1 [[OR2]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[OR2]] = or i1 [[OR1]], [[C]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[OR2]] } +; CHECK-NEXT: [[OR2_0:%.*]] = bitcast i1 [[OR2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[OR2]] = or i1 [[OR1]], [[C]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[OR2]] } +; CHECK-NEXT: [[OR2_1:%.*]] = bitcast i1 [[OR2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[OR1]] = or i1 [[A]], [[B]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[OR1]] } +; CHECK-NEXT: [[OR1_0:%.*]] = bitcast i1 [[OR1]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison:i1 [[A]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_0:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison:i1 [[B]] Edge: 
[label [[TMP0]],label [[ELSE]]], RenamedOp: [[B]] } +; CHECK-NEXT: [[B_0:%.*]] = bitcast i1 [[B]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison:i1 [[C]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[C]] } +; CHECK-NEXT: [[C_0:%.*]] = bitcast i1 [[C]] to i1 +; CHECK-NEXT: br i1 [[OR2]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A]]) ; CHECK-NEXT: call void @foo(i1 [[B]]) @@ -485,11 +544,15 @@ define void @test_and_or_mixed(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @test_and_or_mixed( ; CHECK-NEXT: [[OR:%.*]] = or i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[AND:%.*]] = and i1 [[OR]], [[C:%.*]] -; CHECK: [[AND_0:%.*]] = bitcast i1 [[AND]] to i1 -; CHECK: [[AND_1:%.*]] = bitcast i1 [[AND]] to i1 -; CHECK: [[OR_0:%.*]] = bitcast i1 [[OR]] to i1 -; CHECK: [[C_0:%.*]] = bitcast i1 [[C]] to i1 -; CHECK-NEXT: br i1 [[AND]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[AND]] = and i1 [[OR]], [[C]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[AND]] } +; CHECK-NEXT: [[AND_0:%.*]] = bitcast i1 [[AND]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[AND]] = and i1 [[OR]], [[C]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[AND]] } +; CHECK-NEXT: [[AND_1:%.*]] = bitcast i1 [[AND]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[OR]] = or i1 [[A]], [[B]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[OR]] } +; CHECK-NEXT: [[OR_0:%.*]] = bitcast i1 [[OR]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison:i1 [[C]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[C]] } +; CHECK-NEXT: [[C_0:%.*]] = bitcast i1 [[C]] to i1 +; CHECK-NEXT: br i1 [[AND]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A]]) ; CHECK-NEXT: call void @foo(i1 [[B]]) @@ -542,16 +605,25 @@ define void @test_deep_and_chain(i1 %a1) { ; CHECK-NEXT: [[A13:%.*]] = and i1 [[A12]], 
true ; CHECK-NEXT: [[A14:%.*]] = and i1 [[A13]], true ; CHECK-NEXT: [[A15:%.*]] = and i1 [[A14]], true -; CHECK: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 -; CHECK: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 -; CHECK: [[A12_0:%.*]] = bitcast i1 [[A12]] to i1 -; CHECK: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 -; CHECK: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 -; CHECK: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 -; CHECK: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 -; CHECK-NEXT: br i1 [[A15]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A15]] = and i1 [[A14]], true Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A15]] = and i1 [[A14]], true Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A14]] = and i1 [[A13]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A14]] } +; CHECK-NEXT: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A13]] = and i1 [[A12]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A13]] } +; CHECK-NEXT: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A12]] = and i1 [[A11]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A12]] } +; CHECK-NEXT: [[A12_0:%.*]] = bitcast i1 [[A12]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A11]] = and i1 [[A10]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A11]] } +; CHECK-NEXT: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A10]] = and i1 [[A9]], true Edge: 
[label [[TMP0]],label [[IF]]], RenamedOp: [[A10]] } +; CHECK-NEXT: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A9]] = and i1 [[A8]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A9]] } +; CHECK-NEXT: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A8]] = and i1 [[A7]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A8]] } +; CHECK-NEXT: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 +; CHECK-NEXT: br i1 [[A15]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A1]]) ; CHECK-NEXT: call void @foo(i1 [[A2]]) @@ -656,16 +728,25 @@ define void @test_deep_and_tree(i1 %a1) { ; CHECK-NEXT: [[A13:%.*]] = and i1 [[A12]], [[A12]] ; CHECK-NEXT: [[A14:%.*]] = and i1 [[A13]], [[A13]] ; CHECK-NEXT: [[A15:%.*]] = and i1 [[A14]], [[A14]] -; CHECK: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 -; CHECK: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 -; CHECK: [[A12_0:%.*]] = bitcast i1 [[A12]] to i1 -; CHECK: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 -; CHECK: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 -; CHECK: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 -; CHECK: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 -; CHECK-NEXT: br i1 [[A15]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A15]] = and i1 [[A14]], [[A14]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A15]] = and i1 [[A14]], [[A14]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A14]] = and i1 [[A13]], [[A13]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: 
[[A14]] } +; CHECK-NEXT: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A13]] = and i1 [[A12]], [[A12]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A13]] } +; CHECK-NEXT: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A12]] = and i1 [[A11]], [[A11]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A12]] } +; CHECK-NEXT: [[A12_0:%.*]] = bitcast i1 [[A12]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A11]] = and i1 [[A10]], [[A10]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A11]] } +; CHECK-NEXT: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A10]] = and i1 [[A9]], [[A9]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A10]] } +; CHECK-NEXT: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A9]] = and i1 [[A8]], [[A8]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A9]] } +; CHECK-NEXT: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A8]] = and i1 [[A7]], [[A7]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A8]] } +; CHECK-NEXT: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 +; CHECK-NEXT: br i1 [[A15]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A1]]) ; CHECK-NEXT: call void @foo(i1 [[A2]]) @@ -770,16 +851,25 @@ define void @test_deep_or_tree(i1 %a1) { ; CHECK-NEXT: [[A13:%.*]] = or i1 [[A12]], [[A12]] ; CHECK-NEXT: [[A14:%.*]] = or i1 [[A13]], [[A13]] ; CHECK-NEXT: [[A15:%.*]] = or i1 [[A14]], [[A14]] -; CHECK: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 -; CHECK: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 -; CHECK: [[A12_0:%.*]] = bitcast i1 [[A12]] to i1 -; CHECK: [[A11_0:%.*]] = bitcast i1 [[A11]] 
to i1 -; CHECK: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 -; CHECK: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 -; CHECK: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 -; CHECK-NEXT: br i1 [[A15]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A15]] = or i1 [[A14]], [[A14]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A15]] = or i1 [[A14]], [[A14]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A14]] = or i1 [[A13]], [[A13]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A14]] } +; CHECK-NEXT: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A13]] = or i1 [[A12]], [[A12]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A13]] } +; CHECK-NEXT: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A12]] = or i1 [[A11]], [[A11]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A12]] } +; CHECK-NEXT: [[A12_0:%.*]] = bitcast i1 [[A12]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A11]] = or i1 [[A10]], [[A10]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A11]] } +; CHECK-NEXT: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A10]] = or i1 [[A9]], [[A9]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A10]] } +; CHECK-NEXT: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A9]] = or i1 [[A8]], [[A8]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A9]] } +; CHECK-NEXT: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 
Comparison: [[A8]] = or i1 [[A7]], [[A7]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A8]] } +; CHECK-NEXT: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 +; CHECK-NEXT: br i1 [[A15]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A1]]) ; CHECK-NEXT: call void @foo(i1 [[A2]]) @@ -873,11 +963,16 @@ define void @test_assume_and_chain(i1 %a, i1 %b, i1 %c) { ; CHECK-NEXT: [[AND1:%.*]] = and i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[AND2:%.*]] = and i1 [[AND1]], [[C:%.*]] ; CHECK-NEXT: call void @llvm.assume(i1 [[AND2]]) -; CHECK: [[TMP1:%.*]] = bitcast i1 [[C]] to i1 -; CHECK: [[TMP2:%.*]] = bitcast i1 [[B]] to i1 -; CHECK: [[TMP3:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[TMP4:%.*]] = bitcast i1 [[AND1]] to i1 -; CHECK: [[TMP5:%.*]] = bitcast i1 [[AND2]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison:i1 [[C]], RenamedOp: [[C]] } +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[C]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison:i1 [[B]], RenamedOp: [[B]] } +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[B]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison:i1 [[A]], RenamedOp: [[A]] } +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[AND1]] = and i1 [[A]], [[B]], RenamedOp: [[AND1]] } +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i1 [[AND1]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[AND2]] = and i1 [[AND1]], [[C]], RenamedOp: [[AND2]] } +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[AND2]] to i1 ; CHECK-NEXT: call void @foo(i1 [[TMP3]]) ; CHECK-NEXT: call void @foo(i1 [[TMP2]]) ; CHECK-NEXT: call void @foo(i1 [[TMP1]]) @@ -901,7 +996,8 @@ define void @test_assume_or_chain(i1 %a, i1 %b, i1 %c) { ; CHECK-NEXT: [[OR1:%.*]] = or i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[OR2:%.*]] = or i1 [[OR1]], [[C:%.*]] ; CHECK-NEXT: call void @llvm.assume(i1 [[OR2]]) -; CHECK: [[TMP1:%.*]] = bitcast i1 [[OR2]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[OR2]] = 
or i1 [[OR1]], [[C]], RenamedOp: [[OR2]] } +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[OR2]] to i1 ; CHECK-NEXT: call void @foo(i1 [[A]]) ; CHECK-NEXT: call void @foo(i1 [[B]]) ; CHECK-NEXT: call void @foo(i1 [[C]]) @@ -937,14 +1033,22 @@ define void @test_assume_deep_and_tree(i1 %a1) { ; CHECK-NEXT: [[A14:%.*]] = and i1 [[A13]], [[A13]] ; CHECK-NEXT: [[A15:%.*]] = and i1 [[A14]], [[A14]] ; CHECK-NEXT: call void @llvm.assume(i1 [[A15]]) -; CHECK: [[TMP1:%.*]] = bitcast i1 [[A8]] to i1 -; CHECK: [[TMP2:%.*]] = bitcast i1 [[A9]] to i1 -; CHECK: [[TMP3:%.*]] = bitcast i1 [[A10]] to i1 -; CHECK: [[TMP4:%.*]] = bitcast i1 [[A11]] to i1 -; CHECK: [[TMP5:%.*]] = bitcast i1 [[A12]] to i1 -; CHECK: [[TMP6:%.*]] = bitcast i1 [[A13]] to i1 -; CHECK: [[TMP7:%.*]] = bitcast i1 [[A14]] to i1 -; CHECK: [[TMP8:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A8]] = and i1 [[A7]], [[A7]], RenamedOp: [[A8]] } +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[A8]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A9]] = and i1 [[A8]], [[A8]], RenamedOp: [[A9]] } +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[A9]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A10]] = and i1 [[A9]], [[A9]], RenamedOp: [[A10]] } +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i1 [[A10]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A11]] = and i1 [[A10]], [[A10]], RenamedOp: [[A11]] } +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i1 [[A11]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A12]] = and i1 [[A11]], [[A11]], RenamedOp: [[A12]] } +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[A12]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A13]] = and i1 [[A12]], [[A12]], RenamedOp: [[A13]] } +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i1 [[A13]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A14]] = and i1 [[A13]], [[A13]], RenamedOp: [[A14]] } +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i1 [[A14]] to i1 +; CHECK-NEXT: ; assume 
predicate info { Comparison: [[A15]] = and i1 [[A14]], [[A14]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i1 [[A15]] to i1 ; CHECK-NEXT: call void @foo(i1 [[A1]]) ; CHECK-NEXT: call void @foo(i1 [[A2]]) ; CHECK-NEXT: call void @foo(i1 [[A3]]) @@ -1001,13 +1105,15 @@ define i32 @test_and_with_phinode(i32 %x) { ; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1 ; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2 ; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]] -; CHECK: [[X_0_1:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[X_0_2:%.*]] = bitcast i32 [[X_0_1]] to i32 -; CHECK-NEXT: br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XGE1]] = icmp uge i32 [[X]], 1 Edge: [label [[ENTRY:%.*]],label [[PHI:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0_1:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XLT2]] = icmp ult i32 [[X]], 2 Edge: [label [[ENTRY]],label [[PHI]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0_2:%.*]] = bitcast i32 [[X_0_1]] to i32 +; CHECK-NEXT: br i1 [[AND]], label [[PHI]], label [[NOPE:%.*]] ; CHECK: nope: ; CHECK-NEXT: br label [[PHI]] ; CHECK: phi: -; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[X_0_2]], [[ENTRY:%.*]] ], [ 1, [[NOPE]] ] +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[X_0_2]], [[ENTRY]] ], [ 1, [[NOPE]] ] ; CHECK-NEXT: ret i32 [[RES]] ; entry: diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index b6b80ea117672..119303c319246 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -607,7 +607,7 @@ def invoke_tool(exe, cmd_args, ir, preprocess_cmd=None, verbose=False): DEBUG_ONLY_ARG_RE = re.compile(r"-debug-only[= ]([^ ]+)") IS_DEBUG_RECORD_RE = re.compile(r"^(\s+)#dbg_") -IS_SWITCH_CASE_RE = re.compile(r"^\s+i\d+ \d+, label %\w+") +IS_SWITCH_CASE_RE = re.compile(r"^\s+i\d+ \d+, label %\S+") SCRUB_LEADING_WHITESPACE_RE = 
re.compile(r"^(\s+)") SCRUB_WHITESPACE_RE = re.compile(r"(?!^(| \w))[ \t]+", flags=re.M) @@ -1123,6 +1123,8 @@ def processed_prefixes(self, prefixes): ##### Generator of LLVM IR CHECK lines SCRUB_IR_COMMENT_RE = re.compile(r"\s*;.*") +# Comments to indicate the predecessors of a block in the IR. +SCRUB_PRED_COMMENT_RE = re.compile(r"\s*; preds = .*") SCRUB_IR_FUNC_META_RE = re.compile(r"((?:\!(?!dbg\b)[a-zA-Z_]\w*(?:\s+![0-9]+)?)\s*)+") # TODO: We should also derive check lines for global, debug, loop declarations, etc.. @@ -1361,7 +1363,7 @@ def make_ir_generalizer(version, no_meta_details): ] prefix = r"(\s*)" - suffix = r"([,\s\(\)\}]|\Z)" + suffix = r"([,\s\(\)\}\]]|\Z)" # values = [ # nameless_value @@ -1877,6 +1879,7 @@ def generalize_check_lines( *, unstable_globals_only=False, no_meta_details=False, + ignore_all_comments=True, # If False, only ignore comments of predecessors ): if unstable_globals_only: regexp = ginfo.get_unstable_globals_regexp() @@ -1904,8 +1907,12 @@ def escape_braces(match_obj): line, ) break - # Ignore any comments, since the check lines will too. - scrubbed_line = SCRUB_IR_COMMENT_RE.sub(r"", line) + if ignore_all_comments: + # Ignore any comments, since the check lines will too. + scrubbed_line = SCRUB_IR_COMMENT_RE.sub(r"", line) + else: + # Ignore comments of predecessors only. + scrubbed_line = SCRUB_PRED_COMMENT_RE.sub(r"", line) # Ignore the metadata details if check global is none if no_meta_details: scrubbed_line = SCRUB_IR_FUNC_META_RE.sub(r"{{.*}}", scrubbed_line) @@ -2083,6 +2090,7 @@ def add_checks( global_tbaa_records_for_prefixes={}, preserve_names=False, original_check_lines: Mapping[str, List[str]] = {}, + check_inst_comments=True, ): # prefix_exclusions are prefixes we cannot use to print the function because it doesn't exist in run lines that use these prefixes as well. 
prefix_exclusions = set() @@ -2280,6 +2288,8 @@ def add_checks( global_tbaa_records, preserve_names, original_check_lines=original_check_lines.get(checkprefix), + # IR output might require comments checks, e.g., print-predicate-info, print + ignore_all_comments=not check_inst_comments, ) # This could be selectively enabled with an optional invocation argument. @@ -2299,8 +2309,9 @@ def add_checks( if func_line.strip() == "": is_blank_line = True continue - # Do not waste time checking IR comments. - func_line = SCRUB_IR_COMMENT_RE.sub(r"", func_line) + if not check_inst_comments: + # Do not waste time checking IR comments unless necessary. + func_line = SCRUB_IR_COMMENT_RE.sub(r"", func_line) # Skip blank lines instead of checking them. if is_blank_line: @@ -2342,6 +2353,7 @@ def add_ir_checks( global_vars_seen_dict, global_tbaa_records_for_prefixes, is_filtered, + check_inst_comments=False, original_check_lines={}, ): assert ginfo.is_ir() @@ -2368,6 +2380,7 @@ def add_ir_checks( global_tbaa_records_for_prefixes, preserve_names, original_check_lines=original_check_lines, + check_inst_comments=check_inst_comments, ) diff --git a/llvm/utils/update_test_checks.py b/llvm/utils/update_test_checks.py index 42227b20fca76..74e87787fd5b8 100755 --- a/llvm/utils/update_test_checks.py +++ b/llvm/utils/update_test_checks.py @@ -197,6 +197,7 @@ def update_test(ti: common.TestInfo): global_tbaa_records_for_prefixes, is_filtered=builder.is_filtered(), original_check_lines=original_check_lines.get(func, {}), + check_inst_comments=args.check_inst_comments, ), ) ) @@ -230,6 +231,7 @@ def update_test(ti: common.TestInfo): global_tbaa_records_for_prefixes, is_filtered=builder.is_filtered(), original_check_lines=original_check_lines.get(func_name, {}), + check_inst_comments=args.check_inst_comments, ) ) is_in_function_start = False @@ -362,6 +364,12 @@ def main(): choices=["none", "smart", "all"], help="Check global entries (global variables, metadata, attribute sets, ...) 
for functions", ) + parser.add_argument( + "--check-inst-comments", + action="store_true", + default=False, + help="Check the generated comments describing instructions (e.g., -print-predicate-info/print)", + ) parser.add_argument( "--reset-variable-names", action="store_true", From 313fcf3f33bc017bf849ee3a0afaa1e653a51364 Mon Sep 17 00:00:00 2001 From: Giacomo Castiglioni Date: Fri, 31 Oct 2025 14:19:05 +0100 Subject: [PATCH 336/539] [MLIR][NVVM] Extend NVVM mma ops to support fp64 (#165380) This PR extends the `nvvm.mma` ops to support fp64 type. The extension requires special handling of the return type for load ops for fragment `a` and `b` since they return a scalar instead of a struct. --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 16 +++++++-- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 14 ++++++++ mlir/test/Target/LLVMIR/nvvmir-invalid.mlir | 11 ++++++ mlir/test/Target/LLVMIR/nvvmir.mlir | 37 +++++++++++++++++++++ 4 files changed, 76 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index b572ef9c1d07b..ba5e48e4ec9ba 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -1999,6 +1999,9 @@ class MMA_LDST_OPS Geom, list Frags, list Types> { // llvm supports and can be extended as needed. 
class NVVM_MMA_OPS { // "wmma" operations + list> fp64_wmma_ops = MMA_OPS< + [GEOM<8, 8, 4>], + ["f64"], [], ["f64"], []>.ret; list> tf32_wmma_ops = MMA_OPS< [GEOM<16, 16, 8>], ["tf32"], [], ["f32"], []>.ret; @@ -2009,6 +2012,7 @@ class NVVM_MMA_OPS { [GEOM<16, 16, 16>, GEOM<32, 8, 16>, GEOM<8, 32, 16>], ["s8","u8"], [], ["s32"], []>.ret; list> all_wmma_ops = !listconcat( + fp64_wmma_ops, tf32_wmma_ops, fp_wmma_ops, i8_wmma_ops); @@ -2025,9 +2029,17 @@ class NVVM_MMA_OPS { list ldst_tf32_cd_ops = MMA_LDST_OPS< [GEOM<16, 16, 8>], ["c", "d"], ["f32"]>.ret; + list ldst_f64_ab_ops = MMA_LDST_OPS< + [GEOM<8, 8, 4>], + ["a", "b"], ["f64"]>.ret; + list ldst_f64_cd_ops = MMA_LDST_OPS< + [GEOM<8, 8, 4>], + ["c", "d"], ["f64"]>.ret; list all_ldst_ops = !listconcat(ldst_ab_ops, ldst_cd_ops, ldst_tf32_ab_ops, - ldst_tf32_cd_ops); + ldst_tf32_cd_ops, + ldst_f64_ab_ops, + ldst_f64_cd_ops); // Separate A/B/C fragments (loads) from D (stores). list all_ld_ops = !filter(op, all_ldst_ops, !ne(op.frag, "d")); list all_st_ops = !filter(op, all_ldst_ops, !eq(op.frag, "d")); @@ -2334,7 +2346,7 @@ def MMAFragAttr : EnumAttr { } def NVVM_WMMALoadOp: NVVM_Op<"wmma.load">, - Results<(outs LLVM_AnyStruct:$res)>, + Results<(outs AnyTypeOf<[LLVM_AnyStruct, F64]>:$res)>, Arguments<(ins LLVM_AnyPointer: $ptr, I32: $stride, I32Attr:$m, I32Attr:$n, I32Attr:$k, MMALayoutAttr:$layout, MMATypesAttr:$eltype, MMAFragAttr:$frag)> { diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 53a6f43c0bbcf..a5ffb9e77fa9d 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -896,6 +896,12 @@ std::pair NVVM::inferMMAType(NVVM::MMATypes type, } else if (type == NVVM::MMATypes::f32) { elementType = builder.getF32Type(); numberElements = 8; + } else if (type == NVVM::MMATypes::f64) { + elementType = builder.getF64Type(); + if (frag == NVVM::MMAFrag::a || frag == NVVM::MMAFrag::b) + numberElements = 1; + else + 
numberElements = 2; } else if (type == NVVM::MMATypes::tf32) { elementType = builder.getI32Type(); numberElements = 4; @@ -954,6 +960,14 @@ LogicalResult NVVM::WMMALoadOp::verify() { return emitOpError() << "invalid attribute combination"; std::pair typeInfo = inferMMATypeFromMNK( getEltype(), getFrag(), getM(), getN(), getK(), getContext()); + // Special case for f64 fragments + Type f64Ty = Float64Type::get(getContext()); + if (typeInfo.first == f64Ty && typeInfo.second == 1) { + if (getType() != f64Ty) + return emitOpError("expected destination type to be f64"); + return success(); + } + // Everything else is a struct Type dstType = LLVM::LLVMStructType::getLiteral( getContext(), SmallVector(typeInfo.second, typeInfo.first)); if (getType() != dstType) diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir index 09b8f593154b5..42aa2210eae1a 100644 --- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir @@ -621,3 +621,14 @@ func.func @invalid_range_equal_bounds() { %0 = nvvm.read.ptx.sreg.warpsize range : i32 return } + +// ----- + +// Test for correct return type check for wmma.load fragment a for f64 +llvm.func @nvvm_wmma_load_a_f64(%arg0: !llvm.ptr, %arg1 : i32) { + // expected-error @below {{'nvvm.wmma.load' op expected destination type to be f64}} + %0 = nvvm.wmma.load %arg0, %arg1 + {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 4 : i32, layout = #nvvm.mma_layout, m = 8 : i32, n = 8 : i32} + : (!llvm.ptr) -> !llvm.struct<(f64)> + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 594ae4849e3eb..9115de65ff0e8 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -463,6 +463,43 @@ llvm.func @nvvm_wmma_mma(%0 : i32, %1 : i32, %2 : i32, %3 : i32, %4 : i32, %5 : llvm.return } +// CHECK-LABEL: @nvvm_wmma_load_a_f64 +llvm.func @nvvm_wmma_load_a_f64(%arg0: !llvm.ptr, %arg1 : 
i32) { + // CHECK: call double @llvm.nvvm.wmma.m8n8k4.load.a.row.stride.f64.p0(ptr %{{.*}}, i32 %{{.*}}) + %0 = nvvm.wmma.load %arg0, %arg1 + {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 4 : i32, layout = #nvvm.mma_layout, m = 8 : i32, n = 8 : i32} + : (!llvm.ptr) -> f64 + llvm.return +} + +// CHECK-LABEL: @nvvm_wmma_load_c_f64 +llvm.func @nvvm_wmma_load_c_f64(%arg0: !llvm.ptr, %arg1 : i32) { + // CHECK: call { double, double } @llvm.nvvm.wmma.m8n8k4.load.c.row.stride.f64.p0(ptr %{{.*}}, i32 %{{.*}}) + %0 = nvvm.wmma.load %arg0, %arg1 + {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 4 : i32, layout = #nvvm.mma_layout, m = 8 : i32, n = 8 : i32} + : (!llvm.ptr) -> !llvm.struct<(f64, f64)> + llvm.return +} + +// CHECK-LABEL: @nvvm_wmma_mma_f64 +llvm.func @nvvm_wmma_mma_f64(%0 : f64, %1 : f64, %2 : f64, %3 : f64) { + // CHECK: { double, double } @llvm.nvvm.wmma.m8n8k4.mma.row.col.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}) + %r = nvvm.wmma.mma %0, %1, %2, %3 + {eltypeA = #nvvm.mma_type, eltypeB = #nvvm.mma_type, k = 4 : i32, layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, m = 8 : i32, n = 8 : i32} + : (f64, f64, f64, f64) + -> !llvm.struct<(f64, f64)> + llvm.return +} + +// CHECK-LABEL: @nvvm_wmma_store_d_f64 +llvm.func @nvvm_wmma_store_d_f64(%arg0: !llvm.ptr, %arg1 : i32, %arg2 : f64, %arg3 : f64) { + // CHECK: call void @llvm.nvvm.wmma.m8n8k4.store.d.row.stride.f64.p0(ptr %{{.*}}, double %{{.*}}, double %{{.*}}, i32 %{{.*}}) + nvvm.wmma.store %arg0, %arg1, %arg2, %arg3 + {eltype = #nvvm.mma_type, k = 4 : i32, layout = #nvvm.mma_layout, m = 8 : i32, n = 8 : i32} + : !llvm.ptr, f64, f64 + llvm.return +} + // CHECK-LABEL: @cp_async llvm.func @cp_async(%arg0: !llvm.ptr<3>, %arg1: !llvm.ptr<1>) { // CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}}) From ccaa2b3bb54522d3203ff65fe76c695d54bbda18 Mon Sep 17 00:00:00 2001 From: Rajat Bajpai Date: Fri, 31 Oct 2025 18:54:43 
+0530 Subject: [PATCH 337/539] [NVPTX] Move TMA G2S lowering to Tablegen (#165710) This change refactors G2S TMA implementation to use pure TableGen based expansion instead of verbose ISel DAG expansion. In addition, it adds proper arch predicates for TMA G2S. All the test cases are validated locally with CUDA 13.0 toolkit. --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 129 ------------------ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 1 - llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 1 - llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 105 +++++--------- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 21 ++- .../NVPTX/cp-async-bulk-tensor-g2s-1cta.ll | 4 + .../NVPTX/cp-async-bulk-tensor-g2s-2cta.ll | 4 + .../NVPTX/cp-async-bulk-tensor-g2s-gather4.ll | 4 + .../NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll | 4 + .../cp-async-bulk-tensor-g2s-im2colw128.ll | 4 + .../CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll | 68 ++++----- 11 files changed, 96 insertions(+), 249 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 7e7ee754c250d..c667a09f95dbb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1871,17 +1871,6 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; } (is_ch ? 
(CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \ : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, ))) -#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \ - [&]() -> auto { \ - if (is_mc && is_ch) \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH); \ - if (is_ch) \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH); \ - if (is_mc) \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC); \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \ - }() - static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim, bool IsShared32, bool IsCacheHint, @@ -1925,112 +1914,6 @@ static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim, } } -static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32, - bool IsMultiCast, - bool IsCacheHint, bool IsIm2Col) { - if (IsIm2Col) { - switch (Dim) { - case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast, - IsCacheHint, IsShared32); - case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast, - IsCacheHint, IsShared32); - case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast, - IsCacheHint, IsShared32); - default: - llvm_unreachable("Invalid Dimension in im2col mode for " - "GetCpAsyncBulkTensorG2SOpcode."); - } - } else { - switch (Dim) { - case 1: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - case 2: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - default: - llvm_unreachable( - "Invalid Dimension in tile mode for 
GetCpAsyncBulkTensorG2SOpcode."); - } - } -} - -static size_t GetDimsFromIntrinsic(unsigned IID) { - switch (IID) { - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d: - return 3; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d: - return 4; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d: - return 5; - default: - llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic."); - } -} - -void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N, - bool IsIm2Col) { - // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: - // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2} - // multicast, cache_hint, - // multicast_flag, cache_hint_flag, cta_group_flag} - // NumOperands = {Chain, IID} + {Actual intrinsic args} - // = {2} + {8 + dims + im2col_offsets} - size_t NumOps = N->getNumOperands(); - size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1)) - : (NumOps - 10); - // Offsets is always 'NumDims - 2' and only for im2col mode - size_t NumOffsets = IsIm2Col ? 
(NumDims - 2) : 0; - bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1; - bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1; - size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src} - size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID - - unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1); - if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()) - report_fatal_error( - formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}", - Subtarget->getSmVersion())); - - SDLoc DL(N); - SmallVector Ops(N->ops().slice(2, NumBaseArgs)); - - // Push MultiCast operand, if available - if (IsMultiCast) - Ops.push_back(N->getOperand(MultiCastIdx)); - - // Push CacheHint operand, if available - if (IsCacheHint) - Ops.push_back(N->getOperand(MultiCastIdx + 1)); - - // Flag for CTA Group - Ops.push_back(getI32Imm(CTAGroupVal, DL)); - - // Finally, the chain operand - Ops.push_back(N->getOperand(0)); - - bool IsShared32 = - CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32; - unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode( - NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col); - ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); -} - void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp, bool IsIm2Col) { @@ -2175,18 +2058,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { switch (IID) { default: return false; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: - SelectCpAsyncBulkTensorG2SCommon(N); - return true; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d: - case 
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d: - SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true); - return true; case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d: case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d: case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d: diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index c912e709d0aa0..1cb579bd96730 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -86,7 +86,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N); void SelectV2I64toI128(SDNode *N); void SelectI128toV2I64(SDNode *N); - void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false); void SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp, bool IsIm2Col = false); void SelectTcgen05Ld(SDNode *N, bool hasOffset = false); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index dfde0cca0f00c..b26022184708c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -139,7 +139,6 @@ def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">; def hasTcgen05MMAScaleInputDImm : Predicate<"Subtarget->hasTcgen05MMAScaleInputDImm()">; -def hasTMACTAGroupSupport : Predicate<"Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()">; def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">; class hasPTX: Predicate<"Subtarget->getPTXVersion() >= " # version>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index c923f0ec907e7..e8758aa55d24e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ 
-599,75 +599,15 @@ class TMA_IM2COL_UTIL { string base_str = !interleave(!foreach(i, !range(offsets), "$im2col" # i), ", "); } -// From Global to Shared memory (G2S) -class G2S_STRINGS { - string prefix = "cp.async.bulk.tensor"; - string dir = "shared::cluster.global"; - string completion = "mbarrier::complete_tx::bytes"; - string inst_name = prefix - # "." # dim # "d" - # "." # dir - # "." # mode - # "." # completion - # !if(mc, ".multicast::cluster", "") - # !if(ch, ".L2::cache_hint", ""); - string intr_name = "CP_ASYNC_BULK_TENSOR_G2S_" - # dim # "D" - # !if(is_shared32, "_SHARED32", "") - # !if(!eq(mode, "tile"), "_TILE", "_IM2COL"); -} - def CTAGroupFlags : Operand { let PrintMethod = "printCTAGroup"; } -multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR { - defvar dims_dag = TMA_DIMS_UTIL.ins_dag; - defvar dims_str = TMA_DIMS_UTIL.base_str; - defvar asm_str_default = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; - defvar rc = !if(is_shared32, B32, B64); - - defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0); - defvar im2col_dag = !if(!eq(mode, "im2col"), - !dag(ins, !listsplat(B16, num_im2col), !foreach(i, !range(num_im2col), "im2col" # i)), - (ins)); - defvar im2col_str = !interleave(!foreach(i, !range(num_im2col), "$im2col" # i), ", "); - defvar im2col_asm_str = ", {{" # im2col_str # "}}"; - - defvar asm_str = !if(!eq(mode, "im2col"), - !strconcat(asm_str_default, im2col_asm_str), asm_str_default); +def tma_cta_group_imm0 : TImmLeaf; +def tma_cta_group_imm_any : TImmLeaf= 0;}]>; - def "" : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS.inst_name, asm_str, ";")>, - Requires<[hasPTX<80>, hasSM<90>]>; - def _MC : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, - (ins B16:$mc, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS.inst_name, asm_str, ", $mc;")>, - Requires<[hasPTX<80>, hasSM<90>]>; - def _CH : NVPTXInst<(outs), - !con((ins rc:$dst, 
rc:$mbar, B64:$tmap), dims_dag, im2col_dag, - (ins B64:$ch, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS.inst_name, asm_str, ", $ch;")>, - Requires<[hasPTX<80>, hasSM<90>]>; - def _MC_CH : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, - (ins B16:$mc, B64:$ch, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS.inst_name, asm_str, ", $mc, $ch;")>, - Requires<[hasPTX<80>, hasSM<90>]>; -} - -foreach dim = [1, 2, 3, 4, 5] in { - foreach shared32 = [true, false] in { - foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in { - defm G2S_STRINGS.intr_name : - CP_ASYNC_BULK_TENSOR_G2S_INTR; - } - } -} - -multiclass TMA_TENSOR_G2S_INTR pred = []> { +multiclass TMA_TENSOR_G2S_INTR pred, + TImmLeaf cta_group_type = tma_cta_group_imm_any> { defvar dims_dag = TMA_DIMS_UTIL.ins_dag; defvar dims_str = TMA_DIMS_UTIL.base_str; defvar asm_str_base = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; @@ -697,10 +637,10 @@ multiclass TMA_TENSOR_G2S_INTR pred = []> !setdagop(dims_dag, intr), !setdagop(im2col_dag, intr), (intr B16:$mc, B64:$ch)); - defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, timm:$cg)); - defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, timm:$cg)); - defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, timm:$cg)); - defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, timm:$cg)); + defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, cta_group_type:$cg)); + defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, cta_group_type:$cg)); + defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, cta_group_type:$cg)); + defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, cta_group_type:$cg)); def "" : NVPTXInst<(outs), ins_dag, inst_name # asm_str # ";", @@ -719,14 +659,30 @@ multiclass TMA_TENSOR_G2S_INTR pred = []> [intr_dag_with_mc_ch]>, Requires; } + +foreach dim = 1...5 in { + defm TMA_G2S_TILE_CG0_ # dim # "D" + : TMA_TENSOR_G2S_INTR, hasSM<90>], + 
tma_cta_group_imm0>; + defm TMA_G2S_TILE_ # dim # "D" + : TMA_TENSOR_G2S_INTR]>; +} foreach dim = 3...5 in { + defm TMA_G2S_IM2COL_CG0_ # dim # "D" + : TMA_TENSOR_G2S_INTR, hasSM<90>], + tma_cta_group_imm0>; + defm TMA_G2S_IM2COL_ # dim # "D" + : TMA_TENSOR_G2S_INTR]>; foreach mode = ["im2col_w", "im2col_w_128"] in { defm TMA_G2S_ # !toupper(mode) # "_" # dim # "D" - : TMA_TENSOR_G2S_INTR; + : TMA_TENSOR_G2S_INTR]>; } } defm TMA_G2S_TILE_GATHER4_2D : TMA_TENSOR_G2S_INTR<5, "tile_gather4", - [hasTMACTAGroupSupport]>; + [callSubtarget<"hasTMABlackwellSupport">]>; multiclass TMA_TENSOR_G2S_CTA_INTR pred = []> { defvar dims_dag = TMA_DIMS_UTIL.ins_dag; @@ -784,7 +740,8 @@ foreach dim = 3...5 in { : TMA_TENSOR_G2S_CTA_INTR, hasSM<100>]>; defm TMA_G2S_CTA_IM2COL_W_128_ # dim # "D" - : TMA_TENSOR_G2S_CTA_INTR; + : TMA_TENSOR_G2S_CTA_INTR]>; } defm TMA_G2S_CTA_TILE_GATHER4_2D : TMA_TENSOR_G2S_CTA_INTR<5, "tile_gather4", [hasPTX<86>, hasSM<100>]>; @@ -835,7 +792,7 @@ foreach dim = 1...5 in { } } defm TMA_S2G_TILE_SCATTER4_2D : TMA_TENSOR_S2G_INTR<5, "tile_scatter4", - [hasTMACTAGroupSupport]>; + [callSubtarget<"hasTMABlackwellSupport">]>; def TMAReductionFlags : Operand { let PrintMethod = "printTmaReductionMode"; @@ -930,11 +887,11 @@ foreach dim = 3...5 in { foreach mode = ["im2col_w", "im2col_w_128"] in { defvar suffix = !toupper(mode) # "_" # dim # "D"; defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR; + [callSubtarget<"hasTMABlackwellSupport">]>; } } defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4", - [hasTMACTAGroupSupport]>; + [callSubtarget<"hasTMABlackwellSupport">]>; //Prefetchu and Prefetch diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 194dbdc061a96..021b1f6d0bf57 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -166,18 +166,15 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { // f32x2 instructions in Blackwell 
family bool hasF32x2Instructions() const; - // TMA G2S copy with cta_group::1/2 support - bool hasCpAsyncBulkTensorCTAGroupSupport() const { - // TODO: Update/tidy-up after the family-conditional support arrives - switch (FullSmVersion) { - case 1003: - case 1013: - return PTXVersion >= 86; - case 1033: - return PTXVersion >= 88; - default: - return false; - } + // Checks support for following in TMA: + // - cta_group::1/2 support + // - im2col_w/w_128 mode support + // - tile_gather4 mode support + // - tile_scatter4 mode support + bool hasTMABlackwellSupport() const { + return hasPTXWithFamilySMs(90, {100, 110}) || + hasPTXWithFamilySMs(88, {100, 101}) || + hasPTXWithAccelSMs(86, {100, 101}); } // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll index b5c43fd259a75..d653895efa340 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 
-mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll index 57342dc9a49c5..5de1ac887b76c 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll index 6296d5af8ab18..2f5c1ef4670da 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll @@ -1,8 +1,12 
@@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll index e5ae3875a0ede..a2b2c2f27fa5e 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 
%s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll index 7d04adaa774c3..e4c48ddddea18 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify 
-arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll index b0fe77c1a83be..727bb3b3aa8fd 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} ; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" @@ -29,10 +33,10 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, 
[cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -48,10 +52,10 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -79,10 +83,10 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -99,10 +103,10 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, 
[cp_async_bulk_tensor_g2s_tile_2d_param_3]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -131,10 +135,10 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -152,10 +156,10 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, 
{%r3, %r4, %r5}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -185,10 +189,10 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_6]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -207,10 +211,10 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; ; 
CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_4d_param_6]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -241,10 +245,10 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; ; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -264,10 +268,10 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -297,10 +301,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -319,10 +323,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa ; 
CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -354,10 +358,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; 
CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -378,10 +382,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; ; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -415,10 +419,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; +; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, 
{%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -441,10 +445,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; From 
63ea712c86f595155871e86ddae3934f55a6c68d Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Fri, 31 Oct 2025 13:27:09 +0000 Subject: [PATCH 338/539] [lldb][TypeSystem] Better support for _BitInt types (#165689) Depends on: * https://github.com/llvm/llvm-project/pull/165686 This patch ensures we make use of the `DW_AT_bit_size` on `DW_TAG_base_type`s (which since https://github.com/llvm/llvm-project/pull/164372 can exist on `_BitInt`s) and adjusts `TypeSystemClang` to recognize `_BitInt`. For DWARF from older versions of Clang that didn't emit a `DW_AT_bit_size`, we would create `_BitInt`s using the byte-size. Not sure we can do much better than that. But the situation beforehand wasn't much better. Before: ``` (lldb) v (char) a = '\x01' (unsigned char) b = '\x01' (long) c = 2 (unsigned long) d = 2 ``` After: ``` (lldb) v (_BitInt(2)) a = 1 (unsigned _BitInt(2)) b = 1 (_BitInt(52)) c = 2 (unsigned _BitInt(52)) d = 2 ``` Fixes https://github.com/llvm/llvm-project/issues/110273 --- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 13 +- .../TypeSystem/Clang/TypeSystemClang.cpp | 11 + lldb/unittests/Symbol/TestTypeSystemClang.cpp | 91 ++++++++ .../DWARF/DWARFASTParserClangTests.cpp | 212 ++++++++++++++++++ 4 files changed, 323 insertions(+), 4 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 47fa27b0a81a7..63b2dc4ab82b0 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -814,13 +814,18 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc, // there... [[fallthrough]]; - case DW_TAG_base_type: + case DW_TAG_base_type: { resolve_state = Type::ResolveState::Full; + // If a builtin type's size isn't a multiple of a byte, DWARF producers may + // add a precise bit-size to the type. Use the most precise bit-size + // possible. 
+ const uint64_t bit_size = attrs.data_bit_size + ? *attrs.data_bit_size + : attrs.byte_size.value_or(0) * 8; clang_type = m_ast.GetBuiltinTypeForDWARFEncodingAndBitSize( - attrs.name.GetStringRef(), attrs.encoding, - attrs.byte_size.value_or(0) * 8); + attrs.name.GetStringRef(), attrs.encoding, bit_size); break; - + } case DW_TAG_pointer_type: encoding_data_type = Type::eEncodingIsPointerUID; break; diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 67186542fb705..51cb883748514 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -1000,6 +1000,8 @@ CompilerType TypeSystemClang::GetBuiltinTypeForDWARFEncodingAndBitSize( case DW_ATE_signed: if (!type_name.empty()) { + if (type_name.starts_with("_BitInt")) + return GetType(ast.getBitIntType(/*Unsigned=*/false, bit_size)); if (type_name == "wchar_t" && QualTypeMatchesBitSize(bit_size, ast, ast.WCharTy) && (getTargetInfo() && @@ -1056,6 +1058,8 @@ CompilerType TypeSystemClang::GetBuiltinTypeForDWARFEncodingAndBitSize( case DW_ATE_unsigned: if (!type_name.empty()) { + if (type_name.starts_with("unsigned _BitInt")) + return GetType(ast.getBitIntType(/*Unsigned=*/true, bit_size)); if (type_name == "wchar_t") { if (QualTypeMatchesBitSize(bit_size, ast, ast.WCharTy)) { if (!(getTargetInfo() && @@ -3889,6 +3893,13 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type, ->getModifiedType() .getAsOpaquePtr(), pointee_or_element_clang_type); + case clang::Type::BitInt: { + uint32_t type_flags = eTypeIsScalar | eTypeIsInteger | eTypeHasValue; + if (qual_type->isSignedIntegerType()) + type_flags |= eTypeIsSigned; + + return type_flags; + } case clang::Type::Builtin: { const clang::BuiltinType *builtin_type = llvm::cast(qual_type->getCanonicalTypeInternal()); diff --git a/lldb/unittests/Symbol/TestTypeSystemClang.cpp 
b/lldb/unittests/Symbol/TestTypeSystemClang.cpp index 4de595fd62825..155fc743934c2 100644 --- a/lldb/unittests/Symbol/TestTypeSystemClang.cpp +++ b/lldb/unittests/Symbol/TestTypeSystemClang.cpp @@ -52,6 +52,12 @@ class TestTypeSystemClang : public testing::Test { return ClangUtil::GetQualType( m_ast->GetBuiltinTypeByName(ConstString(name))); } + + CompilerType GetBuiltinTypeForDWARFEncodingAndBitSize( + llvm::StringRef type_name, uint32_t encoding, uint32_t bit_size) const { + return m_ast->GetBuiltinTypeForDWARFEncodingAndBitSize(type_name, encoding, + bit_size); + } }; TEST_F(TestTypeSystemClang, TestGetBasicTypeFromEnum) { @@ -238,6 +244,91 @@ TEST_F(TestTypeSystemClang, TestBuiltinTypeForEncodingAndBitSize) { VerifyEncodingAndBitSize(*m_ast, eEncodingIEEE754, 64); } +TEST_F(TestTypeSystemClang, TestGetBuiltinTypeForDWARFEncodingAndBitSize) { + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitIn", llvm::dwarf::DW_ATE_signed, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "BitInt", llvm::dwarf::DW_ATE_signed, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt(2)", llvm::dwarf::DW_ATE_signed_char, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt", llvm::dwarf::DW_ATE_signed_char, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt(2)", llvm::dwarf::DW_ATE_unsigned, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt", llvm::dwarf::DW_ATE_unsigned, 2) + .IsValid()); + + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt(2)", llvm::dwarf::DW_ATE_signed, 2) + .GetTypeName(), + "_BitInt(2)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt", llvm::dwarf::DW_ATE_signed, 2) + .GetTypeName(), + "_BitInt(2)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt(129)", llvm::dwarf::DW_ATE_signed, 129) + .GetTypeName(), + "_BitInt(129)"); + 
EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt", llvm::dwarf::DW_ATE_signed, 129) + .GetTypeName(), + "_BitInt(129)"); + + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitIn", llvm::dwarf::DW_ATE_unsigned, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned BitInt", llvm::dwarf::DW_ATE_unsigned, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt(2)", llvm::dwarf::DW_ATE_unsigned_char, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt", llvm::dwarf::DW_ATE_unsigned_char, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt(2)", llvm::dwarf::DW_ATE_signed, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt", llvm::dwarf::DW_ATE_signed, 2) + .IsValid()); + + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt(2)", llvm::dwarf::DW_ATE_unsigned, 2) + .GetTypeName(), + "unsigned _BitInt(2)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt", llvm::dwarf::DW_ATE_unsigned, 2) + .GetTypeName(), + "unsigned _BitInt(2)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt(129)", llvm::dwarf::DW_ATE_unsigned, 129) + .GetTypeName(), + "unsigned _BitInt(129)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt", llvm::dwarf::DW_ATE_unsigned, 129) + .GetTypeName(), + "unsigned _BitInt(129)"); +} + +TEST_F(TestTypeSystemClang, TestBitIntTypeInfo) { + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt", llvm::dwarf::DW_ATE_signed, 2) + .GetTypeInfo(), + eTypeIsSigned | eTypeIsScalar | eTypeHasValue | eTypeIsInteger); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt", llvm::dwarf::DW_ATE_unsigned, 2) + .GetTypeInfo(), + eTypeIsScalar | eTypeHasValue | eTypeIsInteger); +} + TEST_F(TestTypeSystemClang, 
TestBuiltinTypeForEmptyTriple) { // Test that we can access type-info of builtin Clang AST // types without crashing even when the target triple is diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp index 064ed6d1d3e58..cef3a25a4a960 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp @@ -1741,3 +1741,215 @@ TEST_F(DWARFASTParserClangTests, TestTypeBitSize) { EXPECT_EQ(llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), 1U); } + +TEST_F(DWARFASTParserClangTests, TestBitIntParsing) { + // Tests that we correctly parse the DW_AT_base_type for a _BitInt. + // Older versions of Clang only emit the `_BitInt` string into the + // DW_AT_name (not including the bitsize). Make sure we understand + // those too. + + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - _BitInt(2) + - _BitInt + - unsigned _BitInt(2) + - unsigned _BitInt + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Attribute: DW_AT_bit_size + Form: DW_FORM_data1 + - Code: 0x3 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + 
Values: + - Value: 0x04 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ('_BitInt(2)') + + - AbbrCode: 0x2 + Values: + - Value: 0x0 + - Value: 0x05 + - Value: 0x01 + - Value: 0x02 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ('_BitInt') + + - AbbrCode: 0x2 + Values: + - Value: 0x0b + - Value: 0x05 + - Value: 0x08 + - Value: 0x34 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ('unsigned _BitInt(2)') + + - AbbrCode: 0x2 + Values: + - Value: 0x13 + - Value: 0x07 + - Value: 0x01 + - Value: 0x02 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ('unsigned _BitInt') + + - AbbrCode: 0x2 + Values: + - Value: 0x27 + - Value: 0x07 + - Value: 0x08 + - Value: 0x34 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ('_BitInt') + + - AbbrCode: 0x3 + Values: + - Value: 0x0b + - Value: 0x05 + - Value: 0x08 +... + +)"; + + YAMLModuleTester t(yamldata); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto holder = std::make_unique("ast"); + auto &ast_ctx = *holder->GetAST(); + DWARFASTParserClangStub ast_parser(ast_ctx); + + auto type_die = cu_die.GetFirstChild(); + ASSERT_TRUE(type_die.IsValid()); + + { + SymbolContext sc; + auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die, + /*type_is_new_ptr=*/nullptr); + ASSERT_NE(type_sp, nullptr); + + EXPECT_EQ( + llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), + 1U); + EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingSint); + EXPECT_EQ(type_sp->GetName(), "_BitInt(2)"); + EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), "_BitInt(2)"); + } + + { + type_die = type_die.GetSibling(); + SymbolContext sc; + auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die, + /*type_is_new_ptr=*/nullptr); + ASSERT_NE(type_sp, nullptr); + + EXPECT_EQ( + 
llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), + 8U); + EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingSint); + EXPECT_EQ(type_sp->GetName(), "_BitInt"); + EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), "_BitInt(52)"); + } + + { + type_die = type_die.GetSibling(); + SymbolContext sc; + auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die, + /*type_is_new_ptr=*/nullptr); + ASSERT_NE(type_sp, nullptr); + + EXPECT_EQ( + llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), + 1U); + EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingUint); + EXPECT_EQ(type_sp->GetName(), "unsigned _BitInt(2)"); + EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), + "unsigned _BitInt(2)"); + } + + { + type_die = type_die.GetSibling(); + SymbolContext sc; + auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die, + /*type_is_new_ptr=*/nullptr); + ASSERT_NE(type_sp, nullptr); + + EXPECT_EQ( + llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), + 8U); + EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingUint); + EXPECT_EQ(type_sp->GetName(), "unsigned _BitInt"); + EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), + "unsigned _BitInt(52)"); + } + + { + type_die = type_die.GetSibling(); + SymbolContext sc; + auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die, + /*type_is_new_ptr=*/nullptr); + ASSERT_NE(type_sp, nullptr); + + EXPECT_EQ( + llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), + 8U); + EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingSint); + EXPECT_EQ(type_sp->GetName(), "_BitInt"); + + // Older versions of Clang didn't emit a DW_AT_bit_size for _BitInt. In + // those cases we would format the CompilerType name using the byte-size. 
+ EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), "_BitInt(64)"); + } +} From c9da2b258005952ecfaddd33a8ef3eb28de2cc62 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Fri, 31 Oct 2025 06:48:02 -0700 Subject: [PATCH 339/539] [MemProf] Select largest of matching contexts from profile (#165338) We aren't currently deduplicating contexts that are identical or nearly identical (differing inline frame information) when generating the profile. When we have multiple identical contexts we end up conservatively marking it as non-cold, even if some are much smaller in terms of bytes allocated. This was causing us to lose sight of a very large cold context, because we had a small non-cold one that only differed in the inlining (which we don't consider when matching as the inlining could change or be incomplete at that point in compilation). Likely the smaller one was from binary with much smaller usage and therefore not yet detected as cold. Deduplicate the alloc contexts for a function before applying the profile, selecting the largest one, or conservatively selecting the non-cold one if they are the same size. This caused a minor difference to an existing test (memprof_loop_unroll.ll), which now only gets one message for the duplicate context instead of 2. While here, convert to the text version of the profile. 
--- .../Transforms/Instrumentation/MemProfUse.cpp | 35 ++++-- .../PGOProfile/memprof_diff_inline.ll | 118 ++++++++++++++++++ .../PGOProfile/memprof_loop_unroll.ll | 46 +++++-- 3 files changed, 182 insertions(+), 17 deletions(-) create mode 100644 llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp index 2f256dfd7b0e2..b72d41a748857 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp @@ -127,15 +127,19 @@ static uint64_t computeStackId(const memprof::Frame &Frame) { return computeStackId(Frame.Function, Frame.LineOffset, Frame.Column); } +static AllocationType getAllocType(const AllocationInfo *AllocInfo) { + return getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(), + AllocInfo->Info.getAllocCount(), + AllocInfo->Info.getTotalLifetime()); +} + static AllocationType addCallStack(CallStackTrie &AllocTrie, const AllocationInfo *AllocInfo, uint64_t FullStackId) { SmallVector StackIds; for (const auto &StackFrame : AllocInfo->CallStack) StackIds.push_back(computeStackId(StackFrame)); - auto AllocType = getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(), - AllocInfo->Info.getAllocCount(), - AllocInfo->Info.getTotalLifetime()); + auto AllocType = getAllocType(AllocInfo); std::vector ContextSizeInfo; if (recordContextSizeInfoForAnalysis()) { auto TotalSize = AllocInfo->Info.getTotalSize(); @@ -405,22 +409,39 @@ handleAllocSite(Instruction &I, CallBase *CI, const std::set &AllocInfoSet, std::map, AllocMatchInfo> &FullStackIdToAllocMatchInfo) { + // TODO: Remove this once the profile creation logic deduplicates contexts + // that are the same other than the IsInlineFrame bool. Until then, keep the + // largest. 
+ DenseMap UniqueFullContextIdAllocInfo; + for (auto *AllocInfo : AllocInfoSet) { + auto FullStackId = computeFullStackId(AllocInfo->CallStack); + auto [It, Inserted] = + UniqueFullContextIdAllocInfo.insert({FullStackId, AllocInfo}); + // If inserted entry, done. + if (Inserted) + continue; + // Keep the larger one, or the noncold one if they are the same size. + auto CurSize = It->second->Info.getTotalSize(); + auto NewSize = AllocInfo->Info.getTotalSize(); + if ((CurSize > NewSize) || + (CurSize == NewSize && + getAllocType(AllocInfo) != AllocationType::NotCold)) + continue; + It->second = AllocInfo; + } // We may match this instruction's location list to multiple MIB // contexts. Add them to a Trie specialized for trimming the contexts to // the minimal needed to disambiguate contexts with unique behavior. CallStackTrie AllocTrie(&ORE, MaxColdSize); uint64_t TotalSize = 0; uint64_t TotalColdSize = 0; - for (auto *AllocInfo : AllocInfoSet) { + for (auto &[FullStackId, AllocInfo] : UniqueFullContextIdAllocInfo) { // Check the full inlined call stack against this one. // If we found and thus matched all frames on the call, include // this MIB. if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack, InlinedCallStack)) { NumOfMemProfMatchedAllocContexts++; - uint64_t FullStackId = 0; - if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis()) - FullStackId = computeFullStackId(AllocInfo->CallStack); auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId); TotalSize += AllocInfo->Info.getTotalSize(); if (AllocType == AllocationType::Cold) diff --git a/llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll b/llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll new file mode 100644 index 0000000000000..5213a07d13d39 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll @@ -0,0 +1,118 @@ +;; Tests that the compiler ignores smaller contexts that differ only in the +;; IsInlineFrame bool. 
These map to the same full context id internally, as we +;; ignore the inline frame status which may differ in feedback compiles. +;; Presumably this happens when profiles collected from different binaries are +;; merged. If we didn't pick the largest we would default them all to noncold. + +;; Avoid failures on big-endian systems that can't read the profile properly +; REQUIRES: x86_64-linux + +;; Generate the profile and the IR. +; RUN: split-file %s %t + +;; Generate indexed profile +; RUN: llvm-profdata merge %t/memprof_diff_inline.yaml -o %t.memprofdata + +; RUN: opt < %t/memprof_diff_inline.ll -passes='memprof-use' -S -memprof-report-hinted-sizes -memprof-print-match-info 2>&1 | FileCheck %s --check-prefixes=MEMPROF + +; MEMPROF: MemProf notcold context with id 10194276560488437434 has total profiled size 200 is matched with 1 frames +; MEMPROF: MemProf cold context with id 16342802530253093571 has total profiled size 10000 is matched with 1 frames + +;--- memprof_diff_inline.yaml +--- +HeapProfileRecords: + - GUID: _Z3foov + AllocSites: + # Small non-cold, full context id 16342802530253093571, should ignore + - Callstack: + - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 10 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + # Large cold, full context id 16342802530253093571, should keep + - Callstack: + - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: true } + - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 10000 + TotalLifetime: 
200000 + TotalLifetimeAccessDensity: 0 + # Small non-cold, full context id 16342802530253093571, should ignore + - Callstack: + - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: true } + - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 100 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + # Small non-cold, full context id 10194276560488437434 + - Callstack: + - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: main, LineOffset: 9, Column: 13, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 200 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + CallSites: [] +... 
+;--- memprof_diff_inline.ll +; ModuleID = 'memprof_diff_inline.cc' +source_filename = "memprof_diff_inline.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%"struct.std::nothrow_t" = type { i8 } + +@_ZSt7nothrow = external global %"struct.std::nothrow_t", align 1 + +define dso_local noundef ptr @_Z3foov() !dbg !10 { +entry: + ; MEMPROF: call {{.*}} @_Znwm{{.*}} !memprof ![[M1:[0-9]+]], !callsite ![[C1:[0-9]+]] + %call = call noalias noundef align 32 ptr @_Znwm(i64 noundef 32) #6, !dbg !13 + ret ptr %call +} + +declare noundef ptr @_Znwm(i64 noundef) + +attributes #6 = { builtin allocsize(0) } + +; MEMPROF: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]]} + +; MEMPROF: ![[MIB1]] = !{![[STACK1:[0-9]+]], !"notcold", ![[CONTEXTSIZE1:[0-9]+]]} +; MEMPROF: ![[STACK1]] = !{i64 2732490490862098848, i64 8467819354083268568, i64 9086428284934609951, i64 2061451396820446691} +;; Full context id 10194276560488437434 == -8252467513221114182 +; MEMPROF: ![[CONTEXTSIZE1]] = !{i64 -8252467513221114182, i64 200} + +; MEMPROF: ![[MIB2]] = !{![[STACK2:[0-9]+]], !"cold", ![[CONTEXTSIZE2:[0-9]+]]} +; MEMPROF: ![[STACK2]] = !{i64 2732490490862098848, i64 8467819354083268568, i64 9086428284934609951, i64 -5747251260480066785} +;; Full context id 16342802530253093571 == -2103941543456458045 +;; We should have kept the large (cold) one. 
+; MEMPROF: ![[CONTEXTSIZE2]] = !{i64 -2103941543456458045, i64 10000} + +; MEMPROF: ![[C1]] = !{i64 2732490490862098848} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git 6cbe6284d1f0a088b5c6482ae27b738f03d82fe7)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "memprof.cc", directory: "/usr/local/google/home/tejohnson/llvm/tmp", checksumkind: CSK_MD5, checksum: "e8c40ebe4b21776b4d60e9632cbc13c2") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!10 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !11, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12) +!11 = !DISubroutineType(types: !12) +!12 = !{} +!13 = !DILocation(line: 5, column: 10, scope: !10) diff --git a/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll b/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll index 2461ca32e9821..ba53c5797208c 100644 --- a/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll +++ b/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll @@ -4,24 +4,50 @@ ;; Avoid failures on big-endian systems that can't read the profile properly ; REQUIRES: x86_64-linux -;; TODO: Use text profile inputs once that is available for memprof. -;; # To update the Inputs below, run Inputs/update_memprof_inputs.sh. -;; # To generate below LLVM IR for use in matching. -;; $ clang++ -gmlt -fdebug-info-for-profiling -S %S/Inputs/memprof_loop_unroll_b.cc -emit-llvm +; Generate the profile and the IR. 
+; RUN: split-file %s %t + +;; Generate indexed profile +; RUN: llvm-profdata merge %t/memprof_loop_unroll.yaml -o %t.memprofdata -; RUN: llvm-profdata merge %S/Inputs/memprof_loop_unroll.memprofraw --profiled-binary %S/Inputs/memprof_loop_unroll.exe -o %t.memprofdata ;; Set the minimum lifetime threshold to 0 to ensure that one context is ;; considered cold (the other will be notcold). -; RUN: opt < %s -passes='memprof-use' -S -memprof-report-hinted-sizes -memprof-ave-lifetime-cold-threshold=0 2>&1 | FileCheck %s +; RUN: opt < %t/memprof_loop_unroll.ll -passes='memprof-use' -S -memprof-report-hinted-sizes -memprof-ave-lifetime-cold-threshold=0 2>&1 | FileCheck %s -;; Conservatively annotate as not cold. We get two messages as there are two -;; unrolled copies of the allocation. -; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and indistinguishable alloc type notcold: 4 -; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and indistinguishable alloc type notcold: 4 +;; Conservatively annotate as not cold. 
+; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and single alloc type notcold: 4 ; CHECK: call {{.*}} @_Znam{{.*}} #[[ATTR:[0-9]+]] ; CHECK: attributes #[[ATTR]] = { builtin allocsize(0) "memprof"="notcold" } ; CHECK-NOT: stackIds: () +;--- memprof_loop_unroll.yaml +--- +HeapProfileRecords: + - GUID: 0x7f8d88fcc70a347b + AllocSites: + - Callstack: + - { Function: 0x7f8d88fcc70a347b, LineOffset: 2, Column: 16, IsInlineFrame: false } + - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 4 + TotalLifetime: 2 + TotalLifetimeAccessDensity: 12500000000 + - Callstack: + - { Function: 0x7f8d88fcc70a347b, LineOffset: 2, Column: 16, IsInlineFrame: false } + - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 4 + TotalLifetime: 2 + TotalLifetimeAccessDensity: 0 + - GUID: 0xdb956436e78dd5fa + CallSites: + - Frames: + - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false } +... + +;--- memprof_loop_unroll.ll ; ModuleID = 'memprof_loop_unroll_b.cc' source_filename = "memprof_loop_unroll_b.cc" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" From ccd5dc34d443012c03ef988d0efaebb41abe229d Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Fri, 31 Oct 2025 13:49:26 +0000 Subject: [PATCH 340/539] [CostModel][AArch64] Model cost of extract.last.active intrinsic (clastb) (#165739) Adds some aarch64 cost model tests for extract.last.active, and produces a lower cost when SVE is available (expecting clastb to be generated). 
--- .../AArch64/AArch64TargetTransformInfo.cpp | 7 + .../CostModel/AArch64/extract-last-active.ll | 216 ++++++++++++++++++ 2 files changed, 223 insertions(+) create mode 100644 llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index fede586cf35bc..47c1ac4b22224 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1032,6 +1032,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } break; } + case Intrinsic::experimental_vector_extract_last_active: + if (ST->isSVEorStreamingSVEAvailable()) { + auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]); + // This should turn into chained clastb instructions. + return LegalCost; + } + break; default: break; } diff --git a/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll b/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll new file mode 100644 index 0000000000000..9efcf912076b0 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll @@ -0,0 +1,216 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=NEON +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=SVE +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sme -force-streaming | FileCheck %s --check-prefix=SME-STREAMING + +define void @extractions() { +; NEON-LABEL: 'extractions' +; NEON-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison) +; NEON-NEXT: Cost Model: Found 
an estimated cost of 40 for instruction: %v8i16 = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8( poison, poison, i8 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16( poison, poison, i16 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32( poison, poison, i32 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64( poison, poison, i64 poison) +; NEON-NEXT: Cost Model: Invalid cost 
for instruction: %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16( poison, poison, half poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16( poison, poison, bfloat poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32( poison, poison, float poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64( poison, poison, double poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison) +; NEON-NEXT: Cost 
Model: Found an estimated cost of 16 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8( poison, poison, i8 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16( poison, poison, i16 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32( poison, poison, i32 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64( poison, poison, i64 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16( poison, poison, half poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16( poison, poison, bfloat poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32( poison, poison, float poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64( poison, poison, double poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x 
i32> poison, <2 x i1> poison, i32 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v4f16 = call half @llvm.experimental.vector.extract.last.active.v4f16(<4 x half> poison, <4 x i1> poison, half poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8( poison, poison, i8 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16( poison, poison, i16 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32( poison, poison, i32 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64( poison, poison, i64 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16( poison, poison, half poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16( poison, poison, bfloat poison) +; NEON-NEXT: Cost 
Model: Invalid cost for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32( poison, poison, float poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64( poison, poison, double poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SVE-LABEL: 'extractions' +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8( poison, poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16( poison, poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32( poison, poison, i32 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64( poison, poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16( poison, poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16( poison, poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32( poison, poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64( poison, poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8( poison, poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16( poison, poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32( poison, poison, i32 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64( poison, poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16( poison, poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16( poison, poison, bfloat poison) +; SVE-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32( poison, poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64( poison, poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = call half @llvm.experimental.vector.extract.last.active.v4f16(<4 x half> poison, <4 x i1> poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call i8 
@llvm.experimental.vector.extract.last.active.nxv8i8( poison, poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16( poison, poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32( poison, poison, i32 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64( poison, poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16( poison, poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16( poison, poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32( poison, poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64( poison, poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SME-STREAMING-LABEL: 'extractions' +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> 
poison, <4 x i1> poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8( poison, poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16( poison, poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32( poison, poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64( poison, poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16( poison, poison, half poison) +; SME-STREAMING-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16( poison, poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32( poison, poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64( poison, poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison) +; SME-STREAMING-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8( poison, poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16( poison, poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32( poison, poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64( poison, poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16( poison, poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16( poison, poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32( poison, poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64( poison, poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = call i16 
@llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = call half @llvm.experimental.vector.extract.last.active.v4f16(<4 x half> poison, <4 x i1> poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8( poison, poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16( poison, poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32( poison, poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = call i64 
@llvm.experimental.vector.extract.last.active.nxv1i64( poison, poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16( poison, poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16( poison, poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32( poison, poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64( poison, poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + + ;; Legal types + %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison) + %v8i16 = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison) + %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison) + %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison) + %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison) + %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison) + %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison) + %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison) + %nxv16i8 = call i8 
@llvm.experimental.vector.extract.last.active.nxv16i8( poison, poison, i8 poison) + %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16( poison, poison, i16 poison) + %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32( poison, poison, i32 poison) + %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64( poison, poison, i64 poison) + %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16( poison, poison, half poison) + %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16( poison, poison, bfloat poison) + %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32( poison, poison, float poison) + %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64( poison, poison, double poison) + + ;; Wider-than-legal + %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison) + %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison) + %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison) + %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison) + %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison) + %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison) + %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison) + %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison) + %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8( poison, poison, i8 poison) + %nxv16i16 = call i16 
@llvm.experimental.vector.extract.last.active.nxv16i16( poison, poison, i16 poison) + %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32( poison, poison, i32 poison) + %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64( poison, poison, i64 poison) + %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16( poison, poison, half poison) + %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16( poison, poison, bfloat poison) + %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32( poison, poison, float poison) + %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64( poison, poison, double poison) + + ;; Narrower-than-legal + %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison) + %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison) + %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison) + %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison) + %v4f16 = call half @llvm.experimental.vector.extract.last.active.v4f16(<4 x half> poison, <4 x i1> poison, half poison) + %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison) + %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison) + %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison) + %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8( poison, poison, i8 poison) + %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16( poison, poison, i16 poison) + %nxv2i32 = call i32 
@llvm.experimental.vector.extract.last.active.nxv2i32( poison, poison, i32 poison) + %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64( poison, poison, i64 poison) + %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16( poison, poison, half poison) + %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16( poison, poison, bfloat poison) + %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32( poison, poison, float poison) + %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64( poison, poison, double poison) + + ret void +} From 83b9c31ed8f176a9f06546a3e08d2b0ee9f7e731 Mon Sep 17 00:00:00 2001 From: Daniel Chen Date: Fri, 31 Oct 2025 09:49:50 -0400 Subject: [PATCH 341/539] To fix polymorphic pointer assignment in FORALL when LHS is unlimited polymorphic and RHS is intrinsic type target (#164999) Fixes #143569. --- flang/lib/Lower/Bridge.cpp | 4 +++ flang/test/Lower/forall-polymorphic.f90 | 41 +++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 6e729874eb5e6..0f4b39a07c5da 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -4876,6 +4876,10 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::Value shape = builder->genShape(loc, lbounds, extents); rhsBox = fir::ReboxOp::create(*builder, loc, lhsBoxType, rhsBox, shape, /*slice=*/mlir::Value{}); + } else if (fir::isClassStarType(lhsBoxType) && + !fir::ConvertOp::canBeConverted(rhsBoxType, lhsBoxType)) { + rhsBox = fir::ReboxOp::create(*builder, loc, lhsBoxType, rhsBox, + mlir::Value{}, mlir::Value{}); } return rhsBox; } diff --git a/flang/test/Lower/forall-polymorphic.f90 b/flang/test/Lower/forall-polymorphic.f90 index 2b7a51f9b549a..656b6ecf00628 100644 --- a/flang/test/Lower/forall-polymorphic.f90 +++ b/flang/test/Lower/forall-polymorphic.f90 @@ -1,6 +1,7 @@ ! 
Test lower of FORALL polymorphic pointer assignment ! RUN: bbc -emit-fir %s -o - | FileCheck %s + !! Test when LHS is polymorphic and RHS is not polymorphic ! CHECK-LABEL: c.func @_QPforallpolymorphic subroutine forallPolymorphic() @@ -46,6 +47,7 @@ subroutine forallPolymorphic() end subroutine forallPolymorphic + !! Test when LHS is not polymorphic but RHS is polymorphic ! CHECK-LABEL: c.func @_QPforallpolymorphic2( ! CHECK-SAME: %arg0: !fir.ref>>>}>>>>> {fir.bindc_name = "tar1", fir.target}) { @@ -87,3 +89,42 @@ subroutine forallPolymorphic2(Tar1) end subroutine forallPolymorphic2 + +!! Test when LHS is unlimited polymorphic and RHS non-polymorphic intrinsic +!! type target. +! CHECK-LABEL: c.func @_QPforallpolymorphic3 +subroutine forallPolymorphic3() + TYPE :: DT + CLASS(*), POINTER :: Ptr => NULL() + END TYPE + + TYPE(DT) :: D1(10) + CHARACTER*1, TARGET :: TAR1(10) + INTEGER :: I + + FORALL (I=1:10) + D1(I)%Ptr => Tar1(I) + END FORALL + +! CHECK: %[[V_7:[0-9]+]] = fir.alloca !fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class>}>> {bindc_name = "d1", uniq_name = "_QFforallpolymorphic3Ed1"} +! CHECK: %[[V_8:[0-9]+]] = fir.shape %c10 : (index) -> !fir.shape<1> +! CHECK: %[[V_9:[0-9]+]] = fir.declare %[[V_7]](%[[V_8]]) {uniq_name = "_QFforallpolymorphic3Ed1"} : (!fir.ref>}>>>, !fir.shape<1>) -> !fir.ref>}>>> +! CHECK: %[[V_16:[0-9]+]] = fir.alloca !fir.array<10x!fir.char<1>> {bindc_name = "tar1", fir.target, uniq_name = "_QFforallpolymorphic3Etar1"} +! CHECK: %[[V_17:[0-9]+]] = fir.declare %[[V_16]](%[[V_8]]) typeparams %c1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFforallpolymorphic3Etar1"} : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref>> +! CHECK: %[[V_24:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index +! CHECK: %[[V_25:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index +! CHECK: fir.do_loop %arg0 = %[[V_24]] to %[[V_25]] step %c1 +! CHECK: { +! CHECK: %[[V_26:[0-9]+]] = fir.convert %arg0 : (index) -> i32 +! 
CHECK: %[[V_27:[0-9]+]] = fir.convert %[[V_26]] : (i32) -> i64 +! CHECK: %[[V_28:[0-9]+]] = fir.array_coor %[[V_9]](%[[V_8]]) %[[V_27]] : (!fir.ref>}>>>, !fir.shape<1>, i64) -> !fir.ref>}>> +! CHECK: %[[V_29:[0-9]+]] = fir.field_index ptr, !fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class>}> +! CHECK: %[[V_30:[0-9]+]] = fir.coordinate_of %[[V_28]], ptr : (!fir.ref>}>>) -> !fir.ref>> +! CHECK: %[[V_31:[0-9]+]] = fir.convert %[[V_26]] : (i32) -> i64 +! CHECK: %[[V_32:[0-9]+]] = fir.array_coor %[[V_17]](%[[V_8]]) %31 : (!fir.ref>>, !fir.shape<1>, i64) -> !fir.ref> +! CHECK: %[[V_33:[0-9]+]] = fir.embox %[[V_32]] : (!fir.ref>) -> !fir.box>> +! CHECK: %[[V_34:[0-9]+]] = fir.rebox %[[V_33]] : (!fir.box>>) -> !fir.class> +! CHECK: fir.store %[[V_34]] to %[[V_30]] : !fir.ref>> +! CHECK: } + +end subroutine forallPolymorphic3 From cf7fc903ae0f54ba6398b2e92d4a53e6fec42f6d Mon Sep 17 00:00:00 2001 From: Vladimir Vuksanovic <109677816+vvuksanovic@users.noreply.github.com> Date: Fri, 31 Oct 2025 14:52:01 +0100 Subject: [PATCH 342/539] [Sema] Fix parameter index checks on explicit object member functions (#165586) With the C++23 explicit object parameter feature, it is no longer sufficient to only check if a function is an instance method to determine if it has an implicit this argument. That causes problems in attributes that have parameter indexes. 
--- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/AST/Attr.h | 5 ++-- clang/include/clang/Sema/Attr.h | 6 +++++ clang/include/clang/Sema/Sema.h | 6 ++--- clang/lib/Sema/SemaChecking.cpp | 11 ++++----- clang/lib/Sema/SemaDeclAttr.cpp | 6 ++--- clang/test/CodeGenCXX/attr-callback.cpp | 23 ++++++++++++++++++- clang/test/SemaCXX/attr-callback-broken.cpp | 7 +++++- clang/test/SemaCXX/attr-callback.cpp | 7 +++++- clang/test/SemaCXX/attr-format.cpp | 13 ++++++++++- .../test/SemaCXX/attr-lifetime-capture-by.cpp | 3 +++ clang/test/SemaCXX/attr-nonnull.cpp | 11 +++++++-- 12 files changed, 79 insertions(+), 20 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 73aaaad8b32e5..92fc9381a5868 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -451,6 +451,7 @@ Bug Fixes to Attribute Support ``[[gnu::error("some error")]]`` now correctly triggers an error. (#GH146520) - Fix a crash when the function name is empty in the `swift_name` attribute. (#GH157075) - Fixes crashes or missing diagnostics with the `device_kernel` attribute. (#GH161905) +- Fix handling of parameter indexes when an attribute is applied to a C++23 explicit object member function. 
Bug Fixes to C++ Support ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h index ce273c167aa22..14d7caa0e16d7 100644 --- a/clang/include/clang/AST/Attr.h +++ b/clang/include/clang/AST/Attr.h @@ -16,6 +16,7 @@ #include "clang/AST/ASTFwd.h" #include "clang/AST/AttrIterator.h" #include "clang/AST/Decl.h" +#include "clang/AST/DeclCXX.h" #include "clang/AST/Type.h" #include "clang/Basic/AttrKinds.h" #include "clang/Basic/AttributeCommonInfo.h" @@ -327,8 +328,8 @@ class ParamIdx { ParamIdx(unsigned Idx, const Decl *D) : Idx(Idx), HasThis(false), IsValid(true) { assert(Idx >= 1 && "Idx must be one-origin"); - if (const auto *FD = dyn_cast(D)) - HasThis = FD->isCXXInstanceMember(); + if (const auto *MethodDecl = dyn_cast(D)) + HasThis = MethodDecl->isImplicitObjectMemberFunction(); } /// A type into which \c ParamIdx can be serialized. diff --git a/clang/include/clang/Sema/Attr.h b/clang/include/clang/Sema/Attr.h index 3f0b10212789a..5836231818eec 100644 --- a/clang/include/clang/Sema/Attr.h +++ b/clang/include/clang/Sema/Attr.h @@ -123,6 +123,12 @@ inline bool isInstanceMethod(const Decl *D) { return false; } +inline bool hasImplicitObjectParameter(const Decl *D) { + if (const auto *MethodDecl = dyn_cast(D)) + return MethodDecl->isImplicitObjectMemberFunction(); + return false; +} + /// Diagnose mutually exclusive attributes when present on a given /// declaration. Returns true if diagnosed. template diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 52904c72d1cfc..c67ed99b1f49e 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -2608,13 +2608,13 @@ class Sema final : public SemaBase { }; /// Given a function and its FormatAttr or FormatMatchesAttr info, attempts to - /// populate the FomatStringInfo parameter with the attribute's correct + /// populate the FormatStringInfo parameter with the attribute's correct /// format_idx and firstDataArg. 
Returns true when the format fits the /// function and the FormatStringInfo has been populated. static bool getFormatStringInfo(const Decl *Function, unsigned FormatIdx, unsigned FirstArg, FormatStringInfo *FSI); static bool getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg, - bool IsCXXMember, bool IsVariadic, + bool HasImplicitThisParam, bool IsVariadic, FormatStringInfo *FSI); // Used by C++ template instantiation. @@ -5119,7 +5119,7 @@ class Sema final : public SemaBase { // In C++ the implicit 'this' function parameter also counts. // Parameters are counted from one. bool HP = hasFunctionProto(D); - bool HasImplicitThisParam = isInstanceMethod(D); + bool HasImplicitThisParam = hasImplicitObjectParameter(D); bool IV = HP && isFunctionOrMethodVariadic(D); unsigned NumParams = (HP ? getFunctionOrMethodNumParams(D) : 0) + HasImplicitThisParam; diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index f4517877b04c8..ad2c2e4a97bb9 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3542,9 +3542,7 @@ bool Sema::ValueIsRunOfOnes(CallExpr *TheCall, unsigned ArgNum) { bool Sema::getFormatStringInfo(const Decl *D, unsigned FormatIdx, unsigned FirstArg, FormatStringInfo *FSI) { - bool IsCXXMember = false; - if (const auto *MD = dyn_cast(D)) - IsCXXMember = MD->isInstance(); + bool HasImplicitThisParam = hasImplicitObjectParameter(D); bool IsVariadic = false; if (const FunctionType *FnTy = D->getFunctionType()) IsVariadic = cast(FnTy)->isVariadic(); @@ -3553,11 +3551,12 @@ bool Sema::getFormatStringInfo(const Decl *D, unsigned FormatIdx, else if (const auto *OMD = dyn_cast(D)) IsVariadic = OMD->isVariadic(); - return getFormatStringInfo(FormatIdx, FirstArg, IsCXXMember, IsVariadic, FSI); + return getFormatStringInfo(FormatIdx, FirstArg, HasImplicitThisParam, + IsVariadic, FSI); } bool Sema::getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg, - bool IsCXXMember, bool IsVariadic, + bool 
HasImplicitThisParam, bool IsVariadic, FormatStringInfo *FSI) { if (FirstArg == 0) FSI->ArgPassingKind = FAPK_VAList; @@ -3571,7 +3570,7 @@ bool Sema::getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg, // The way the format attribute works in GCC, the implicit this argument // of member functions is counted. However, it doesn't appear in our own // lists, so decrement format_idx in that case. - if (IsCXXMember) { + if (HasImplicitThisParam) { if(FSI->FormatIdx == 0) return false; --FSI->FormatIdx; diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 964a2a791e18f..a9e7b44ac9d73 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3785,7 +3785,7 @@ static bool handleFormatAttrCommon(Sema &S, Decl *D, const ParsedAttr &AL, // In C++ the implicit 'this' function parameter also counts, and they are // counted from one. - bool HasImplicitThisParam = isInstanceMethod(D); + bool HasImplicitThisParam = hasImplicitObjectParameter(D); Info->NumArgs = getFunctionOrMethodNumParams(D) + HasImplicitThisParam; Info->Identifier = AL.getArgAsIdent(0)->getIdentifierInfo(); @@ -3926,7 +3926,7 @@ static void handleCallbackAttr(Sema &S, Decl *D, const ParsedAttr &AL) { return; } - bool HasImplicitThisParam = isInstanceMethod(D); + bool HasImplicitThisParam = hasImplicitObjectParameter(D); int32_t NumArgs = getFunctionOrMethodNumParams(D); FunctionDecl *FD = D->getAsFunction(); @@ -4110,7 +4110,7 @@ static void handleLifetimeCaptureByAttr(Sema &S, Decl *D, } void Sema::LazyProcessLifetimeCaptureByParams(FunctionDecl *FD) { - bool HasImplicitThisParam = isInstanceMethod(FD); + bool HasImplicitThisParam = hasImplicitObjectParameter(FD); SmallVector Attrs; for (ParmVarDecl *PVD : FD->parameters()) if (auto *A = PVD->getAttr()) diff --git a/clang/test/CodeGenCXX/attr-callback.cpp b/clang/test/CodeGenCXX/attr-callback.cpp index c3456d6c430ff..efa705b9d06dc 100644 --- a/clang/test/CodeGenCXX/attr-callback.cpp +++ 
b/clang/test/CodeGenCXX/attr-callback.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple i386-unknown-unknown %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -triple i386-unknown-unknown -std=c++23 %s -emit-llvm -o - | FileCheck %s struct Base { @@ -47,9 +47,30 @@ struct Derived_2 : public Base { // CHECK-NOT: !callback void Derived_2::virtual_1(void (*callback)(void)) {} +class ExplicitParameterObject { + __attribute__((callback(1, 0))) void implicit_this_idx(void (*callback)(ExplicitParameterObject*)); + __attribute__((callback(1, this))) void implicit_this_identifier(void (*callback)(ExplicitParameterObject*)); + __attribute__((callback(2, 1))) void explicit_this_idx(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*)); + __attribute__((callback(2, self))) void explicit_this_identifier(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*)); +}; + +// CHECK-DAG: define{{.*}} void @_ZN23ExplicitParameterObject17implicit_this_idxEPFvPS_E({{[^!]*!callback}} ![[cid3:[0-9]+]] +void ExplicitParameterObject::implicit_this_idx(void (*callback)(ExplicitParameterObject*)) {} + +// CHECK-DAG: define{{.*}} void @_ZN23ExplicitParameterObject24implicit_this_identifierEPFvPS_E({{[^!]*!callback}} ![[cid3]] +void ExplicitParameterObject::implicit_this_identifier(void (*callback)(ExplicitParameterObject*)) {} + +// CHECK-DAG: define{{.*}} void @_ZNH23ExplicitParameterObject17explicit_this_idxEPS_PFvS0_E({{[^!]*!callback}} ![[cid3]] +void ExplicitParameterObject::explicit_this_idx(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*)) {} + +// CHECK-DAG: define{{.*}} void @_ZNH23ExplicitParameterObject24explicit_this_identifierEPS_PFvS0_E({{[^!]*!callback}} ![[cid3]] +void ExplicitParameterObject::explicit_this_identifier(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*)) {} + // CHECK-DAG: ![[cid0]] = !{![[cid0b:[0-9]+]]} // CHECK-DAG: ![[cid0b]] = !{i64 1, i1 false} // 
CHECK-DAG: ![[cid1]] = !{![[cid1b:[0-9]+]]} // CHECK-DAG: ![[cid1b]] = !{i64 2, i1 false} // CHECK-DAG: ![[cid2]] = !{![[cid2b:[0-9]+]]} // CHECK-DAG: ![[cid2b]] = !{i64 1, i64 0, i64 -1, i64 0, i1 false} +// CHECK-DAG: ![[cid3]] = !{![[cid3b:[0-9]+]]} +// CHECK-DAG: ![[cid3b]] = !{i64 1, i64 0, i1 false} diff --git a/clang/test/SemaCXX/attr-callback-broken.cpp b/clang/test/SemaCXX/attr-callback-broken.cpp index a5469b22ba350..53b331a49251b 100644 --- a/clang/test/SemaCXX/attr-callback-broken.cpp +++ b/clang/test/SemaCXX/attr-callback-broken.cpp @@ -1,7 +1,12 @@ -// RUN: %clang_cc1 %s -verify -fsyntax-only +// RUN: %clang_cc1 %s -std=c++23 -verify -fsyntax-only class C_in_class { #define HAS_THIS #include "../Sema/attr-callback-broken.c" #undef HAS_THIS }; + +class ExplicitParameterObject { + __attribute__((callback(2, 0))) void explicit_this_idx(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*)); // expected-error {{'callback' argument at position 2 references unavailable implicit 'this'}} + __attribute__((callback(2, this))) void explicit_this_identifier(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*)); // expected-error {{'callback' argument at position 2 references unavailable implicit 'this'}} +}; diff --git a/clang/test/SemaCXX/attr-callback.cpp b/clang/test/SemaCXX/attr-callback.cpp index ee02f7d3d24f7..ff5a241e92f74 100644 --- a/clang/test/SemaCXX/attr-callback.cpp +++ b/clang/test/SemaCXX/attr-callback.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 %s -verify -fsyntax-only +// RUN: %clang_cc1 %s -std=c++23 -verify -fsyntax-only // expected-no-diagnostics @@ -6,6 +6,11 @@ class C_in_class { #include "../Sema/attr-callback.c" }; +class ExplicitParameterObject { + __attribute__((callback(2, 1))) void explicit_this_idx(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*)); + __attribute__((callback(2, self))) void explicit_this_identifier(this ExplicitParameterObject* self, void 
(*callback)(ExplicitParameterObject*)); +}; + struct Base { void no_args_1(void (*callback)(void)); diff --git a/clang/test/SemaCXX/attr-format.cpp b/clang/test/SemaCXX/attr-format.cpp index adc05fc46776c..c0aeb5d07dfe9 100644 --- a/clang/test/SemaCXX/attr-format.cpp +++ b/clang/test/SemaCXX/attr-format.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -Wformat-nonliteral -verify %s +// RUN: %clang_cc1 -fsyntax-only -std=c++23 -Wformat-nonliteral -verify %s #include int printf(const char *fmt, ...) __attribute__((format(printf, 1, 2))); @@ -11,6 +11,10 @@ struct S { // the format argument is argument 2 here. void g(const char*, ...) __attribute__((format(printf, 2, 3))); const char* g2(const char*) __attribute__((format_arg(2))); + // From C++23 'this' can also be specified explicitly. + void g3(this S&, const char *, ...) __attribute__((format(printf, 2, 3))); + void g4(this const char* s, ...) __attribute__((format(printf, 1, 2))); + consteval operator const char*() const { return "%f"; } // #g4_fmt_string void h(const char*, ...) __attribute__((format(printf, 1, 4))); // \ expected-error{{implicit this argument as the format string}} @@ -18,10 +22,17 @@ struct S { expected-error{{out of bounds}} const char* h3(const char*) __attribute__((format_arg(1))); // \ expected-error{{invalid for the implicit this argument}} + void h4(this S&, const char *, ...) __attribute__((format(printf, 1, 3))); // \ + expected-error {{format argument not a string type}} void operator() (const char*, ...) __attribute__((format(printf, 2, 3))); }; +void s() { + S().g4(4); // expected-warning {{format specifies type 'double' but the argument has type 'int'}} + // expected-note@#g4_fmt_string {{format string is defined here}} +} + // PR5521 struct A { void a(const char*,...) 
__attribute((format(printf,2,3))); }; void b(A x) { diff --git a/clang/test/SemaCXX/attr-lifetime-capture-by.cpp b/clang/test/SemaCXX/attr-lifetime-capture-by.cpp index 70a5fe5a45376..8606592c6b771 100644 --- a/clang/test/SemaCXX/attr-lifetime-capture-by.cpp +++ b/clang/test/SemaCXX/attr-lifetime-capture-by.cpp @@ -44,4 +44,7 @@ struct T { { s.captureInt(x); } + + void explicit_this1(this T& self, const int &x [[clang::lifetime_capture_by(self)]]); + void explicit_this2(this T& self, const int &x [[clang::lifetime_capture_by(this)]]); // expected-error {{argument references unavailable implicit 'this'}} }; diff --git a/clang/test/SemaCXX/attr-nonnull.cpp b/clang/test/SemaCXX/attr-nonnull.cpp index 6f9119b519d09..0fba6b50cb354 100644 --- a/clang/test/SemaCXX/attr-nonnull.cpp +++ b/clang/test/SemaCXX/attr-nonnull.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s -// RUN: %clang_cc1 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s +// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter struct S { S(const char *) __attribute__((nonnull(2))); @@ -11,6 +11,13 @@ struct S { void h(const char*) __attribute__((nonnull(1))); // \ expected-error{{invalid for the implicit this argument}} + + void i(this S* self, const char*) __attribute__((nonnull(1))); + + void j(this S* self, const char*) __attribute__((nonnull(2))); + + void k(this S* self, const char*) __attribute__((nonnull(3))); // \ + expected-error{{'nonnull' attribute parameter 1 is out of bounds}} }; void test() { From 090872dbad230da883ba3f76d01e8b8d1261c2d8 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 31 Oct 2025 14:00:07 +0000 Subject: [PATCH 343/539] [lldb][docs] Add troubleshooting section to scripting introduction Logs just helped someone on Discord debug an issue in a way that would not have been possible just by stepping the Python script. 
It was that LLDB could not find the lldb-server binary. We do talk about logs elsewhere but I think it's fine to repeat here since it's a lot of people's first experience with scripting. --- .../implementing-standalone-scripts.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/lldb/docs/use/tutorials/implementing-standalone-scripts.md b/lldb/docs/use/tutorials/implementing-standalone-scripts.md index 285d2d3dea9ea..b1a3441ffe2ee 100644 --- a/lldb/docs/use/tutorials/implementing-standalone-scripts.md +++ b/lldb/docs/use/tutorials/implementing-standalone-scripts.md @@ -147,3 +147,20 @@ SBFunction: id = 0x0000002e, name = main, type = main a.out[0x714]: mov w0, #0x0 ; =0 a.out[0x718]: ret ``` + +### Troubleshooting + +You can use all the usual Python tools to debug scripts, and on top of that +you can enable LLDB's log channels. To do this in the script shown above, add +this line right after `debugger` has been assigned: + +```python +debugger.EnableLog("lldb", ["all"]) +``` + +`lldb` `all` enables a lot of different channels, so you will probably want +to enable only a few channels once you know what you are interested in. + +This API call is the equivalent of `log enable lldb all` when using LLDB +interactively. All channels available to `log enable` can be enabled using +`EnableLog` too. \ No newline at end of file From 3d5dcdd714de0bf44f564433692f579ec45a6cc6 Mon Sep 17 00:00:00 2001 From: Steven Perron Date: Fri, 31 Oct 2025 10:12:26 -0400 Subject: [PATCH 344/539] [SPIRV] Fix vector bitcast check in LegalizePointerCast (#164997) The previous check for vector bitcasts in `loadVectorFromVector` only compared the number of elements, which is insufficient when the element types differ. This can lead to incorrect assumptions about the validity of the cast. This commit replaces the element count check with a comparison of the total size of the vectors in bits. 
This ensures that the bitcast is only performed between vectors of the same size, preventing potential miscompilations. Part of https://github.com/llvm/llvm-project/issues/153091 --- .../Target/SPIRV/SPIRVLegalizePointerCast.cpp | 9 +++++++- .../hlsl-resources/issue-146942-ptr-cast.ll | 4 +--- .../CodeGen/SPIRV/pointers/ptrcast-bitcast.ll | 22 +++++++++++++++++++ 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index 6e444c98de8da..65dffc7908b78 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -73,16 +73,23 @@ class SPIRVLegalizePointerCast : public FunctionPass { // Returns the loaded value. Value *loadVectorFromVector(IRBuilder<> &B, FixedVectorType *SourceType, FixedVectorType *TargetType, Value *Source) { - assert(TargetType->getNumElements() <= SourceType->getNumElements()); LoadInst *NewLoad = B.CreateLoad(SourceType, Source); buildAssignType(B, SourceType, NewLoad); Value *AssignValue = NewLoad; if (TargetType->getElementType() != SourceType->getElementType()) { + const DataLayout &DL = B.GetInsertBlock()->getModule()->getDataLayout(); + [[maybe_unused]] TypeSize TargetTypeSize = + DL.getTypeSizeInBits(TargetType); + [[maybe_unused]] TypeSize SourceTypeSize = + DL.getTypeSizeInBits(SourceType); + assert(TargetTypeSize == SourceTypeSize); AssignValue = B.CreateIntrinsic(Intrinsic::spv_bitcast, {TargetType, SourceType}, {NewLoad}); buildAssignType(B, TargetType, AssignValue); + return AssignValue; } + assert(TargetType->getNumElements() < SourceType->getNumElements()); SmallVector Mask(/* Size= */ TargetType->getNumElements()); for (unsigned I = 0; I < TargetType->getNumElements(); ++I) Mask[I] = I; diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll index 
ed67344842b11..4817e7450ac2e 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll @@ -16,7 +16,6 @@ define void @case1() local_unnamed_addr { ; CHECK: %[[#BUFFER_LOAD:]] = OpLoad %[[#FLOAT4]] %{{[0-9]+}} Aligned 16 ; CHECK: %[[#CAST_LOAD:]] = OpBitcast %[[#INT4]] %[[#BUFFER_LOAD]] - ; CHECK: %[[#VEC_SHUFFLE:]] = OpVectorShuffle %[[#INT4]] %[[#CAST_LOAD]] %[[#CAST_LOAD]] 0 1 2 3 %1 = tail call target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4f32_12_0t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str) %2 = tail call target("spirv.VulkanBuffer", [0 x <4 x i32>], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4i32_12_1t(i32 0, i32 5, i32 1, i32 0, ptr nonnull @.str.2) %3 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v4f32_12_0t(target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) %1, i32 0) @@ -29,8 +28,7 @@ define void @case1() local_unnamed_addr { define void @case2() local_unnamed_addr { ; CHECK: %[[#BUFFER_LOAD:]] = OpLoad %[[#FLOAT4]] %{{[0-9]+}} Aligned 16 ; CHECK: %[[#CAST_LOAD:]] = OpBitcast %[[#INT4]] %[[#BUFFER_LOAD]] - ; CHECK: %[[#VEC_SHUFFLE:]] = OpVectorShuffle %[[#INT4]] %[[#CAST_LOAD]] %[[#CAST_LOAD]] 0 1 2 3 - ; CHECK: %[[#VEC_TRUNCATE:]] = OpVectorShuffle %[[#INT3]] %[[#VEC_SHUFFLE]] %[[#UNDEF_INT4]] 0 1 2 + ; CHECK: %[[#VEC_TRUNCATE:]] = OpVectorShuffle %[[#INT3]] %[[#CAST_LOAD]] %[[#UNDEF_INT4]] 0 1 2 %1 = tail call target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4f32_12_0t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str) %2 = tail call target("spirv.VulkanBuffer", [0 x <3 x i32>], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v3i32_12_1t(i32 0, i32 5, i32 1, i32 0, ptr nonnull @.str.3) %3 = tail call noundef 
align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v4f32_12_0t(target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) %1, i32 0) diff --git a/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll b/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll index 84913283f6868..a1ec2cd1cfdd2 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll @@ -26,3 +26,25 @@ entry: store <4 x i32> %6, ptr addrspace(11) %7, align 16 ret void } + +; This tests a load from a pointer that has been bitcast between vector types +; which share the same total bit-width but have different numbers of elements. +; Tests that legalize-pointer-casts works correctly by moving the bitcast to +; the element that was loaded. + +define void @main2() local_unnamed_addr #0 { +entry: +; CHECK: %[[LOAD:[0-9]+]] = OpLoad %[[#v2_double]] {{.*}} +; CHECK: %[[BITCAST1:[0-9]+]] = OpBitcast %[[#v4_uint]] %[[LOAD]] +; CHECK: %[[BITCAST2:[0-9]+]] = OpBitcast %[[#v2_double]] %[[BITCAST1]] +; CHECK: OpStore {{%[0-9]+}} %[[BITCAST2]] {{.*}} + + %0 = tail call target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v2f64_12_1t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str.2) + %2 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v2f64_12_1t(target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) %0, i32 0) + %3 = load <4 x i32>, ptr addrspace(11) %2 + %4 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v2f64_12_1t(target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) %0, i32 1) + store <4 x i32> %3, ptr addrspace(11) %4 + ret void +} + +attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } From 8abd5c0211d4b83f53a44b61f02d5b9b259207db Mon Sep 17 00:00:00 2001 From: google-yfyang 
Date: Fri, 31 Oct 2025 10:22:33 -0400 Subject: [PATCH 345/539] [lldb] Fix a lldb failure following #165707 (#165864) Fix the breakage in: https://buildkite.com/llvm-project/upstream-bazel/builds/155695/steps/canvas?sid=019a39c6-6ccd-4845-bd5e-d36e4f0369e2 --- lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp b/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp index f9c249d7fec1c..e41a28bd21c36 100644 --- a/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp +++ b/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp @@ -480,11 +480,10 @@ ABISysV_arc::GetReturnValueObjectSimple(Thread &thread, } // Floating point return type. else if (type_flags & eTypeIsFloat) { - uint32_t float_count = 0; bool is_complex = false; - if (compiler_type.IsFloatingPointType(float_count, is_complex) && - 1 == float_count && !is_complex) { + if (compiler_type.IsFloatingPointType(is_complex) && + !compiler_type.IsVectorType() && !is_complex) { const size_t byte_size = llvm::expectedToOptional(compiler_type.GetByteSize(&thread)) .value_or(0); From 779e9468978c64e53380c6ffce1d6dad8877e33a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 31 Oct 2025 14:22:58 +0000 Subject: [PATCH 346/539] [X86] detectPMADDUBSW - use SDPatternMatch to handle some of the commutative matching for the zext/sext pairs. NFC. 
(#165861) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 40 ++++++------------------- 1 file changed, 9 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index fd01363bed709..007074c3ffc82 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -54459,6 +54459,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { + using namespace SDPatternMatch; if (!VT.isVector() || !Subtarget.hasSSSE3()) return SDValue(); @@ -54468,42 +54469,19 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, return SDValue(); SDValue SSatVal = detectSSatPattern(In, VT); - if (!SSatVal || SSatVal.getOpcode() != ISD::ADD) - return SDValue(); - - // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs - // of multiplies from even/odd elements. - SDValue N0 = SSatVal.getOperand(0); - SDValue N1 = SSatVal.getOperand(1); - - if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL) + if (!SSatVal) return SDValue(); - SDValue N00 = N0.getOperand(0); - SDValue N01 = N0.getOperand(1); - SDValue N10 = N1.getOperand(0); - SDValue N11 = N1.getOperand(1); - + // See if this is a signed saturation of an ADD, adding pairs of multiplies + // from even/odd elements, from zero_extend/sign_extend operands. + // // TODO: Handle constant vectors and use knownbits/computenumsignbits? - // Canonicalize zero_extend to LHS. - if (N01.getOpcode() == ISD::ZERO_EXTEND) - std::swap(N00, N01); - if (N11.getOpcode() == ISD::ZERO_EXTEND) - std::swap(N10, N11); - - // Ensure we have a zero_extend and a sign_extend. 
- if (N00.getOpcode() != ISD::ZERO_EXTEND || - N01.getOpcode() != ISD::SIGN_EXTEND || - N10.getOpcode() != ISD::ZERO_EXTEND || - N11.getOpcode() != ISD::SIGN_EXTEND) + SDValue N00, N01, N10, N11; + if (!sd_match(SSatVal, + m_Add(m_Mul(m_ZExt(m_Value(N00)), m_SExt(m_Value(N01))), + m_Mul(m_ZExt(m_Value(N10)), m_SExt(m_Value(N11)))))) return SDValue(); - // Peek through the extends. - N00 = N00.getOperand(0); - N01 = N01.getOperand(0); - N10 = N10.getOperand(0); - N11 = N11.getOperand(0); - // Ensure the extend is from vXi8. if (N00.getValueType().getVectorElementType() != MVT::i8 || N01.getValueType().getVectorElementType() != MVT::i8 || From df4ccfeaac40af7dd713aabae767cc482b04da1a Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 31 Oct 2025 07:42:16 -0700 Subject: [PATCH 347/539] [RISCV] Mask integer and float loads as canFoldAsLoad for stackmaps (#165761) We have two mechanisms used in inline spilled for folding a load into a consuming instruction. One is used for stack reloads, the other for other load instructions (usually argument loads). We currently only implement optimizations for the first case, but stackmaps have generic support in target independent code for the other. We can go ahead and set the flag to enable that optimization. The primary motivation for this is that if we enable load rematerialization without it, we run into crashes where we can't make progress through rematerialization. We probably should enable the other foldMemoryOperand hook for RISCV specific instructions, but that's a separate optimization. 
--- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 4 +++- llvm/lib/Target/RISCV/RISCVInstrInfoD.td | 1 + llvm/lib/Target/RISCV/RISCVInstrInfoF.td | 1 + llvm/test/CodeGen/RISCV/rv64-stackmap.ll | 8 ++++---- 5 files changed, 10 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 3a7013d9efae6..c9df787e0012d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -869,7 +869,7 @@ std::optional getFoldedOpcode(MachineFunction &MF, MachineInstr &MI, } } -// This is the version used during inline spilling +// This is the version used during InlineSpiller::spillAroundUses MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 7c89686ebfb3c..9cb53fb27a2d2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -768,7 +768,7 @@ def BGE : BranchCC_rri<0b101, "bge">; def BLTU : BranchCC_rri<0b110, "bltu">; def BGEU : BranchCC_rri<0b111, "bgeu">; -let IsSignExtendingOpW = 1 in { +let IsSignExtendingOpW = 1, canFoldAsLoad = 1 in { def LB : Load_ri<0b000, "lb">, Sched<[WriteLDB, ReadMemBase]>; def LH : Load_ri<0b001, "lh">, Sched<[WriteLDH, ReadMemBase]>; def LW : Load_ri<0b010, "lw">, Sched<[WriteLDW, ReadMemBase]>; @@ -889,8 +889,10 @@ def CSRRCI : CSR_ii<0b111, "csrrci">; /// RV64I instructions let Predicates = [IsRV64] in { +let canFoldAsLoad = 1 in { def LWU : Load_ri<0b110, "lwu">, Sched<[WriteLDW, ReadMemBase]>; def LD : Load_ri<0b011, "ld">, Sched<[WriteLDD, ReadMemBase]>; +} def SD : Store_rri<0b011, "sd">, Sched<[WriteSTD, ReadStoreData, ReadMemBase]>; let IsSignExtendingOpW = 1 in { diff --git 
a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index afac37d6337d4..4ffe3e62ac501 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -71,6 +71,7 @@ defvar DExtsRV64 = [DExt, ZdinxExt]; //===----------------------------------------------------------------------===// let Predicates = [HasStdExtD] in { +let canFoldAsLoad = 1 in def FLD : FPLoad_r<0b011, "fld", FPR64, WriteFLD64>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index 6571d998246a7..b30f8ec820c15 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -330,6 +330,7 @@ class PseudoFROUND //===----------------------------------------------------------------------===// let Predicates = [HasStdExtF] in { +let canFoldAsLoad = 1 in def FLW : FPLoad_r<0b010, "flw", FPR32, WriteFLD32>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll index c3183a1a3e036..9aefa90684dd3 100644 --- a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll +++ b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll @@ -38,8 +38,8 @@ ; CHECK-NEXT: .quad liveConstant ; CHECK-NEXT: .quad 0 ; CHECK-NEXT: .quad 1 -; CHECK-NEXT: .quad spilledValue -; CHECK-NEXT: .quad 144 +; CHECK-NEXT: .quad liveArgs +; CHECK-NEXT: .quad 0 ; CHECK-NEXT: .quad 1 ; CHECK-NEXT: .quad directFrameIdx ; CHECK-NEXT: .quad 48 @@ -278,7 +278,7 @@ define void @liveConstant() { ; ; Verify 28 stack map entries. 
; -; CHECK-LABEL: .word .L{{.*}}-spilledValue +; CHECK-LABEL: .word .L{{.*}}-liveArgs ; CHECK-NEXT: .half 0 ; CHECK-NEXT: .half 28 ; @@ -290,7 +290,7 @@ define void @liveConstant() { ; CHECK-NEXT: .half 2 ; CHECK-NEXT: .half 0 ; CHECK-NEXT: .word -define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 zeroext %l26, i32 signext %l27) { +define void @liveArgs(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 zeroext %l26, i32 signext %l27) { entry: call void (i64, i32, ptr, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 %l26, i32 %l27) ret void From b46f767e9fe9842a1e085efb9194d442ca983162 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 31 Oct 2025 10:44:27 -0400 Subject: [PATCH 348/539] [LoopUnroll] Fix block frequencies when no runtime (#157754) This patch implements the LoopUnroll changes discussed in [[RFC] Fix Loop Transformations to Preserve Block Frequencies](https://discourse.llvm.org/t/rfc-fix-loop-transformations-to-preserve-block-frequencies/85785) and is thus another step in addressing issue #135812. 
In summary, for the case of partial loop unrolling without a remainder loop, this patch changes LoopUnroll to: - Maintain branch weights consistently with the original loop for the sake of preserving the total frequency of the original loop body. - Store the new estimated trip count in the `llvm.loop.estimated_trip_count` metadata, introduced by PR #148758. - Correct the new estimated trip count (e.g., 3 instead of 2) when the original estimated trip count (e.g., 10) divided by the unroll count (e.g., 4) leaves a remainder (e.g., 2). There are loop unrolling cases this patch does not fully fix, such as partial unrolling with a remainder loop and complete unrolling, and there are two associated tests whose branch weights this patch adversely affects. They will be addressed in future patches that should land with this patch. --- llvm/lib/Transforms/Utils/LoopUnroll.cpp | 39 +++++++++-- .../peel.ll} | 0 .../branch-weights-freq/unroll-partial.ll | 68 +++++++++++++++++++ .../LoopUnroll/runtime-loop-branchweight.ll | 5 +- .../LoopUnroll/unroll-heuristics-pgo.ll | 4 +- 5 files changed, 108 insertions(+), 8 deletions(-) rename llvm/test/Transforms/LoopUnroll/{peel-branch-weights-freq.ll => branch-weights-freq/peel.ll} (100%) create mode 100644 llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 4fe736ac29b0a..23686448ab5ae 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -499,9 +499,8 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, const unsigned MaxTripCount = SE->getSmallConstantMaxTripCount(L); const bool MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L); - unsigned EstimatedLoopInvocationWeight = 0; std::optional OriginalTripCount = - llvm::getLoopEstimatedTripCount(L, &EstimatedLoopInvocationWeight); + llvm::getLoopEstimatedTripCount(L); // Effectively "DCE" unrolled iterations 
that are beyond the max tripcount // and will never be executed. @@ -1131,10 +1130,38 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // We shouldn't try to use `L` anymore. L = nullptr; } else if (OriginalTripCount) { - // Update the trip count. Note that the remainder has already logic - // computing it in `UnrollRuntimeLoopRemainder`. - setLoopEstimatedTripCount(L, *OriginalTripCount / ULO.Count, - EstimatedLoopInvocationWeight); + // Update metadata for the loop's branch weights and estimated trip count: + // - If ULO.Runtime, UnrollRuntimeLoopRemainder sets the guard branch + // weights, latch branch weights, and estimated trip count of the + // remainder loop it creates. It also sets the branch weights for the + // unrolled loop guard it creates. The branch weights for the unrolled + // loop latch are adjusted below. FIXME: Actually handle ULO.Runtime. + // - Otherwise, if unrolled loop iteration latches become unconditional, + // branch weights are adjusted above. FIXME: Actually handle such + // unconditional latches. + // - Otherwise, the original loop's branch weights are correct for the + // unrolled loop, so do not adjust them. + // - In all cases, the unrolled loop's estimated trip count is set below. + // + // As an example of the last case, consider what happens if the unroll count + // is 4 for a loop with an estimated trip count of 10 when we do not create + // a remainder loop and all iterations' latches remain conditional. Each + // unrolled iteration's latch still has the same probability of exiting the + // loop as it did when in the original loop, and thus it should still have + // the same branch weights. Each unrolled iteration's non-zero probability + // of exiting already appropriately reduces the probability of reaching the + // remaining iterations just as it did in the original loop. 
Trying to also + // adjust the branch weights of the final unrolled iteration's latch (i.e., + // the backedge for the unrolled loop as a whole) to reflect its new trip + // count of 3 will erroneously further reduce its block frequencies. + // However, in case an analysis later needs to estimate the trip count of + // the unrolled loop as a whole without considering the branch weights for + // each unrolled iteration's latch within it, we store the new trip count as + // separate metadata. + unsigned NewTripCount = *OriginalTripCount / ULO.Count; + if (!ULO.Runtime && *OriginalTripCount % ULO.Count) + NewTripCount += 1; + setLoopEstimatedTripCount(L, NewTripCount); } // LoopInfo should not be valid, confirm that. diff --git a/llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/peel.ll similarity index 100% rename from llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll rename to llvm/test/Transforms/LoopUnroll/branch-weights-freq/peel.ll diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll new file mode 100644 index 0000000000000..cde9d46ee8421 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll @@ -0,0 +1,68 @@ +; Test branch weight metadata, estimated trip count metadata, and block +; frequencies after partial loop unrolling without -unroll-runtime. + +; RUN: opt < %s -S -passes='print' 2>&1 | \ +; RUN: FileCheck -check-prefix=CHECK %s + +; The -implicit-check-not options make sure that no additional labels or calls +; to @f show up. 
+; RUN: opt < %s -S -passes='loop-unroll,print' \ +; RUN: -unroll-count=4 2>&1 | \ +; RUN: FileCheck %s -check-prefix=CHECK-UR \ +; RUN: -implicit-check-not='{{^( *- )?[^ ;]*:}}' \ +; RUN: -implicit-check-not='call void @f' + +; CHECK: block-frequency-info: test +; CHECK: do.body: float = 10.0, + +; The sum should still be ~10. +; +; CHECK-UR: block-frequency-info: test +; CHECK-UR: - [[ENTRY:.*]]: +; CHECK-UR: - [[DO_BODY:.*]]: float = 2.9078, +; CHECK-UR: - [[DO_BODY_1:.*]]: float = 2.617, +; CHECK-UR: - [[DO_BODY_2:.*]]: float = 2.3553, +; CHECK-UR: - [[DO_BODY_3:.*]]: float = 2.1198, +; CHECK-UR: - [[DO_END:.*]]: + +declare void @f(i32) + +define void @test(i32 %n) { +; CHECK-UR-LABEL: define void @test(i32 %{{.*}}) { +; CHECK-UR: [[ENTRY]]: +; CHECK-UR: br label %[[DO_BODY]] +; CHECK-UR: [[DO_BODY]]: +; CHECK-UR: call void @f +; CHECK-UR: br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY_1]], !prof ![[#PROF:]] +; CHECK-UR: [[DO_BODY_1]]: +; CHECK-UR: call void @f +; CHECK-UR: br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY_2]], !prof ![[#PROF]] +; CHECK-UR: [[DO_BODY_2]]: +; CHECK-UR: call void @f +; CHECK-UR: br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY_3]], !prof ![[#PROF]] +; CHECK-UR: [[DO_BODY_3]]: +; CHECK-UR: call void @f +; CHECK-UR: br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY]], !prof ![[#PROF]], !llvm.loop ![[#LOOP_UR_LATCH:]] +; CHECK-UR: [[DO_END]]: +; CHECK-UR: ret void + +entry: + br label %do.body + +do.body: + %i = phi i32 [ 0, %entry ], [ %inc, %do.body ] + %inc = add i32 %i, 1 + call void @f(i32 %i) + %c = icmp sge i32 %inc, %n + br i1 %c, label %do.end, label %do.body, !prof !0 + +do.end: + ret void +} + +!0 = !{!"branch_weights", i32 1, i32 9} + +; CHECK-UR: ![[#PROF]] = !{!"branch_weights", i32 1, i32 9} +; CHECK-UR: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; CHECK-UR: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 3} +; CHECK-UR: ![[#DISABLE]] = 
!{!"llvm.loop.unroll.disable"} diff --git a/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll b/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll index 26171990a2592..db87143286f93 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll @@ -6,7 +6,10 @@ ; CHECK: br i1 [[COND1:%.*]], label %for.end.loopexit.unr-lcssa, label %for.body, !prof ![[#PROF:]], !llvm.loop ![[#LOOP:]] ; CHECK-LABEL: for.body.epil: ; CHECK: br i1 [[COND2:%.*]], label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !prof ![[#PROF2:]], !llvm.loop ![[#LOOP2:]] -; CHECK: ![[#PROF]] = !{!"branch_weights", i32 1, i32 2499} + +; FIXME: These branch weights are incorrect and should not be merged into main +; until PR #159163, which fixes them. +; CHECK: ![[#PROF]] = !{!"branch_weights", i32 1, i32 9999} ; CHECK: ![[#PROF2]] = !{!"branch_weights", i32 1, i32 1} define i3 @test(ptr %a, i3 %n) { diff --git a/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll b/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll index 611ee5fb5807e..1cd70f1d1dfd3 100644 --- a/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll +++ b/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll @@ -60,5 +60,7 @@ loop.end: !1 = !{!"function_entry_count", i64 1} !2 = !{!"branch_weights", i32 1, i32 1000} -; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 124} +; FIXME: These branch weights are incorrect and should not be merged into main +; until PR #159163, which fixes them. 
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 1000} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 3, i32 1} From c81e6c8d9e905b817c1ab8934ef96fe270d3f644 Mon Sep 17 00:00:00 2001 From: Steven Perron Date: Fri, 31 Oct 2025 10:57:19 -0400 Subject: [PATCH 349/539] [SPIRV] Expand spv_bitcast intrinsic during instruction selection (#164884) The spv_bitcast intrinsic is currently replaced by an OpBitcast during prelegalization. This will cause a problem when we need to legalize the OpBitcast. The legalizer assumes that instruction already lowered to a target specific opcode is legal. We cannot lower it to a G_BITCAST because the bitcasts sometimes the LLT type will be the same, causing an error in the verifier, even if the SPIR-V types will be different. This commit keeps the intrinsic around until instruction selection. We can create rules to legalize a G_INTRINISIC* instruction, and it does not create problem for the verifier. No tests are updated because this change should be invisible to users. --- .../Target/SPIRV/SPIRVInstructionSelector.cpp | 8 +++ llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 61 +++++++++++-------- 2 files changed, 42 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 3fea21e6e694c..3f0424f436c72 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -3151,6 +3151,14 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectInsertElt(ResVReg, ResType, I); case Intrinsic::spv_gep: return selectGEP(ResVReg, ResType, I); + case Intrinsic::spv_bitcast: { + Register OpReg = I.getOperand(2).getReg(); + SPIRVType *OpType = + OpReg.isValid() ? 
GR.getSPIRVTypeForVReg(OpReg) : nullptr; + if (!GR.isBitcastCompatible(ResType, OpType)) + report_fatal_error("incompatible result and operand types in a bitcast"); + return selectOpWithSrcs(ResVReg, ResType, I, {OpReg}, SPIRV::OpBitcast); + } case Intrinsic::spv_unref_global: case Intrinsic::spv_init_global: { MachineInstr *MI = MRI->getVRegDef(I.getOperand(1).getReg()); diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index db6f2d61e8f29..d538009f0ecbe 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -192,31 +192,43 @@ static void buildOpBitcast(SPIRVGlobalRegistry *GR, MachineIRBuilder &MIB, .addUse(OpReg); } -// We do instruction selections early instead of calling MIB.buildBitcast() -// generating the general op code G_BITCAST. When MachineVerifier validates -// G_BITCAST we see a check of a kind: if Source Type is equal to Destination -// Type then report error "bitcast must change the type". This doesn't take into -// account the notion of a typed pointer that is important for SPIR-V where a -// user may and should use bitcast between pointers with different pointee types -// (https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpBitcast). -// It's important for correct lowering in SPIR-V, because interpretation of the -// data type is not left to instructions that utilize the pointer, but encoded -// by the pointer declaration, and the SPIRV target can and must handle the -// declaration and use of pointers that specify the type of data they point to. -// It's not feasible to improve validation of G_BITCAST using just information -// provided by low level types of source and destination. Therefore we don't -// produce G_BITCAST as the general op code with semantics different from -// OpBitcast, but rather lower to OpBitcast immediately. 
As for now, the only -// difference would be that CombinerHelper couldn't transform known patterns -// around G_BUILD_VECTOR. See discussion -// in https://github.com/llvm/llvm-project/pull/110270 for even more context. -static void selectOpBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, - MachineIRBuilder MIB) { +// We lower G_BITCAST to OpBitcast here to avoid a MachineVerifier error. +// The verifier checks if the source and destination LLTs of a G_BITCAST are +// different, but this check is too strict for SPIR-V's typed pointers, which +// may have the same LLT but different SPIRVType (e.g. pointers to different +// pointee types). By lowering to OpBitcast here, we bypass the verifier's +// check. See discussion in https://github.com/llvm/llvm-project/pull/110270 +// for more context. +// +// We also handle the llvm.spv.bitcast intrinsic here. If the source and +// destination SPIR-V types are the same, we lower it to a COPY to enable +// further optimizations like copy propagation. 
+static void lowerBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, + MachineIRBuilder MIB) { SmallVector ToErase; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { + if (isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); + SPIRVType *DstType = GR->getSPIRVTypeForVReg(DstReg); + assert( + DstType && + "Expected destination SPIR-V type to have been assigned already."); + SPIRVType *SrcType = GR->getSPIRVTypeForVReg(SrcReg); + assert(SrcType && + "Expected source SPIR-V type to have been assigned already."); + if (DstType == SrcType) { + MIB.setInsertPt(*MI.getParent(), MI); + MIB.buildCopy(DstReg, SrcReg); + ToErase.push_back(&MI); + continue; + } + } + if (MI.getOpcode() != TargetOpcode::G_BITCAST) continue; + MIB.setInsertPt(*MI.getParent(), MI); buildOpBitcast(GR, MIB, MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); @@ -237,16 +249,11 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, SmallVector ToErase; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { - if (!isSpvIntrinsic(MI, Intrinsic::spv_bitcast) && - !isSpvIntrinsic(MI, Intrinsic::spv_ptrcast)) + if (!isSpvIntrinsic(MI, Intrinsic::spv_ptrcast)) continue; assert(MI.getOperand(2).isReg()); MIB.setInsertPt(*MI.getParent(), MI); ToErase.push_back(&MI); - if (isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) { - MIB.buildBitcast(MI.getOperand(0).getReg(), MI.getOperand(2).getReg()); - continue; - } Register Def = MI.getOperand(0).getReg(); Register Source = MI.getOperand(2).getReg(); Type *ElemTy = getMDOperandAsType(MI.getOperand(3).getMetadata(), 0); @@ -1089,7 +1096,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) { removeImplicitFallthroughs(MF, MIB); insertSpirvDecorations(MF, GR, MIB); insertInlineAsm(MF, GR, ST, MIB); - selectOpBitcasts(MF, GR, MIB); + lowerBitcasts(MF, GR, MIB); return true; } From 
93b88df0fe8ecdd08dd5d5b45d0d85c90a645e61 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 31 Oct 2025 11:01:42 -0400 Subject: [PATCH 350/539] [LoopUnroll] Fix block frequencies for epilogue (#159163) As another step in issue #135812, this patch fixes block frequencies for partial loop unrolling with an epilogue remainder loop. It does not fully handle the case when the epilogue loop itself is unrolled. That will be handled in the next patch. For the guard and latch of each of the unrolled loop and epilogue loop, this patch sets branch weights derived directly from the original loop latch branch weights. The total frequency of the original loop body, summed across all its occurrences in the unrolled loop and epilogue loop, is the same as in the original loop. This patch also sets `llvm.loop.estimated_trip_count` for the epilogue loop instead of relying on the epilogue's latch branch weights to imply it. This patch fixes branch weights in tests that PR #157754 adversely affected. 
--- llvm/include/llvm/Support/BranchProbability.h | 3 + .../include/llvm/Transforms/Utils/LoopUtils.h | 34 ++++ .../llvm/Transforms/Utils/UnrollLoop.h | 4 +- llvm/lib/Support/BranchProbability.cpp | 7 + llvm/lib/Transforms/Utils/LoopUnroll.cpp | 30 ++-- .../Transforms/Utils/LoopUnrollRuntime.cpp | 101 +++++++++-- llvm/lib/Transforms/Utils/LoopUtils.cpp | 48 +++++- .../branch-weights-freq/unroll-epilog.ll | 160 ++++++++++++++++++ .../runtime-exit-phi-scev-invalidation.ll | 4 +- .../LoopUnroll/runtime-loop-branchweight.ll | 61 ++++++- .../Transforms/LoopUnroll/runtime-loop.ll | 9 +- .../LoopUnroll/unroll-heuristics-pgo.ll | 68 ++++++-- 12 files changed, 467 insertions(+), 62 deletions(-) create mode 100644 llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll diff --git a/llvm/include/llvm/Support/BranchProbability.h b/llvm/include/llvm/Support/BranchProbability.h index 42fe225709ef8..b15d6e1707afa 100644 --- a/llvm/include/llvm/Support/BranchProbability.h +++ b/llvm/include/llvm/Support/BranchProbability.h @@ -97,6 +97,9 @@ class BranchProbability { /// \return \c Num divided by \c this. LLVM_ABI uint64_t scaleByInverse(uint64_t Num) const; + /// Compute pow(Probability, N). 
+ BranchProbability pow(unsigned N) const; + BranchProbability &operator+=(BranchProbability RHS) { assert(N != UnknownN && RHS.N != UnknownN && "Unknown probability cannot participate in arithmetics."); diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 2d2355d6be68a..86eb21389756c 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -365,6 +365,40 @@ LLVM_ABI bool setLoopEstimatedTripCount( Loop *L, unsigned EstimatedTripCount, std::optional EstimatedLoopInvocationWeight = std::nullopt); +/// Based on branch weight metadata, return either: +/// - An unknown probability if the implementation is unable to handle the loop +/// form of \p L (e.g., \p L must have a latch block that controls the loop +/// exit). +/// - The probability \c P that, at the end of any iteration, the latch of \p L +/// will start another iteration such that `1 - P` is the probability of +/// exiting the loop. +BranchProbability getLoopProbability(Loop *L); + +/// Set branch weight metadata for the latch of \p L to indicate that, at the +/// end of any iteration, \p P and `1 - P` are the probabilities of starting +/// another iteration and exiting the loop, respectively. Return false if the +/// implementation is unable to handle the loop form of \p L (e.g., \p L must +/// have a latch block that controls the loop exit). Otherwise, return true. +bool setLoopProbability(Loop *L, BranchProbability P); + +/// Based on branch weight metadata, return either: +/// - An unknown probability if the implementation cannot extract the +/// probability (e.g., \p B must have exactly two target labels, so it must be +/// a conditional branch). +/// - The probability \c P that control flows from \p B to its first target +/// label such that `1 - P` is the probability of control flowing to its +/// second target label, or vice-versa if \p ForFirstTarget is false. 
+BranchProbability getBranchProbability(BranchInst *B, bool ForFirstTarget); + +/// Set branch weight metadata for \p B to indicate that \p P and `1 - P` are +/// the probabilities of control flowing to its first and second target labels, +/// respectively, or vice-versa if \p ForFirstTarget is false. Return false if +/// the implementation cannot set the probability (e.g., \p B must have exactly +/// two target labels, so it must be a conditional branch). Otherwise, return +/// true. +bool setBranchProbability(BranchInst *B, BranchProbability P, + bool ForFirstTarget); + /// Check inner loop (L) backedge count is known to be invariant on all /// iterations of its outer loop. If the loop has no parent, this is trivially /// true. diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h index 871c13d972470..a3efc43c62dc3 100644 --- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h @@ -97,7 +97,9 @@ LLVM_ABI bool UnrollRuntimeLoopRemainder( LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const TargetTransformInfo *TTI, bool PreserveLCSSA, unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit, - Loop **ResultLoop = nullptr); + Loop **ResultLoop = nullptr, + std::optional OriginalTripCount = std::nullopt, + BranchProbability OriginalLoopProb = BranchProbability::getUnknown()); LLVM_ABI LoopUnrollResult UnrollAndJamLoop( Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple, diff --git a/llvm/lib/Support/BranchProbability.cpp b/llvm/lib/Support/BranchProbability.cpp index e3763449d16cb..ea42f34b58645 100644 --- a/llvm/lib/Support/BranchProbability.cpp +++ b/llvm/lib/Support/BranchProbability.cpp @@ -111,3 +111,10 @@ uint64_t BranchProbability::scale(uint64_t Num) const { uint64_t BranchProbability::scaleByInverse(uint64_t Num) const { return ::scale<0>(Num, D, N); } + +BranchProbability BranchProbability::pow(unsigned N) 
const { + BranchProbability Res = BranchProbability::getOne(); + for (unsigned I = 0; I < N; ++I) + Res *= *this; + return Res; +} diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 23686448ab5ae..94dfd3a974923 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -501,6 +501,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, const bool MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L); std::optional OriginalTripCount = llvm::getLoopEstimatedTripCount(L); + BranchProbability OriginalLoopProb = llvm::getLoopProbability(L); // Effectively "DCE" unrolled iterations that are beyond the max tripcount // and will never be executed. @@ -591,11 +592,11 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, : isEpilogProfitable(L); if (ULO.Runtime && - !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount, - EpilogProfitability, ULO.UnrollRemainder, - ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI, - PreserveLCSSA, ULO.SCEVExpansionBudget, - ULO.RuntimeUnrollMultiExit, RemainderLoop)) { + !UnrollRuntimeLoopRemainder( + L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability, + ULO.UnrollRemainder, ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI, + PreserveLCSSA, ULO.SCEVExpansionBudget, ULO.RuntimeUnrollMultiExit, + RemainderLoop, OriginalTripCount, OriginalLoopProb)) { if (ULO.Force) ULO.Runtime = false; else { @@ -1129,13 +1130,13 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, LI->erase(L); // We shouldn't try to use `L` anymore. L = nullptr; - } else if (OriginalTripCount) { + } else { // Update metadata for the loop's branch weights and estimated trip count: // - If ULO.Runtime, UnrollRuntimeLoopRemainder sets the guard branch // weights, latch branch weights, and estimated trip count of the // remainder loop it creates. It also sets the branch weights for the // unrolled loop guard it creates. 
The branch weights for the unrolled - // loop latch are adjusted below. FIXME: Actually handle ULO.Runtime. + // loop latch are adjusted below. FIXME: Handle prologue loops. // - Otherwise, if unrolled loop iteration latches become unconditional, // branch weights are adjusted above. FIXME: Actually handle such // unconditional latches. @@ -1158,10 +1159,17 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // the unrolled loop as a whole without considering the branch weights for // each unrolled iteration's latch within it, we store the new trip count as // separate metadata. - unsigned NewTripCount = *OriginalTripCount / ULO.Count; - if (!ULO.Runtime && *OriginalTripCount % ULO.Count) - NewTripCount += 1; - setLoopEstimatedTripCount(L, NewTripCount); + if (!OriginalLoopProb.isUnknown() && ULO.Runtime && EpilogProfitability) { + // Where p is always the probability of executing at least 1 more + // iteration, the probability for at least n more iterations is p^n. + setLoopProbability(L, OriginalLoopProb.pow(ULO.Count)); + } + if (OriginalTripCount) { + unsigned NewTripCount = *OriginalTripCount / ULO.Count; + if (!ULO.Runtime && *OriginalTripCount % ULO.Count) + ++NewTripCount; + setLoopEstimatedTripCount(L, NewTripCount); + } } // LoopInfo should not be valid, confirm that. 
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 7a2b8da6ffd21..1e8f6cc76900c 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -40,6 +40,7 @@ #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include using namespace llvm; @@ -195,6 +196,21 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, } } +/// Assume, due to our position in the remainder loop or its guard, anywhere +/// from 0 to \p N more iterations can possibly execute. Among such cases in +/// the original loop (with loop probability \p OriginalLoopProb), what is the +/// probability of executing at least one more iteration? +static BranchProbability +probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) { + // Each of these variables holds the original loop's probability that the + // number of iterations it will execute is some m in the specified range. + BranchProbability ProbOne = OriginalLoopProb; // 1 <= m + BranchProbability ProbTooMany = ProbOne.pow(N + 1); // N + 1 <= m + BranchProbability ProbNotTooMany = ProbTooMany.getCompl(); // 0 <= m <= N + BranchProbability ProbOneNotTooMany = ProbOne - ProbTooMany; // 1 <= m <= N + return ProbOneNotTooMany / ProbNotTooMany; +} + /// Connect the unrolling epilog code to the original loop. 
/// The unrolling epilog code contains code to execute the /// 'extra' iterations if the run-time trip count modulo the @@ -221,7 +237,8 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader, ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE, - unsigned Count, AssumptionCache &AC) { + unsigned Count, AssumptionCache &AC, + BranchProbability OriginalLoopProb) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); BasicBlock *EpilogLatch = cast(VMap[Latch]); @@ -332,12 +349,19 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, PreserveLCSSA); // Add the branch to the exit block (around the epilog loop) MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*Latch->getTerminator())) { + if (OriginalLoopProb.isUnknown() && + hasBranchWeightMD(*Latch->getTerminator())) { // Assume equal distribution in interval [0, Count). MDBuilder MDB(B.getContext()); BranchWeights = MDB.createBranchWeights(1, Count - 1); } - B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights); + BranchInst *RemainderLoopGuard = + B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights); + if (!OriginalLoopProb.isUnknown()) { + setBranchProbability(RemainderLoopGuard, + probOfNextInRemainder(OriginalLoopProb, Count - 1), + /*ForFirstTarget=*/true); + } InsertPt->eraseFromParent(); if (DT) { auto *NewDom = DT->findNearestCommonDominator(Exit, NewExit); @@ -357,14 +381,15 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, /// The cloned blocks should be inserted between InsertTop and InsertBot. /// InsertTop should be new preheader, InsertBot new loop exit. /// Returns the new cloned loop that is created. 
-static Loop * -CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, - const bool UnrollRemainder, - BasicBlock *InsertTop, - BasicBlock *InsertBot, BasicBlock *Preheader, +static Loop *CloneLoopBlocks(Loop *L, Value *NewIter, + const bool UseEpilogRemainder, + const bool UnrollRemainder, BasicBlock *InsertTop, + BasicBlock *InsertBot, BasicBlock *Preheader, std::vector &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, - DominatorTree *DT, LoopInfo *LI, unsigned Count) { + DominatorTree *DT, LoopInfo *LI, unsigned Count, + std::optional OriginalTripCount, + BranchProbability OriginalLoopProb) { StringRef suffix = UseEpilogRemainder ? "epil" : "prol"; BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); @@ -419,7 +444,8 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, Builder.CreateAdd(NewIdx, One, NewIdx->getName() + ".next"); Value *IdxCmp = Builder.CreateICmpNE(IdxNext, NewIter, NewIdx->getName() + ".cmp"); MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*LatchBR)) { + if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) && + hasBranchWeightMD(*LatchBR)) { uint32_t ExitWeight; uint32_t BackEdgeWeight; if (Count >= 3) { @@ -437,7 +463,29 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, MDBuilder MDB(Builder.getContext()); BranchWeights = MDB.createBranchWeights(BackEdgeWeight, ExitWeight); } - Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights); + BranchInst *RemainderLoopLatch = + Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights); + if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) { + // Compute the total frequency of the original loop body from the + // remainder iterations. Once we've reached them, the first of them + // always executes, so its frequency and probability are 1. 
+ double FreqRemIters = 1; + if (Count > 2) { + BranchProbability ProbReaching = BranchProbability::getOne(); + for (unsigned N = Count - 2; N >= 1; --N) { + ProbReaching *= probOfNextInRemainder(OriginalLoopProb, N); + FreqRemIters += double(ProbReaching.getNumerator()) / + ProbReaching.getDenominator(); + } + } + // Solve for the loop probability that would produce that frequency. + // Sum(i=0..inf)(Prob^i) = 1/(1-Prob) = FreqRemIters. + double ProbDouble = 1 - 1 / FreqRemIters; + BranchProbability Prob = BranchProbability::getBranchProbability( + std::round(ProbDouble * BranchProbability::getDenominator()), + BranchProbability::getDenominator()); + setBranchProbability(RemainderLoopLatch, Prob, /*ForFirstTarget=*/true); + } NewIdx->addIncoming(Zero, InsertTop); NewIdx->addIncoming(IdxNext, NewBB); LatchBR->eraseFromParent(); @@ -461,6 +509,9 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, Loop *NewLoop = NewLoops[L]; assert(NewLoop && "L should have been cloned"); + if (OriginalTripCount && UseEpilogRemainder) + setLoopEstimatedTripCount(NewLoop, *OriginalTripCount % Count); + // Add unroll disable metadata to disable future unrolling for this loop. if (!UnrollRemainder) NewLoop->setLoopAlreadyUnrolled(); @@ -588,7 +639,8 @@ bool llvm::UnrollRuntimeLoopRemainder( LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const TargetTransformInfo *TTI, bool PreserveLCSSA, unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit, - Loop **ResultLoop) { + Loop **ResultLoop, std::optional OriginalTripCount, + BranchProbability OriginalLoopProb) { LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n"); LLVM_DEBUG(L->dump()); LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n" @@ -808,12 +860,23 @@ bool llvm::UnrollRuntimeLoopRemainder( BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit; // Branch to either remainder (extra iterations) loop or unrolling loop. 
MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*Latch->getTerminator())) { + if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) && + hasBranchWeightMD(*Latch->getTerminator())) { // Assume loop is nearly always entered. MDBuilder MDB(B.getContext()); BranchWeights = MDB.createBranchWeights(EpilogHeaderWeights); } - B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights); + BranchInst *UnrollingLoopGuard = + B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights); + if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) { + // The original loop's first iteration always happens. Compute the + // probability of the original loop executing Count-1 iterations after that + // to complete the first iteration of the unrolled loop. + BranchProbability ProbOne = OriginalLoopProb; + BranchProbability ProbRest = ProbOne.pow(Count - 1); + setBranchProbability(UnrollingLoopGuard, ProbRest, + /*ForFirstTarget=*/false); + } PreHeaderBR->eraseFromParent(); if (DT) { if (UseEpilogRemainder) @@ -840,9 +903,10 @@ bool llvm::UnrollRuntimeLoopRemainder( // iterations. This function adds the appropriate CFG connections. BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit; BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; - Loop *remainderLoop = CloneLoopBlocks( - L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, InsertBot, - NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI, Count); + Loop *remainderLoop = + CloneLoopBlocks(L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, + InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, + LI, Count, OriginalTripCount, OriginalLoopProb); // Insert the cloned blocks into the function. F->splice(InsertBot->getIterator(), F, NewBlocks[0]->getIterator(), F->end()); @@ -941,7 +1005,8 @@ bool llvm::UnrollRuntimeLoopRemainder( // Connect the epilog code to the original loop and update the // PHI functions. 
ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader, - NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC); + NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC, + OriginalLoopProb); // Update counter in loop for unrolling. // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count. diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index b6ba82288aeb4..8be471bee5579 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -962,13 +962,51 @@ bool llvm::setLoopEstimatedTripCount( if (LatchBranch->getSuccessor(0) != L->getHeader()) std::swap(BackedgeTakenWeight, LatchExitWeight); - MDBuilder MDB(LatchBranch->getContext()); - // Set/Update profile metadata. - LatchBranch->setMetadata( - LLVMContext::MD_prof, - MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight)); + setBranchWeights(*LatchBranch, {BackedgeTakenWeight, LatchExitWeight}, + /*IsExpected=*/false); + + return true; +} + +BranchProbability llvm::getLoopProbability(Loop *L) { + BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); + if (!LatchBranch) + return BranchProbability::getUnknown(); + bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader(); + return getBranchProbability(LatchBranch, FirstTargetIsLoop); +} +bool llvm::setLoopProbability(Loop *L, BranchProbability P) { + BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); + if (!LatchBranch) + return false; + bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader(); + return setBranchProbability(LatchBranch, P, FirstTargetIsLoop); +} + +BranchProbability llvm::getBranchProbability(BranchInst *B, + bool ForFirstTarget) { + if (B->getNumSuccessors() != 2) + return BranchProbability::getUnknown(); + uint64_t Weight0, Weight1; + if (!extractBranchWeights(*B, Weight0, Weight1)) + return BranchProbability::getUnknown(); + if (!ForFirstTarget) + std::swap(Weight0, 
Weight1); + return BranchProbability::getBranchProbability(Weight0, Weight0 + Weight1); +} + +bool llvm::setBranchProbability(BranchInst *B, BranchProbability P, + bool ForFirstTarget) { + if (B->getNumSuccessors() != 2) + return false; + BranchProbability Prob0 = P; + BranchProbability Prob1 = P.getCompl(); + if (!ForFirstTarget) + std::swap(Prob0, Prob1); + setBranchWeights(*B, {Prob0.getNumerator(), Prob1.getNumerator()}, + /*IsExpected=*/false); return true; } diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll new file mode 100644 index 0000000000000..96b31d801c2f9 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll @@ -0,0 +1,160 @@ +; Test branch weight metadata, estimated trip count metadata, and block +; frequencies after loop unrolling with an epilogue. + +; ------------------------------------------------------------------------------ +; Define substitutions. +; +; Check original loop body frequency. +; DEFINE: %{bf-fc} = opt %s -S -passes='print' 2>&1 | \ +; DEFINE: FileCheck %s -check-prefixes +; +; Unroll loops and then check block frequency. The -implicit-check-not options +; make sure that no additional labels or @f calls show up. +; DEFINE: %{ur-bf} = opt %s -S -passes='loop-unroll,print' 2>&1 +; DEFINE: %{fc} = FileCheck %s \ +; DEFINE: -implicit-check-not='{{^( *- )?[^ ;]*:}}' \ +; DEFINE: -implicit-check-not='call void @f' -check-prefixes + +; ------------------------------------------------------------------------------ +; Check various interesting unroll count values relative to the original loop's +; estimated trip count of 11 (e.g., minimum and boundary values). 
+; +; RUN: %{bf-fc} ALL,ORIG +; RUN: %{ur-bf} -unroll-count=2 -unroll-runtime | %{fc} ALL,UR,UR2 +; RUN: %{ur-bf} -unroll-count=4 -unroll-runtime | %{fc} ALL,UR,UR4 +; RUN: %{ur-bf} -unroll-count=10 -unroll-runtime | %{fc} ALL,UR,UR10 +; RUN: %{ur-bf} -unroll-count=11 -unroll-runtime | %{fc} ALL,UR,UR11 +; RUN: %{ur-bf} -unroll-count=12 -unroll-runtime | %{fc} ALL,UR,UR12 + +; ------------------------------------------------------------------------------ +; Check the iteration frequencies, which, when each is multiplied by the number +; of original loop bodies that execute within it, should sum to almost exactly +; the original loop body frequency. +; +; ALL-LABEL: block-frequency-info: test +; +; ORIG: - [[ENTRY:.*]]: +; ORIG: - [[DO_BODY:.*]]: float = 11.0, +; ORIG: - [[DO_END:.*]]: +; +; UR: - [[ENTRY:.*]]: +; UR: - [[ENTRY_NEW:.*]]: +; UR2: - [[DO_BODY:.*]]: float = 5.2381, +; UR4: - [[DO_BODY:.*]]: float = 2.3702, +; UR10: - [[DO_BODY:.*]]: float = 0.6902, +; UR11: - [[DO_BODY:.*]]: float = 0.59359, +; UR12: - [[DO_BODY:.*]]: float = 0.5144, +; UR: - [[DO_END_UNR_LCSSA:.*]]: +; UR: - [[DO_BODY_EPIL_PREHEADER:.*]]: +; UR2: - [[DO_BODY_EPIL:.*]]: float = 0.52381, +; UR4: - [[DO_BODY_EPIL:.*]]: float = 1.5193, +; UR10: - [[DO_BODY_EPIL:.*]]: float = 4.098, +; UR11: - [[DO_BODY_EPIL:.*]]: float = 4.4705, +; UR12: - [[DO_BODY_EPIL:.*]]: float = 4.8272, +; UR4: - [[DO_END_EPILOG_LCSSA:.*]]: +; UR10: - [[DO_END_EPILOG_LCSSA:.*]]: +; UR11: - [[DO_END_EPILOG_LCSSA:.*]]: +; UR12: - [[DO_END_EPILOG_LCSSA:.*]]: +; UR: - [[DO_END:.*]]: + +; ------------------------------------------------------------------------------ +; Check the CFGs, including the number of original loop bodies that appear +; within each unrolled iteration. 
+; +; UR-LABEL: define void @test(i32 %{{.*}}) { +; UR: [[ENTRY]]: +; UR: br i1 %{{.*}}, label %[[DO_BODY_EPIL_PREHEADER]], label %[[ENTRY_NEW]], !prof ![[#PROF_UR_GUARD:]]{{$}} +; UR: [[ENTRY_NEW]]: +; UR: br label %[[DO_BODY]] +; UR: [[DO_BODY]]: +; UR2-COUNT-2: call void @f +; UR4-COUNT-4: call void @f +; UR10-COUNT-10: call void @f +; UR11-COUNT-11: call void @f +; UR12-COUNT-12: call void @f +; UR: br i1 %{{.*}}, label %[[DO_END_UNR_LCSSA]], label %[[DO_BODY]], !prof ![[#PROF_UR_LATCH:]], !llvm.loop ![[#LOOP_UR_LATCH:]]{{$}} +; UR: [[DO_END_UNR_LCSSA]]: +; UR: br i1 %{{.*}}, label %[[DO_BODY_EPIL_PREHEADER]], label %[[DO_END:.*]], !prof ![[#PROF_RM_GUARD:]]{{$}} +; UR: [[DO_BODY_EPIL_PREHEADER]]: +; UR: br label %[[DO_BODY_EPIL]] +; UR: [[DO_BODY_EPIL]]: +; UR: call void @f +; UR4: br i1 %{{.*}}, label %[[DO_BODY_EPIL]], label %[[DO_END_EPILOG_LCSSA]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]{{$}} +; UR10: br i1 %{{.*}}, label %[[DO_BODY_EPIL]], label %[[DO_END_EPILOG_LCSSA]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]{{$}} +; UR11: br i1 %{{.*}}, label %[[DO_BODY_EPIL]], label %[[DO_END_EPILOG_LCSSA]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]{{$}} +; UR12: br i1 %{{.*}}, label %[[DO_BODY_EPIL]], label %[[DO_END_EPILOG_LCSSA]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]{{$}} +; UR4: [[DO_END_EPILOG_LCSSA]]: +; UR10: [[DO_END_EPILOG_LCSSA]]: +; UR11: [[DO_END_EPILOG_LCSSA]]: +; UR12: [[DO_END_EPILOG_LCSSA]]: +; UR: br label %[[DO_END]] +; UR: [[DO_END]]: +; UR: ret void + +declare void @f(i32) + +define void @test(i32 %n) { +entry: + br label %do.body + +do.body: + %i = phi i32 [ 0, %entry ], [ %inc, %do.body ] + %inc = add i32 %i, 1 + call void @f(i32 %i) + %c = icmp sge i32 %inc, %n + br i1 %c, label %do.end, label %do.body, !prof !0 + +do.end: + ret void +} + +!0 = !{!"branch_weights", i32 1, i32 10} + +; ------------------------------------------------------------------------------ +; 
Check branch weight metadata and estimated trip count metadata. +; +; UR2: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 195225786, i32 1952257862} +; UR4: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 534047398, i32 1613436250} +; UR10: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 1236740947, i32 910742701} +; UR11: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 1319535738, i32 827947910} +; UR12: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 1394803730, i32 752679918} +; +; UR2: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 372703773, i32 1774779875} +; UR4: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 680723421, i32 1466760227} +; UR10: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 1319535738, i32 827947910} +; UR11: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 1394803730, i32 752679918} +; UR12: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 1463229177, i32 684254471} +; +; UR2: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; UR4: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; UR10: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; UR11: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; UR12: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; +; UR2: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 5} +; UR4: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 2} +; UR10: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 1} +; UR11: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 1} +; UR12: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 0} +; UR: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"} +; +; UR2: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1022611260, i32 1124872388} +; UR4: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1531603292, i32 615880356} +; UR10: ![[#PROF_RM_GUARD]] 
= !{!"branch_weights", i32 1829762672, i32 317720976} +; UR11: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1846907894, i32 300575754} +; UR12: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1860963812, i32 286519836} +; +; UR4: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1038564635, i32 1108919013} +; UR10: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1656332913, i32 491150735} +; UR11: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1693034047, i32 454449601} +; UR12: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1723419551, i32 424064097} + +; UR4: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]} +; UR10: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; UR11: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]} +; UR12: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]} +; +; UR4: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 3} +; For UR10, llvm.loop.estimated_trip_count is the same for both loops. 
+; UR11: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 0} +; UR12: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 11} diff --git a/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll b/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll index 0c52b5a0edef8..047360178aa06 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll @@ -188,7 +188,7 @@ define void @pr56286(i64 %x, ptr %src, ptr %dst, ptr %ptr.src) !prof !0 { ; CHECK-NEXT: [[L_1_LCSSA_UNR:%.*]] = phi i32 [ poison, [[OUTER_HEADER]] ], [ [[L_1_LCSSA_UNR_PH]], [[INNER_1_HEADER_PROL_LOOPEXIT_UNR_LCSSA]] ] ; CHECK-NEXT: [[INNER_1_IV_UNR:%.*]] = phi i64 [ [[X]], [[OUTER_HEADER]] ], [ [[INNER_1_IV_UNR_PH]], [[INNER_1_HEADER_PROL_LOOPEXIT_UNR_LCSSA]] ] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 7 -; CHECK-NEXT: br i1 [[TMP4]], label [[OUTER_MIDDLE:%.*]], label [[OUTER_HEADER_NEW:%.*]], !prof [[PROF3]] +; CHECK-NEXT: br i1 [[TMP4]], label [[OUTER_MIDDLE:%.*]], label [[OUTER_HEADER_NEW:%.*]], !prof [[PROF6:![0-9]+]] ; CHECK: outer.header.new: ; CHECK-NEXT: br label [[INNER_1_HEADER:%.*]] ; CHECK: inner.1.header: @@ -232,7 +232,7 @@ define void @pr56286(i64 %x, ptr %src, ptr %dst, ptr %ptr.src) !prof !0 { ; CHECK-NEXT: store i32 [[L_1_7]], ptr [[DST]], align 8 ; CHECK-NEXT: [[INNER_1_IV_NEXT_7]] = add i64 [[INNER_1_IV]], 8 ; CHECK-NEXT: [[CMP_2_7:%.*]] = icmp sgt i64 [[INNER_1_IV_NEXT_6]], 0 -; CHECK-NEXT: br i1 [[CMP_2_7]], label [[OUTER_MIDDLE_UNR_LCSSA:%.*]], label [[INNER_1_HEADER]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_2_7]], label [[OUTER_MIDDLE_UNR_LCSSA:%.*]], label [[INNER_1_HEADER]], !prof [[PROF7:![0-9]+]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: outer.middle.unr-lcssa: ; CHECK-NEXT: [[L_1_LCSSA_PH:%.*]] = phi i32 [ [[L_1_7]], [[INNER_1_LATCH_7]] ] ; CHECK-NEXT: br label [[OUTER_MIDDLE]] diff 
--git a/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll b/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll index db87143286f93..2f8f98d40e86f 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll @@ -2,15 +2,24 @@ ;; Check that the remainder loop is properly assigned a branch weight for its latch branch. ; CHECK-LABEL: @test( -; CHECK-LABEL: for.body: -; CHECK: br i1 [[COND1:%.*]], label %for.end.loopexit.unr-lcssa, label %for.body, !prof ![[#PROF:]], !llvm.loop ![[#LOOP:]] -; CHECK-LABEL: for.body.epil: -; CHECK: br i1 [[COND2:%.*]], label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !prof ![[#PROF2:]], !llvm.loop ![[#LOOP2:]] - -; FIXME: These branch weights are incorrect and should not be merged into main -; until PR #159163, which fixes them. -; CHECK: ![[#PROF]] = !{!"branch_weights", i32 1, i32 9999} -; CHECK: ![[#PROF2]] = !{!"branch_weights", i32 1, i32 1} +; CHECK-LABEL: entry: +; CHECK: [[FOR_BODY_PREHEADER:.*]]: +; CHECK: br i1 %{{.*}}, label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]], !prof ![[#PROF_UR_GUARD:]] +; CHECK: [[FOR_BODY_PREHEADER_NEW]]: +; CHECK: br label %for.body +; CHECK: for.body: +; CHECK: %add = add +; CHECK: %add.1 = add +; CHECK: %add.2 = add +; CHECK: %add.3 = add +; CHECK-NOT: %add.4 = add +; CHECK: br i1 %{{.*}}, label %[[FOR_END_LOOPEXIT_UNR_LCSSA:.*]], label %for.body, !prof ![[#PROF_UR_LATCH:]], !llvm.loop ![[#LOOP_UR_LATCH:]] +; CHECK: [[FOR_END_LOOPEXIT_UNR_LCSSA]]: +; CHECK: br i1 %{{.*}}, label %[[FOR_BODY_EPIL_PREHEADER]], label %[[FOR_END_LOOPEXIT:.*]], !prof ![[#PROF_RM_GUARD:]] +; CHECK: [[FOR_BODY_EPIL_PREHEADER]]: +; CHECK: br label %[[FOR_BODY_EPIL:.*]] +; CHECK: [[FOR_BODY_EPIL]]: +; CHECK: br i1 {{.*}}, label %[[FOR_BODY_EPIL]], label %[[FOR_END_LOOPEXIT_EPILOG_LCSSA:.*]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]] define i3 @test(ptr %a, i3 %n) { 
entry: @@ -34,3 +43,37 @@ for.end: } !0 = !{!"branch_weights", i32 1, i32 9999} + +; Original loop probability: p = 9999/(1+9999) = 0.9999 +; Original estimated trip count: (1+9999)/1 = 10000 +; Unroll count: 4 + +; Probability of >=3 iterations after first: p^3 = 0.9970003 =~ +; 2146839468 / (644180 + 2146839468). +; CHECK: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 644180, i32 2146839468} + +; Probability of >=4 more iterations: p^4 = 0.99960006 =~ +; 2146624784 / (858864 + 2146624784). +; CHECK: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 858864, i32 2146624784} + +; 10000//4 = 2500 +; CHECK: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; CHECK: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 2500} +; +; CHECK: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"} + +; Probability of 1 to 3 more of 3 more remainder iterations: +; (p-p^4)/(1-p^4) = 0.749962497 =~ 1610532724 / (1610532724 + 536950924). +; CHECK: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1610532724, i32 536950924} + +; Frequency of first remainder iter: r1 = 1 +; Frequency of second remainder iter: r2 = r1*(p-p^3)/(1-p^3) = 0.666633331 +; Frequency of third remainder iter: r3 = r2*(p-p^2)/(1-p^2) = 0.333299999 +; Solve for loop probability that produces that frequency: f = 1/(1-p') => +; p' = 1-1/f = 1-1/(r1+r2+r3) = 0.499983332 =~ +; 1073706403 / (1073706403 + 1073777245). 
+; CHECK: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1073706403, i32 1073777245} + +; 10000%4 = 0 +; CHECK: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]} +; CHECK: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 0} diff --git a/llvm/test/Transforms/LoopUnroll/runtime-loop.ll b/llvm/test/Transforms/LoopUnroll/runtime-loop.ll index 492de063573be..ec7aba432b484 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-loop.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-loop.ll @@ -295,11 +295,12 @@ exit2.loopexit: ; COMMON-LABEL: {{^}}!0 = ; EPILOG: [[EPILOG_PROF_0]] = !{!"branch_weights", i32 1, i32 11} -; EPILOG: [[EPILOG_PROF_1]] = !{!"branch_weights", i32 1, i32 127} -; EPILOG: [[EPILOG_PROF_2]] = !{!"branch_weights", i32 1, i32 7} -; EPILOG: [[EPILOG_PROF_3]] = !{!"branch_weights", i32 3, i32 1} +; EPILOG: [[EPILOG_PROF_1]] = !{!"branch_weights", i32 326124004, i32 1821359644} +; EPILOG: [[EPILOG_PROF_2]] = !{!"branch_weights", i32 1856428066, i32 291055582} +; EPILOG: [[EPILOG_PROF_3]] = !{!"branch_weights", i32 1597681585, i32 549802063} -; EPILOG: [[EPILOG_LOOP]] = distinct !{[[EPILOG_LOOP]], [[EPILOG_LOOP_1:![0-9]+]]} +; EPILOG: [[EPILOG_LOOP]] = distinct !{[[EPILOG_LOOP]], [[EPILOG_TC:![0-9]+]], [[EPILOG_LOOP_1:![0-9]+]]} +; EPILOG: [[EPILOG_TC]] = !{!"llvm.loop.estimated_trip_count", i32 3} ; EPILOG: [[EPILOG_LOOP_1]] = !{!"llvm.loop.unroll.disable"} ; PROLOG: [[PROLOG_PROF_0]] = !{!"branch_weights", i32 1, i32 11} diff --git a/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll b/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll index 1cd70f1d1dfd3..02f5bf932132e 100644 --- a/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll +++ b/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll @@ -3,14 +3,27 @@ @known_constant = internal unnamed_addr constant [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16 ; CHECK-LABEL: @bar_prof -; CHECK: loop: -; 
CHECK: %mul = mul -; CHECK: %mul.1 = mul -; CHECK: %mul.2 = mul -; CHECK: %mul.3 = mul -; CHECK: br i1 %niter.ncmp.7, label %loop.end.unr-lcssa, label %loop, !prof [[PROF0:![0-9]+]] -; CHECK: loop.epil: -; CHECK: br i1 %epil.iter.cmp, label %loop.epil, label %loop.end.epilog-lcssa, !prof [[PROF1:![0-9]+]], !llvm.loop {{![0-9]+}} +; CHECK: entry: +; CHECK: br i1 %{{.*}}, label %[[LOOP_EPIL_PREHEADER:.*]], label %[[ENTRY_NEW:.*]], !prof ![[#PROF_UR_GUARD:]] +; CHECK: [[ENTRY_NEW]]: +; CHECK: br label %loop +; CHECK: loop: +; CHECK: %mul = mul +; CHECK: %mul.1 = mul +; CHECK: %mul.2 = mul +; CHECK: %mul.3 = mul +; CHECK: %mul.4 = mul +; CHECK: %mul.5 = mul +; CHECK: %mul.6 = mul +; CHECK: %mul.7 = mul +; CHECK-NOT: %mul.8 = mul +; CHECK: br i1 %{{.*}}, label %[[LOOP_END_UNR_LCSSA:.*]], label %loop, !prof ![[#PROF_UR_LATCH:]], !llvm.loop ![[#LOOP_UR_LATCH:]] +; CHECK: [[LOOP_END_UNR_LCSSA]]: +; CHECK: br i1 %{{.*}}, label %[[LOOP_EPIL_PREHEADER]], label %loop.end, !prof ![[#PROF_RM_GUARD:]] +; CHECK: [[LOOP_EPIL_PREHEADER]]: +; CHECK: br label %[[LOOP_EPIL:.*]] +; CHECK: [[LOOP_EPIL]]: +; CHECK: br i1 %{{.*}}, label %[[LOOP_EPIL]], label %[[LOOP_END_EPILOG_LCSSA:.*]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]] define i32 @bar_prof(ptr noalias nocapture readonly %src, i64 %c) !prof !1 { entry: br label %loop @@ -60,7 +73,38 @@ loop.end: !1 = !{!"function_entry_count", i64 1} !2 = !{!"branch_weights", i32 1, i32 1000} -; FIXME: These branch weights are incorrect and should not be merged into main -; until PR #159163, which fixes them. -; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 1000} -; CHECK: [[PROF1]] = !{!"branch_weights", i32 3, i32 1} +; Original loop probability: p = 1000/(1+1000) = 0.999000999 +; Original estimated trip count: (1+1000)/1 = 1001 +; Unroll count: 8 + +; Probability of >=7 iterations after first: p^7 = 0.993027916 =~ +; 2132511214 / (14972434 + 2132511214). 
+; CHECK: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 14972434, i32 2132511214} + +; Probability of >=8 more iterations: p^8 = 0.99203588 =~ +; 2130380833 / (17102815 + 2130380833). +; CHECK: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 17102815, i32 2130380833} + +; 1001//8 = 125 +; CHECK: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]]} +; CHECK: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 125} + +; Probability of 1 to 7 more of 7 more remainder iterations: +; (p-p^8)/(1-p^8) = 0.874562282 =~ 1878108210 / (1878108210 + 269375438). +; CHECK: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1878108210, i32 269375438} + +; Frequency of first remainder iter: r1 = 1 +; Frequency of second remainder iter: r2 = r1*(p-p^7)/(1-p^7) = 0.856714143 +; Frequency of third remainder iter: r3 = r2*(p-p^6)/(1-p^6) = 0.713571429 +; Frequency of fourth remainder iter: r4 = r2*(p-p^5)/(1-p^5) = 0.570571715 +; Frequency of fifth remainder iter: r5 = r2*(p-p^4)/(1-p^4) = 0.427714858 +; Frequency of sixth remainder iter: r6 = r2*(p-p^3)/(1-p^3) = 0.285000715 +; Frequency of seventh remainder iter: r7 = r2*(p-p^2)/(1-p^2) = 0.142429143 +; Solve for loop probability that produces that frequency: f = 1/(1-p') => +; p' = 1-1/f = 1-1/(r1+r2+r3+r4+r5+r6+r7) = 0.749749875 =~ +; 1610075606 / (1610075606 + 537408042). 
+; CHECK: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1610075606, i32 537408042} + +; Remainder estimated trip count: 1001%8 = 1 +; CHECK: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]} +; CHECK: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 1} From 28d46a146b5a597565e6e83a95d8e9c939d39c50 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 31 Oct 2025 15:17:55 +0000 Subject: [PATCH 351/539] [X86] bittest-big-integer.ll - add missing AVX2/AVX512 checks (#165856) These were removed in #165742 but separate tests from #165758 still need them --- llvm/test/CodeGen/X86/bittest-big-integer.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 5776c6c82bcc3..dffe9005094ab 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE -; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX -; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512 ; bt/btc/btr/bts patterns + 'init' to set single bit value in large integers From 35d63a2db07e267c2af125dcdc70f1ad299cf175 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 31 Oct 2025 08:56:30 -0700 Subject: [PATCH 352/539] [BOLT] Refactor handling of branch targets. NFCI (#165828) Refactor code that verifies external branch destinations and creates secondary entry points. 
--- bolt/include/bolt/Core/BinaryContext.h | 10 +++++++ bolt/lib/Core/BinaryContext.cpp | 32 +++++++++++++++-------- bolt/lib/Core/BinaryFunction.cpp | 19 ++++---------- bolt/test/AArch64/constant-island-entry.s | 2 +- 4 files changed, 37 insertions(+), 26 deletions(-) diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 5cbc28fb38a33..085c0265de3ed 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -932,6 +932,16 @@ class BinaryContext { std::pair handleAddressRef(uint64_t Address, BinaryFunction &BF, bool IsPCRel); + /// When \p Address inside function \p BF is a target of a control transfer + /// instruction (branch) from another function, return a corresponding symbol + /// that should be used by the branch. For example, main or secondary entry + /// point. + /// + /// If \p Address is an invalid destination, such as a constant island, return + /// nullptr and mark \p BF as ignored, since we cannot properly handle a + /// branch to a constant island. + MCSymbol *handleExternalBranchTarget(uint64_t Address, BinaryFunction &BF); + /// Analyze memory contents at the given \p Address and return the type of /// memory contents (such as a possible jump table). 
MemoryContentsType analyzeMemoryAt(uint64_t Address, BinaryFunction &BF); diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index c33540ada8a05..a383ced1712e3 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -518,6 +518,23 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF, return std::make_pair(TargetSymbol, 0); } +MCSymbol *BinaryContext::handleExternalBranchTarget(uint64_t Address, + BinaryFunction &BF) { + if (BF.isInConstantIsland(Address)) { + BF.setIgnored(); + this->outs() << "BOLT-WARNING: ignoring entry point at address 0x" + << Twine::utohexstr(Address) + << " in constant island of function " << BF << '\n'; + return nullptr; + } + + const uint64_t Offset = Address - BF.getAddress(); + assert(Offset < BF.getSize() && + "Address should be inside the referenced function"); + + return Offset ? BF.addEntryPointAtOffset(Offset) : BF.getSymbol(); +} + MemoryContentsType BinaryContext::analyzeMemoryAt(uint64_t Address, BinaryFunction &BF) { if (!isX86()) @@ -1399,17 +1416,10 @@ void BinaryContext::processInterproceduralReferences() { << Function.getPrintName() << " and " << TargetFunction->getPrintName() << '\n'; } - if (uint64_t Offset = Address - TargetFunction->getAddress()) { - if (!TargetFunction->isInConstantIsland(Address)) { - TargetFunction->addEntryPointAtOffset(Offset); - } else { - TargetFunction->setIgnored(); - this->outs() << "BOLT-WARNING: Ignoring entry point at address 0x" - << Twine::utohexstr(Address) - << " in constant island of function " << *TargetFunction - << '\n'; - } - } + + // Create an extra entry point if needed. Can also render the target + // function ignored if the reference is invalid. 
+ handleExternalBranchTarget(Address, *TargetFunction); continue; } diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index fbe186454351c..ddaad6eef6140 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -1697,21 +1697,12 @@ bool BinaryFunction::scanExternalRefs() { if (!TargetFunction || ignoreFunctionRef(*TargetFunction)) continue; - const uint64_t FunctionOffset = - TargetAddress - TargetFunction->getAddress(); - if (!TargetFunction->isInConstantIsland(TargetAddress)) { - BranchTargetSymbol = - FunctionOffset - ? TargetFunction->addEntryPointAtOffset(FunctionOffset) - : TargetFunction->getSymbol(); - } else { - TargetFunction->setIgnored(); - BC.outs() << "BOLT-WARNING: Ignoring entry point at address 0x" - << Twine::utohexstr(Address) - << " in constant island of function " << *TargetFunction - << '\n'; + // Get a reference symbol for the function when address is a valid code + // reference. + BranchTargetSymbol = + BC.handleExternalBranchTarget(TargetAddress, *TargetFunction); + if (!BranchTargetSymbol) continue; - } } // Can't find more references. Not creating relocations since we are not diff --git a/bolt/test/AArch64/constant-island-entry.s b/bolt/test/AArch64/constant-island-entry.s index 7f8449deea130..a82b876fde46d 100644 --- a/bolt/test/AArch64/constant-island-entry.s +++ b/bolt/test/AArch64/constant-island-entry.s @@ -10,7 +10,7 @@ ## Skip caller to check the identical warning is triggered from ScanExternalRefs(). 
# RUN: llvm-bolt %t.exe -o %t.bolt -skip-funcs=caller 2>&1 | FileCheck %s -# CHECK: BOLT-WARNING: Ignoring entry point at address 0x{{[0-9a-f]+}} in constant island of function func +# CHECK: BOLT-WARNING: ignoring entry point at address 0x{{[0-9a-f]+}} in constant island of function func .globl func .type func, %function From 5ca240bb8de9766827e01132b34f6ec70659f51d Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 31 Oct 2025 08:56:46 -0700 Subject: [PATCH 353/539] [GitHub] Add Copilot review instructions for LLDB (#165783) This is an experiment to encode the LLVM Coding Standards [1] as instructions for the Copilot reviewer on GitHub. Ideally, this will catch common issues automatically and reduce the review burden. Adding Copilot as a reviewer is entirely opt-in. Initially, I will add it as a reviewer to test this. If the experiment is successful, we can explore how to integrate this into other parts of LLVM. [1]: https://llvm.org/docs/CodingStandards.html --- .github/instructions/lldb.instructions.md | 79 +++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 .github/instructions/lldb.instructions.md diff --git a/.github/instructions/lldb.instructions.md b/.github/instructions/lldb.instructions.md new file mode 100644 index 0000000000000..35bcd27b1b42f --- /dev/null +++ b/.github/instructions/lldb.instructions.md @@ -0,0 +1,79 @@ +--- +applyTo: lldb/**/* +--- + +When reviewing code, focus on: + +## Language, Libraries & Standards + +- Target C++17 and avoid vendor-specific extensions. +- For Python scripts, follow PEP 8. +- Prefer standard library or LLVM support libraries instead of reinventing data structures. + +## Comments & Documentation + +- Each source file should include the standard LLVM file header. +- Header files must have proper header guards. +- Non-trivial classes and public methods should have Doxygen documentation. +- Use `//` or `///` comments normally; avoid block comments unless necessary. 
+- Non-trivial code should have comments explaining what it does and why. Avoid comments that explain how it does it at a micro level. + +## Language & Compiler Issues + +- Write portable code; wrap non-portable code in interfaces. +- Do not use RTTI or exceptions. +- Prefer C++-style casts over C-style casts. +- Do not use static constructors. +- Use `class` or `struct` consistently; `struct` only for all-public data. +- When then same class is declared or defined multiple times, make sure it's consistently done using either `class` or `struct`. + +## Headers & Library Layering + +- Include order: module header → local/private headers → project headers → system headers. +- Headers must compile standalone (include all dependencies). +- Maintain proper library layering; avoid circular dependencies. +- Include minimally; use forward declarations where possible. +- Keep internal headers private to modules. +- Use full namespace qualifiers for out-of-line definitions. + +## Control Flow & Structure + +- Prefer early exits over deep nesting. +- Do not use `else` after `return`, `continue`, `break`, or `goto`. +- Encapsulate loops that compute predicates into helper functions. + +## Naming + +- LLDB's code style differs from LLVM's coding style. +- Variables are `snake_case`. +- Functions and methods are `UpperCamelCase`. +- Static, global and member variables have `s_`, `g_` and `m_` prefixes respectively. + +## General Guidelines + +- Use `assert` liberally; prefer `llvm_unreachable` for unreachable states. +- Do not use `using namespace std;` in headers. +- Provide a virtual method anchor for classes defined in headers. +- Do not use default labels in fully covered switches over enumerations. +- Use range-based for loops wherever possible. +- Capture `end()` outside loops if not using range-based iteration. +- Including `` is forbidded. Use LLVM’s `raw_ostream` instead. +- Don’t use `inline` when defining a function in a class definition. 
+ +## Microscopic Details + +- Preserve existing style in modified code. +- Prefer pre-increment (`++i`) when value is unused. +- Use `private`, `protected`, or `public` keyword as appropriate to restrict class member visibility. +- Omit braces for single-statement `if`, `else`, `while`, `for` unless needed. + +## Review Style + +- Be specific and actionable in feedback. +- Explain the "why" behind recommendations. +- Link back to the LLVM Coding Standards: https://llvm.org/docs/CodingStandards.html. +- Ask clarifying questions when code intent is unclear. + +Ignore formatting and assume that's handled by external tools like `clang-format` and `black`. +Remember that these standards are **guidelines**. +Always prioritize consistency with the style that is already being used by the surrounding code. From 3a3240334f07b20b5e2bcfa4821417adbbe0a901 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Fri, 31 Oct 2025 09:02:05 -0700 Subject: [PATCH 354/539] [lld][macho] Ignore cstrings in bp orderer (#165757) --- lld/MachO/BPSectionOrderer.cpp | 4 ++++ lld/test/MachO/bp-section-orderer.s | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp index d50abc22fc6c1..328c33e6cfb65 100644 --- a/lld/MachO/BPSectionOrderer.cpp +++ b/lld/MachO/BPSectionOrderer.cpp @@ -118,6 +118,10 @@ DenseMap lld::macho::runBalancedPartitioning( auto *isec = subsec.isec; if (!isec || isec->data.empty() || !isec->data.data()) continue; + // CString section order is handled by + // {Deduplicated}CStringSection::finalizeContents() + if (isa(isec) || isec->isFinal) + continue; // ConcatInputSections are entirely live or dead, so the offset is // irrelevant. 
if (isa(isec) && !isec->isLive(0)) diff --git a/lld/test/MachO/bp-section-orderer.s b/lld/test/MachO/bp-section-orderer.s index 90924e5797b64..d7de90d6cd7b3 100644 --- a/lld/test/MachO/bp-section-orderer.s +++ b/lld/test/MachO/bp-section-orderer.s @@ -106,6 +106,11 @@ r3: r4: .quad s2 +# cstrings are ignored by runBalancedPartitioning() +.cstring +cstr: + .asciz "this is cstr" + .bss bss0: .zero 10 From 8f90e26ce9680895b599b66b35763c5e14c0aa0a Mon Sep 17 00:00:00 2001 From: Sirui Mu Date: Sat, 1 Nov 2025 00:06:04 +0800 Subject: [PATCH 355/539] [CIR][NFC] Remove ia32 t2rpntlvwz* builtins --- clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp index 3c9c7ecf35aff..0198a9d4eb192 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp @@ -771,14 +771,6 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI_WriteBarrier: case X86::BI_AddressOfReturnAddress: case X86::BI__stosb: - case X86::BI__builtin_ia32_t2rpntlvwz0_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: case X86::BI__ud2: case X86::BI__int2c: case X86::BI__readfsbyte: From 09622cbec3ea90baa2a6ab3db3a192de65934fe5 Mon Sep 17 00:00:00 2001 From: James Newling Date: Fri, 31 Oct 2025 09:12:43 -0700 Subject: [PATCH 356/539] [MLIR][GPU] Ensure all lanes in cluster have final reduction value (#165764) This is a fix for a cluster size of 32 when the subgroup size is 64. Previously, only lanes [16, 32) u [48, 64) contained the correct clusterwise reduction value. 
This PR adds a swizzle instruction to broadcast the correct value down to lanes [0, 16) u [32, 48). --- .../GPU/Transforms/SubgroupReduceLowering.cpp | 28 ++- .../Dialect/GPU/subgroup-reduce-lowering.mlir | 172 +++++++++--------- 2 files changed, 109 insertions(+), 91 deletions(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 81c3069cec16e..ec1571a56fe4a 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -416,13 +416,39 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op, if (ci.clusterSize >= 32) { if (chipset.majorVersion <= 9) { // Broadcast last value from each row to next row. - // Use row mask to avoid polluting rows 1 and 3. + // Use row mask to avoid polluting row 0 (and row 2 if wave-64). dpp = amdgpu::DPPOp::create(rewriter, loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15, rewriter.getUnitAttr(), 0xa, allBanks, /*bound_ctrl*/ false); res = vector::makeArithReduction( rewriter, loc, gpu::convertReductionKind(mode), res, dpp); + + // For subgroupSize = 64, at this point lanes [16, 32) contain the full + // reduction over lanes [0, 32), but lanes [0, 16) do not. Similarly, + // lanes [48, 64) contain the full reduction over lanes [32, 64), but + // lanes [32, 48) do not. + // + // If subgroup size is 64 and cluster size is 64, we don't need lanes [0, + // 16) and [32, 48) to have the correct cluster-32 reduction values at + // this point, because only lane 63's value will ultimately be read in + // this full-cluster case. + // + // If subgroup size is 64 and cluster size is 32, we need to ensure that + // lanes [0, 16) and [32, 48) have the correct final cluster-32 reduction + // values (subgroup_reduce guarantees that all lanes within each cluster + // contain the final reduction value). 
We do this by broadcasting lane + // 31's value to lanes [0, 16) and lanes 63's value to lanes [32, 48). + // + // See https://gpuopen.com/learn/amd-gcn-assembly-cross-lane-operations + // for an illustration of how this within-cluster broadcast works with a + // swizzle. + if (ci.subgroupSize == 64 && ci.clusterSize == 32) { + res = + amdgpu::SwizzleBitModeOp::create(rewriter, loc, res, /*and_mask=*/0, + /*or_mask=*/31, + /*xor_mask=*/0); + } } else if (chipset.majorVersion <= 12) { // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). Value uint32Max = arith::ConstantOp::create( diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir index 87a31ca20eb7b..1adc4181e05d3 100644 --- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir +++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir @@ -8,11 +8,11 @@ // RUN: mlir-opt --allow-unregistered-dialect \ // RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx942" %s \ -// RUN: | FileCheck %s --check-prefix=CHECK-GFX9 +// RUN: | FileCheck %s --check-prefixes=CHECK-GFX,CHECK-GFX9 // RUN: mlir-opt --allow-unregistered-dialect \ // RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx1030" %s \ -// RUN: | FileCheck %s --check-prefix=CHECK-GFX10 +// RUN: | FileCheck %s --check-prefixes=CHECK-GFX,CHECK-GFX10 // CHECK-SUB: gpu.module @kernels { // CHECK-SHFL: gpu.module @kernels { @@ -24,8 +24,7 @@ gpu.module @kernels { // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<5xf16>) // // CHECK-SHFL-LABEL: gpu.func @kernel0( - // CHECK-GFX9-LABEL: gpu.func @kernel0( - // CHECK-GFX10-LABEL: gpu.func @kernel0( + // CHECK-GFX-LABEL: gpu.func @kernel0( gpu.func @kernel0(%arg0: vector<5xf16>) kernel { // CHECK-SUB: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16> // CHECK-SUB: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16> @@ -56,8 
+55,7 @@ gpu.module @kernels { // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4) // CHECK-SUB: "test.consume" - // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}} - // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}} + // CHECK-GFX-COUNT-2: amdgpu.dpp {{.+}} %sum2 = gpu.subgroup_reduce mul %arg0 cluster(size = 4) : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum2) : (vector<5xf16>) -> () @@ -74,8 +72,7 @@ gpu.module @kernels { // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<1xf32>) // // CHECK-SHFL-LABEL: gpu.func @kernel1( - // CHECK-GFX9-LABEL: gpu.func @kernel1( - // CHECK-GFX10-LABEL: gpu.func @kernel1( + // CHECK-GFX-LABEL: gpu.func @kernel1( gpu.func @kernel1(%arg0: vector<1xf32>) kernel { // CHECK-SUB: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32> // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32 @@ -100,17 +97,14 @@ gpu.module @kernels { // Note stride is dropped because it is == 1. // CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32 // CHECK-SUB: "test.consume" - // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}} quad_perm - // CHECK-GFX9: amdgpu.dpp {{.+}} row_half_mirror - // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}} quad_perm - // CHECK-GFX10: amdgpu.dpp {{.+}} row_half_mirror + // CHECK-GFX-COUNT-2: amdgpu.dpp {{.+}} quad_perm + // CHECK-GFX: amdgpu.dpp {{.+}} row_half_mirror %sum2 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 1) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum2) : (vector<1xf32>) -> () // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32 // CHECK-SUB: "test.consume" - // CHECK-GFX9-NOT: amdgpu.dpp - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-NOT: amdgpu.dpp // CHECK-GFX10-NOT: rocdl.permlanex16 %sum3 = gpu.subgroup_reduce add %arg0 uniform cluster(size = 8, stride = 4) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum3) : (vector<1xf32>) -> () @@ -126,11 +120,8 @@ gpu.module @kernels { // // 
CHECK-SHFL-LABEL: gpu.func @kernel2( // - // CHECK-GFX9-LABEL: gpu.func @kernel2( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel2( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel2( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel2(%arg0: vector<3xi8>, %arg1: vector<4xi8>) kernel { // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8> // CHECK-SUB: "test.consume"(%[[R0]]) : (vector<3xi8>) -> () @@ -148,8 +139,7 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) - // CHECK-GFX9-LABEL: gpu.func @kernel3( - // CHECK-GFX10-LABEL: gpu.func @kernel3( + // CHECK-GFX-LABEL: gpu.func @kernel3( gpu.func @kernel3(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -169,9 +159,9 @@ gpu.module @kernels { // CHECK-SHFL: %[[S4:.+]], %{{.+}} = gpu.shuffle xor %[[A3]], %[[C16]], %[[C32]] : i32 // CHECK-SHFL: %[[A4:.+]] = arith.addi %[[A3]], %[[S4]] : i32 // CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> () - + // CHECK-GFX9-COUNT-6: amdgpu.dpp - + // CHECK-GFX10-COUNT-4: amdgpu.dpp // CHECK-GFX10: rocdl.permlanex16 // CHECK-GFX10-COUNT-2: rocdl.readlane @@ -185,11 +175,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) // - // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered( - // CHECK-GFX9-SAME: %[[ARG0:.+]]: i32) - // - // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered( - // CHECK-GFX10-SAME: %[[ARG0:.+]]: i32) + // CHECK-GFX-LABEL: gpu.func @kernel3_clustered( + // CHECK-GFX-SAME: %[[ARG0:.+]]: i32) gpu.func @kernel3_clustered(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -204,19 +191,13 @@ gpu.module @kernels { // CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32 // CHECK-SHFL: 
"test.consume"(%[[A2]]) : (i32) -> () - // CHECK-GFX9: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32 - // CHECK-GFX9: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32 - // CHECK-GFX9: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32 - // CHECK-GFX9: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32 - // CHECK-GFX9: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32 - // CHECK-GFX9: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32 - - // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32 - // CHECK-GFX10: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32 - // CHECK-GFX10: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32 - // CHECK-GFX10: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32 - // CHECK-GFX10: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32 - // CHECK-GFX10: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32 + // CHECK-GFX: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32 + // CHECK-GFX: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32 + // CHECK-GFX: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32 + // CHECK-GFX: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32 + // CHECK-GFX: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32 + // CHECK-GFX: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32 + // CHECK-GFX10: "test.consume"(%[[A2]]) : (i32) -> () %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8) : (i32) -> i32 "test.consume"(%sum0) : (i32) -> () @@ -228,11 +209,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided( 
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) // - // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered_strided( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered_strided( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel3_clustered_strided( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel3_clustered_strided(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32 @@ -256,11 +234,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel4( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>) // - // CHECK-GFX9-LABEL: gpu.func @kernel4( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel4( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel4( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel4(%arg0: vector<2xf16>) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -298,11 +273,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel4_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>) // - // CHECK-GFX9-LABEL: gpu.func @kernel4_clustered( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel4_clustered( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel4_clustered( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel4_clustered(%arg0: vector<2xf16>) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -319,10 +291,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel5( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16) // - // CHECK-GFX9-LABEL: gpu.func @kernel5( - // - // CHECK-GFX10-LABEL: gpu.func @kernel5( - // CHECK-GFX10-SAME: %[[ARG0:.+]]: i16) + // CHECK-GFX-LABEL: gpu.func @kernel5( + // CHECK-GFX-SAME: %[[ARG0:.+]]: i16) gpu.func @kernel5(%arg0: i16) kernel { // CHECK-SHFL: %[[E0:.+]] = arith.extui 
%[[ARG0]] : i16 to i32 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32 @@ -334,7 +304,7 @@ gpu.module @kernels { // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16 // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () - + // CHECK-GFX9-COUNT-6: amdgpu.dpp // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 @@ -361,11 +331,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel5_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16) // - // CHECK-GFX9-LABEL: gpu.func @kernel5_clustered - // CHECK-GFX9-SAME: %[[ARG0:.+]]: i16) - // - // CHECK-GFX10-LABEL: gpu.func @kernel5_clustered - // CHECK-GFX10-SAME: %[[ARG0:.+]]: i16) + // CHECK-GFX-LABEL: gpu.func @kernel5_clustered + // CHECK-GFX-SAME: %[[ARG0:.+]]: i16) gpu.func @kernel5_clustered(%arg0: i16) kernel { // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32 @@ -378,25 +345,15 @@ gpu.module @kernels { // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () - // CHECK-GFX9: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 - // CHECK-GFX9: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16 - // CHECK-GFX9: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 - // CHECK-GFX9: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16 - // CHECK-GFX9: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-GFX9: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16 - // CHECK-GFX9: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-GFX9: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16 - // 
CHECK-GFX9: "test.consume"(%[[VAR7]]) : (i16) -> () - - // CHECK-GFX10: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 - // CHECK-GFX10: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16 - // CHECK-GFX10: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 - // CHECK-GFX10: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16 - // CHECK-GFX10: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-GFX10: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16 - // CHECK-GFX10: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-GFX10: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16 - // CHECK-GFX10: "test.consume"(%[[VAR7]]) : (i16) -> () + // CHECK-GFX: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 + // CHECK-GFX: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16 + // CHECK-GFX: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 + // CHECK-GFX: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16 + // CHECK-GFX: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 + // CHECK-GFX: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16 + // CHECK-GFX: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 + // CHECK-GFX: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16 + // CHECK-GFX: "test.consume"(%[[VAR7]]) : (i16) -> () %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16 "test.consume"(%sum0) : (i16) -> () @@ -407,11 +364,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel6( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) // - // CHECK-GFX9-LABEL: gpu.func @kernel6( - // 
CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel6( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel6( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel6(%arg0: vector<3xi8>) kernel { // CHECK-SHFL: %[[CZ:.+]] = arith.constant dense<0> : vector<4xi8> // CHECK-SHFL: %[[V0:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CZ]] {offsets = [0], strides = [1]} : vector<3xi8> into vector<4xi8> @@ -433,6 +387,44 @@ gpu.module @kernels { gpu.return } + // CHECK-GFX-LABEL: gpu.func @kernel7( + // CHECK-GFX-SAME: %[[ARG0:.+]]: f32) + // + // Checks, common to gfx942 and gfx1030, of + // (1) quad_perm, followed by reduction resulting in reduction over 2 consecutive lanes, + // (2) quad_perm, followed by reduction resulting in reduction over 4 consecutive lanes, + // (3) row_half_mirror, followed by reduction resulting in reduction over 8 consecutive lanes, and + // (4) row_mirror, followed by reduction resulting in reduction over 16 consecutive lanes. + // CHECK-GFX: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : f32 + // CHECK-GFX: %[[A0:.+]] = arith.addf %[[ARG0]], %[[D0]] : f32 + // CHECK-GFX: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : f32 + // CHECK-GFX: %[[A1:.+]] = arith.addf %[[A0]], %[[D1]] : f32 + // CHECK-GFX: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : f32 + // CHECK-GFX: %[[A2:.+]] = arith.addf %[[A1]], %[[D2]] : f32 + // CHECK-GFX: %[[D3:.+]] = amdgpu.dpp %[[A2]] %[[A2]] row_mirror(unit) {bound_ctrl = true} : f32 + // CHECK-GFX: %[[A3:.+]] = arith.addf %[[A2]], %[[D3]] : f32 + // + // Now, on gfx942: + // (1) Lane 15 gets broadcast to lanes [16, 32) and lane 31 gets broadcast to lanes [48, 64], after which + // the reduction in lanes [16, 32) is over the full cluster of the first 32 lanes, and the reduction in lanes + // [48, 64) is over the full 
cluster of the last 32 lanes. + // (2) Update the reduction value in lanes [0, 16) and [32, 48) with the final reduction result from + // lanes [16, 32) and [48, 64), respectively. + // CHECK-GFX9: %[[BCAST15:.+]] = amdgpu.dpp %[[A3]] %[[A3]] row_bcast_15(unit) {row_mask = 10 : i32} : f32 + // CHECK-GFX9: %[[SUM:.+]] = arith.addf %[[A3]], %[[BCAST15]] : f32 + // CHECK-GFX9: %[[SWIZ:.+]] = amdgpu.swizzle_bitmode %[[SUM]] 0 31 0 : f32 + // CHECK-GFX9: "test.consume"(%[[SWIZ]]) : (f32) -> () + // + // On gfx1030, the final step is to permute the lanes and perform final reduction: + // CHECK-GFX10: rocdl.permlanex16 + // CHECK-GFX10: arith.addf + // CHECK-GFX10: "test.consume" + gpu.func @kernel7(%arg0: f32) kernel { + %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 32) : (f32) -> (f32) + "test.consume"(%sum0) : (f32) -> () + gpu.return + } + // CHECK-SHFL-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) // From 134ff1660feb2d0ae3e4539829a8a84efb20d2c5 Mon Sep 17 00:00:00 2001 From: vangthao95 Date: Fri, 31 Oct 2025 09:12:56 -0700 Subject: [PATCH 357/539] [AMDGPU][GlobalISel] Add RegBankLegalize support for G_READCYCLECOUNTER (#165754) --- .../lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 3 ++- llvm/test/CodeGen/AMDGPU/readcyclecounter.ll | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index b22e9bdc334d7..103cdec8233a0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -913,7 +913,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); - addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}}); + addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard) + .Uni(S64, 
{{Sgpr64}, {}}); bool hasSALUFloat = ST->hasSALUFloatInsts(); diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll index f67cbe381bfad..ddb522a82880b 100644 --- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll +++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll @@ -1,17 +1,17 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s ; -global-isel=1 SI run line skipped since store not yet implemented. ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-SDAG -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL 
-check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s declare i64 @llvm.readcyclecounter() #0 From 0fc9f5dbb2494d03480d9a04f78cd2f027106c89 Mon Sep 17 00:00:00 2001 From: Quan Zhuo Date: Sat, 1 Nov 2025 00:26:47 +0800 Subject: [PATCH 358/539] [clang][CodeComplete] Add completion for #embed directive in C23 mode (#165550) Fixes https://github.com/clangd/clangd/issues/2535 --- clang/lib/Sema/SemaCodeComplete.cpp | 18 ++++++++++++++++++ clang/test/Index/complete-preprocessor.m | 5 +++++ 2 files changed, 23 insertions(+) diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index 0514d1033f74f..aa93507ab5c30 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -10208,6 +10208,24 @@ void SemaCodeCompletion::CodeCompletePreprocessorDirective(bool InConditional) { Builder.AddPlaceholderChunk("message"); Results.AddResult(Builder.TakeString()); + if (getLangOpts().C23) { + // #embed "file" + Builder.AddTypedTextChunk("embed"); + Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace); + Builder.AddTextChunk("\""); + Builder.AddPlaceholderChunk("file"); + Builder.AddTextChunk("\""); + 
Results.AddResult(Builder.TakeString()); + + // #embed + Builder.AddTypedTextChunk("embed"); + Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace); + Builder.AddTextChunk("<"); + Builder.AddPlaceholderChunk("file"); + Builder.AddTextChunk(">"); + Results.AddResult(Builder.TakeString()); + } + // Note: #ident and #sccs are such crazy anachronisms that we don't provide // completions for them. And __include_macros is a Clang-internal extension // that we don't want to encourage anyone to use. diff --git a/clang/test/Index/complete-preprocessor.m b/clang/test/Index/complete-preprocessor.m index 1cc2f32b7efa6..bd90a796240c4 100644 --- a/clang/test/Index/complete-preprocessor.m +++ b/clang/test/Index/complete-preprocessor.m @@ -80,3 +80,8 @@ // RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_COMPLETION_CACHING=1 c-index-test -code-completion-at=%s:9:8 %s | FileCheck -check-prefix=CHECK-CC3 %s // RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_COMPLETION_CACHING=1 c-index-test -code-completion-at=%s:11:5 %s | FileCheck -check-prefix=CHECK-CC4 %s // RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_COMPLETION_CACHING=1 c-index-test -code-completion-at=%s:14:5 %s | FileCheck -check-prefix=CHECK-CC5 %s + +// Test #embed completion in C23 mode +// RUN: c-index-test -code-completion-at=%s:4:2 %s -std=c23 | FileCheck -check-prefix=CHECK-EMBED %s +// CHECK-EMBED: NotImplemented:{TypedText embed}{HorizontalSpace }{Text "}{Placeholder file}{Text "} (40) +// CHECK-EMBED: NotImplemented:{TypedText embed}{HorizontalSpace }{Text <}{Placeholder file}{Text >} (40) From 48c1d2f0cd8659cab86751f7c117a213c2a15c8a Mon Sep 17 00:00:00 2001 From: Artem Kroviakov <71938912+akroviakov@users.noreply.github.com> Date: Fri, 31 Oct 2025 17:33:11 +0100 Subject: [PATCH 359/539] [MLIR][XeGPU] Introduce `xegpu::uArch` usage in target-sensitive passes (#163801) --- .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 22 +- .../mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h | 30 -- .../mlir/Dialect/XeGPU/Transforms/Passes.td | 7 +- 
.../mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h | 2 + .../mlir/Dialect/XeGPU/uArch/uArchBase.h | 2 + mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 12 +- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 293 ++++++++++++++---- .../Transforms/XeGPUSubgroupDistribute.cpp | 30 +- .../XeGPU/move-gpu-func-to-warp-op.mlir | 2 +- .../XeGPU/propagate-layout-inst-data.mlir | 128 ++++++++ mlir/test/Dialect/XeGPU/propagate-layout.mlir | 82 +++-- 11 files changed, 464 insertions(+), 146 deletions(-) delete mode 100644 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h create mode 100644 mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 19a52317956d2..40352b44b6441 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -379,28 +379,28 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> { ); let builders = [ - AttrBuilder<(ins "llvm::ArrayRef": $lane_layout, + AttrBuilder<(ins "llvm::ArrayRef": $inst_data, + "llvm::ArrayRef": $lane_layout, "llvm::ArrayRef": $lane_data), [{ auto sg_layout = DenseI32ArrayAttr(); auto sg_data = DenseI32ArrayAttr(); - auto inst_data = DenseI32ArrayAttr(); auto order = DenseI32ArrayAttr(); - return $_get($_ctxt, sg_layout, sg_data, inst_data, + return $_get($_ctxt, sg_layout, sg_data, + DenseI32ArrayAttr::get($_ctxt, inst_data), DenseI32ArrayAttr::get($_ctxt, lane_layout), DenseI32ArrayAttr::get($_ctxt, lane_data), order); }]>, AttrBuilder<(ins "llvm::ArrayRef": $lane_layout, - "llvm::ArrayRef": $lane_data, - "llvm::ArrayRef": $order), + "llvm::ArrayRef": $lane_data), [{ - return $_get($_ctxt, - /*sg_layout =*/ nullptr, - /*sg_data =*/ nullptr, - /*inst_data =*/ nullptr, + auto sg_layout = DenseI32ArrayAttr(); + auto sg_data = DenseI32ArrayAttr(); + auto inst_data = DenseI32ArrayAttr(); + auto order = DenseI32ArrayAttr(); + return 
$_get($_ctxt, sg_layout, sg_data, inst_data, DenseI32ArrayAttr::get($_ctxt, lane_layout), - DenseI32ArrayAttr::get($_ctxt, lane_data), - DenseI32ArrayAttr::get($_ctxt, order)); + DenseI32ArrayAttr::get($_ctxt, lane_data), order); }]>, AttrBuilder<(ins "DenseI32ArrayAttr": $lane_layout, "DenseI32ArrayAttr": $lane_data, diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h deleted file mode 100644 index 8aa9536cb67c1..0000000000000 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h +++ /dev/null @@ -1,30 +0,0 @@ -//===- XeGPUTargetInfo.h - Target constants ---------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_ -#define MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_ - -namespace mlir { -namespace xegpu { -/// HW dependent constants. -/// TODO: These constants should be queried from the target information. -namespace targetinfo { -constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup. -/// If DPAS A or B operands have low precision element types they must be packed -/// according to the following sizes. -constexpr unsigned packedSizeInBitsForDefault = - 16; // Minimum packing size per register for DPAS A. -constexpr unsigned packedSizeInBitsForDpasB = - 32; // Minimum packing size per register for DPAS B. -constexpr unsigned packedSizeInBitsForGatherScatter = - 32; // Minimum packing size per register for Gather and Scatter ops. 
-} // namespace targetinfo -} // namespace xegpu -} // namespace mlir - -#endif // MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_ diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index 564d9c4d5422b..b7af5413669c9 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -43,7 +43,12 @@ def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> { let options = [Option< "printOnly", "print-analysis-only", "bool", /*default=*/"false", - "Print the result of layout propagation analysis and exit.">]; + "Print the result of layout propagation analysis and exit.">, + Option< + "layoutKind", "layout-kind", "std::string", + /*default=*/"\"lane\"", + "Propagate a `sg` / `inst` / `lane` level of xegpu layouts."> + ]; } def XeGPUWgToSgDistribute : Pass<"xegpu-wg-to-sg-distribute"> { diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h index dcb2ad5d67a25..b3231a173f33a 100644 --- a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h +++ b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h @@ -270,6 +270,8 @@ inline const uArch *getUArch(llvm::StringRef archName) { return PVCuArch::getInstance(); else if (archName.equals_insensitive("bmg")) return BMGuArch::getInstance(); + else + llvm_unreachable("No matching uArch found"); return nullptr; } diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h index ea33e885c78ff..8f23b89134773 100644 --- a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h +++ b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h @@ -29,6 +29,8 @@ namespace mlir { namespace xegpu { namespace uArch { +constexpr unsigned generalPackedFormatBitSize{32}; + // An enum class to represent the scope of an instruction enum class InstructionScope { Lane, Subgroup, Workgroup, Cluster }; enum class InstructionKind 
{ diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index f9aa28d5203db..6b4c185d7d897 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -11,7 +11,7 @@ #include "mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" -#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h" +#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" #include "mlir/IR/Builders.h" #include "mlir/IR/DialectImplementation.h" @@ -229,8 +229,10 @@ LayoutAttr::verify(llvm::function_ref emitError, } if (inst_data && lane_layout && inst_data.size() != lane_layout.size()) { - return emitError() - << "expected inst_data and lane_layout to have the same rank"; + return emitError() << "expected inst_data and lane_layout to have the same " + "rank, got inst_data " + << inst_data.size() << ", lane_layout " + << lane_layout.size(); } // sg_data is optional for Workgroup layout, but its presence requires @@ -569,8 +571,8 @@ TensorDescType::verify(llvm::function_ref emitError, // for gather and scatter ops, Low-precision types are packed in 32-bit units. unsigned bitWidth = elementType.getIntOrFloatBitWidth(); int chunkAlignmentFactor = - bitWidth < targetinfo::packedSizeInBitsForGatherScatter - ? targetinfo::packedSizeInBitsForGatherScatter / bitWidth + bitWidth < xegpu::uArch::generalPackedFormatBitSize + ? 
xegpu::uArch::generalPackedFormatBitSize / bitWidth : 1; auto scatterAttr = mlir::dyn_cast_if_present(encoding); if (scatterAttr) { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 8fab255d6347f..90eae871a5ef3 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -14,7 +14,6 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" -#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h" #include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/IR/Attributes.h" @@ -37,6 +36,8 @@ #include "llvm/Support/LogicalResult.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" + namespace mlir { namespace xegpu { #define GEN_PASS_DEF_XEGPUPROPAGATELAYOUT @@ -104,6 +105,8 @@ struct LayoutInfo { SmallVector getLaneData() const; + SmallVector getInstData() const; + bool isSliceLayout() const { if (!isAssigned()) return false; @@ -137,6 +140,13 @@ SmallVector LayoutInfo::getLaneData() const { [](int64_t val) { return static_cast(val); }); } +SmallVector LayoutInfo::getInstData() const { + if (!isAssigned()) + return {}; + return llvm::map_to_vector(storage.getEffectiveInstDataAsInt(), + [](int64_t val) { return static_cast(val); }); +} + void LayoutInfo::print(raw_ostream &os) const { if (isAssigned()) { os << storage; @@ -174,12 +184,14 @@ LayoutInfo LayoutInfo::transpose(ArrayRef permutation) const { SmallVector laneLayout; SmallVector laneData; + SmallVector instData; for (int64_t idx : permutation) { laneLayout.push_back(static_cast(getLaneLayout()[idx])); laneData.push_back(static_cast(getLaneData()[idx])); + instData.push_back(static_cast(getInstData()[idx])); } - return LayoutInfo( - xegpu::LayoutAttr::get(storage.getContext(), 
laneLayout, laneData)); + return LayoutInfo(xegpu::LayoutAttr::get(storage.getContext(), instData, + laneLayout, laneData)); } //===----------------------------------------------------------------------===// @@ -192,6 +204,28 @@ struct LayoutInfoLattice : public Lattice { using Lattice::Lattice; }; +/// Helper Function to find a proper instruction multiple for the user-supplied +/// sg-level data shape. `candidates` are uArch allowed shapes. +/// `candidateMultiples` are uArch multiples of such shapes (e.g., block count). +template +int getLargestDivisor(T dim, ArrayRef candidates, + ArrayRef candidateMultiples = {}) { + static_assert(std::is_integral::value, "T must be an integer type"); + int largest = -1; + SmallVector multiples = {1}; + if (!candidateMultiples.empty()) + multiples = + SmallVector(candidateMultiples.begin(), candidateMultiples.end()); + for (T candidate : candidates) { + for (T multiple : multiples) { + int value = static_cast(candidate * multiple); + if (value != 0 && dim % value == 0 && value > largest) + largest = value; + } + } + return largest; +} + /// Helper Functions to get default layouts. A `default layout` is a layout that /// is assigned to a value when the layout is not fixed by some anchor operation /// (like DPAS). @@ -200,18 +234,32 @@ struct LayoutInfoLattice : public Lattice { /// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1]. /// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1]. 
static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx, - unsigned rank) { + unsigned rank, + const xegpu::uArch::uArch *uArch, + ArrayRef instData) { assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector."); if (rank == 1) { return LayoutInfo( - xegpu::LayoutAttr::get(ctx, {xegpu::targetinfo::subgroupSize}, {1})); + xegpu::LayoutAttr::get(ctx, instData, {uArch->getSubgroupSize()}, {1})); } return LayoutInfo(xegpu::LayoutAttr::get( - ctx, {1, xegpu::targetinfo::subgroupSize}, {1, 1})); + ctx, instData, {1, uArch->getSubgroupSize()}, {1, 1})); +} + +static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx, + unsigned rank, int subgroupSize) { + assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector."); + if (rank == 1) { + return LayoutInfo(xegpu::LayoutAttr::get(ctx, {subgroupSize}, {1})); + } + return LayoutInfo(xegpu::LayoutAttr::get(ctx, {1, subgroupSize}, {1, 1})); } /// Helper to get the default layout for a vector type. static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy, + const xegpu::uArch::uArch *uArch, + ArrayRef instData, + unsigned packingSize, bool isScattered = false) { // Expecting a 1D or 2D vector. assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) && @@ -221,28 +269,25 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy, "Expected int or float element type."); // If the rank is 1, then return default layout for 1D vector. if (vectorTy.getRank() == 1) - return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1); + return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1, uArch, instData); // Packing factor is determined by the element type bitwidth. - int packingFactor = 1; unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth(); + int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1; if (isScattered) { - packingFactor = - bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter - ? 
xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth - : 1; - return LayoutInfo(xegpu::LayoutAttr::get( - vectorTy.getContext(), {xegpu::targetinfo::subgroupSize, 1}, - {1, packingFactor})); + return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData, + {uArch->getSubgroupSize(), 1}, + {1, packingFactor})); } - if (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault) - packingFactor = xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth; - return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), - {1, xegpu::targetinfo::subgroupSize}, + return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData, + {1, uArch->getSubgroupSize()}, {1, packingFactor})); } /// Helper to get the default layout for a vector type. static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, + const xegpu::uArch::uArch *uArch, + ArrayRef instData, + unsigned packingSize, bool isScattered = false) { // Expecting a 1D or 2D vector. assert((tdescTy.getRank() == 1 || tdescTy.getRank() == 2) && @@ -252,27 +297,18 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, "Expected int or float element type."); // If the rank is 1, then return default layout for 1D vector. if (tdescTy.getRank() == 1) - return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1); + return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1, uArch, instData); // Packing factor is determined by the element type bitwidth. unsigned bitwidth = tdescTy.getElementType().getIntOrFloatBitWidth(); - + int subgroupSize = uArch->getSubgroupSize(); + int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1; if (isScattered) { - int packingFactor = - bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter - ? 
xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth - : 1; return LayoutInfo(xegpu::LayoutAttr::get( - tdescTy.getContext(), {xegpu::targetinfo::subgroupSize, 1}, - {1, packingFactor})); + tdescTy.getContext(), instData, {subgroupSize, 1}, {1, packingFactor})); } - int packingFactor = - (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault) - ? xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth - : 1; - return LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), - {1, xegpu::targetinfo::subgroupSize}, - {1, packingFactor})); + return LayoutInfo(xegpu::LayoutAttr::get( + tdescTy.getContext(), instData, {1, subgroupSize}, {1, packingFactor})); } /// Helper Function to get the expected layouts for DPAS operands. `lane_data` @@ -281,25 +317,25 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, /// `packedSizeInBitsForDefault` /// * For B operand, the data must be packed in minimum /// `packedSizeInBitsForDpasB` -static LayoutInfo getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, - unsigned operandNum) { +static LayoutInfo +getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, unsigned operandNum, + const xegpu::uArch::uArch *uArch, + ArrayRef instData, unsigned packingSize) { Type elementTy = vectorTy.getElementType(); assert(elementTy.isIntOrFloat() && "Expected int or float type in DPAS operands"); - SmallVector layout({1, xegpu::targetinfo::subgroupSize}); + SmallVector layout({1, uArch->getSubgroupSize()}); // For B operand, data must be packed in minimum `packedDpasBSizeInBits` and // must have the VNNI format. 
- if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() < - xegpu::targetinfo::packedSizeInBitsForDpasB) { + if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() < packingSize) { SmallVector data( - {static_cast(xegpu::targetinfo::packedSizeInBitsForDpasB / - elementTy.getIntOrFloatBitWidth()), + {static_cast(packingSize / elementTy.getIntOrFloatBitWidth()), 1}); return LayoutInfo( - xegpu::LayoutAttr::get(vectorTy.getContext(), layout, data)); + xegpu::LayoutAttr::get(vectorTy.getContext(), instData, layout, data)); } // Otherwise, return the default layout for the vector type. - return getDefaultSIMTLayoutInfo(vectorTy); + return getDefaultSIMTLayoutInfo(vectorTy, uArch, instData, packingSize); } //===----------------------------------------------------------------------===// @@ -456,7 +492,37 @@ void LayoutInfoPropagation::visitPrefetchNdOp( // Here we assign the default layout to the tensor descriptor operand of // prefetch. auto tdescTy = prefetch.getTensorDescType(); - auto prefetchLayout = getDefaultSIMTLayoutInfo(tdescTy); + + auto uArch = getUArch(getChipStr(prefetch).value_or("")); + const auto *uArchInstruction = + dyn_cast( + uArch->getInstruction( + xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch)); + + auto blockWHC = + uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType()); + if (!blockWHC) + prefetch.emitWarning("No known block params found for the element type."); + auto [bWidth, bHeight, bCount] = blockWHC.value(); + SmallVector instData; + int instWidth = getLargestDivisor( + static_cast(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth, + bCount); + if (instWidth == -1) + prefetch.emitWarning( + "No suitable instruction multiple found for the given shape."); + if (tdescTy.getRank() == 1) + instData = {instWidth}; + else { + int instHeight = getLargestDivisor( + static_cast(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight); + if (instHeight == -1) + prefetch.emitWarning( + "No suitable instruction multiple found 
for the given shape."); + instData = {instHeight, instWidth}; + } + auto prefetchLayout = getDefaultSIMTLayoutInfo( + tdescTy, uArch, instData, uArchInstruction->getPackedFormatBitSize()); // Propagate the layout to the source tensor descriptor. propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout)); } @@ -475,10 +541,11 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp( reduction.emitWarning("Expecting output type to be 1D vector."); return; } + auto uArch = getUArch(xegpu::getChipStr(reduction).value_or("")); // Given that the result is 1D, the layout of the operand should be 2D with // default layout. - LayoutInfo operandLayout = - getDefaultSIMTLayoutInfo(reduction->getContext(), 2); + LayoutInfo operandLayout = getDefaultSIMTLayoutInfo( + reduction->getContext(), 2, uArch->getSubgroupSize()); propagateIfChanged(operands[0], operands[0]->meet(operandLayout)); // Accumulator should have the same layout as the result. propagateIfChanged(operands[1], operands[1]->meet(resultLayout)); @@ -557,15 +624,53 @@ void LayoutInfoPropagation::visitDpasOp( ArrayRef results) { VectorType aTy = dpas.getLhsType(); VectorType bTy = dpas.getRhsType(); - propagateIfChanged( - operands[0], operands[0]->meet(getSIMTLayoutInfoForDPASOperand(aTy, 0))); - propagateIfChanged( - operands[1], operands[1]->meet(getSIMTLayoutInfoForDPASOperand(bTy, 1))); + + auto uArch = getUArch(getChipStr(dpas).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + const auto *uArchInstruction = + dyn_cast(uArch->getInstruction( + xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc)); + + const unsigned dataALen = aTy.getShape().front(); + auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType()); + const int maxALen = + getLargestDivisor(dataALen, ArrayRef(supportedALen)); + if (maxALen == -1) + dpas.emitWarning( + "No suitable instruction multiple found for the given shape."); + + const unsigned dataBLen = bTy.getShape().back(); + auto 
supportedBLen = uArchInstruction->getSupportedK(bTy.getElementType()); + const int maxBLen = + getLargestDivisor(dataBLen, ArrayRef(supportedBLen)); + if (maxBLen == -1) + dpas.emitWarning( + "No suitable instruction multiple found for the given shape."); + SmallVector instDataA = {maxALen, subgroupSize}; + SmallVector instDataB = {subgroupSize, maxBLen}; + + propagateIfChanged(operands[0], + operands[0]->meet(getSIMTLayoutInfoForDPASOperand( + aTy, 0, uArch, instDataA, + uArchInstruction->getPackedFormatBitSizeA()))); + propagateIfChanged(operands[1], + operands[1]->meet(getSIMTLayoutInfoForDPASOperand( + bTy, 1, uArch, instDataB, + uArchInstruction->getPackedFormatBitSizeB()))); if (operands.size() > 2) { VectorType cTy = dpas.getAccType(); - propagateIfChanged( - operands[2], - operands[2]->meet(getSIMTLayoutInfoForDPASOperand(cTy, 2))); + const unsigned dataCLen = bTy.getShape().back(); + auto supportedCLen = uArchInstruction->getSupportedN(bTy.getElementType()); + const int maxCLen = + getLargestDivisor(dataCLen, ArrayRef(supportedCLen)); + if (maxCLen == -1) + dpas.emitWarning( + "No suitable instruction multiple found for the given shape."); + SmallVector instDataC = {maxALen, maxCLen}; + propagateIfChanged(operands[2], + operands[2]->meet(getSIMTLayoutInfoForDPASOperand( + cTy, 2, uArch, instDataC, + uArchInstruction->getPackedFormatBitSizeB()))); } } @@ -573,7 +678,38 @@ void LayoutInfoPropagation::visitDpasOp( void LayoutInfoPropagation::visitStoreNdOp( xegpu::StoreNdOp store, ArrayRef operands, ArrayRef results) { - LayoutInfo storeLayout = getDefaultSIMTLayoutInfo(store.getValueType()); + + auto uArch = getUArch(getChipStr(store).value_or("")); + const auto *uArchInstruction = + dyn_cast( + uArch->getInstruction( + xegpu::uArch::InstructionKind::Subgroup2DBlockStore)); + VectorType dataTy = store.getValueType(); + auto blockWHC = uArchInstruction->getBlockWidthHeightCount( + store.getValueType().getElementType()); + if (!blockWHC) + 
store.emitWarning("No known block params found for the element type."); + auto [bWidth, bHeight, bCount] = blockWHC.value(); + SmallVector instData; + int instWidth = getLargestDivisor( + static_cast(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth, + bCount); + if (instWidth == -1) + store.emitWarning( + "No suitable instruction multiple found for the given shape."); + if (dataTy.getRank() == 1) + instData = {instWidth}; + else { + int instHeight = getLargestDivisor( + static_cast(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight); + if (instHeight == -1) + store.emitWarning( + "No suitable instruction multiple found for the given shape."); + instData = {instHeight, instWidth}; + } + LayoutInfo storeLayout = + getDefaultSIMTLayoutInfo(store.getValueType(), uArch, instData, + uArchInstruction->getPackedFormatBitSize()); // Both operands should have the same layout for (LayoutInfoLattice *operand : operands) propagateIfChanged(operand, operand->meet(storeLayout)); @@ -694,10 +830,23 @@ void LayoutInfoPropagation::visitLoadGatherOp( load.emitWarning("Not propagating, non-vector payload supplied."); return; } - LayoutInfo layout = getDefaultSIMTLayoutInfo(payloadTy, /*scattered*/ true); + auto uArch = getUArch(getChipStr(load).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + SmallVector instData{subgroupSize}; + if (auto chunkSize = load.getChunkSize().value_or(0); chunkSize > 1) + instData.push_back(chunkSize); + else if (auto srcTdescTy = + dyn_cast(load.getSourceType())) { + if (srcTdescTy.getChunkSizeAsInt() > 1) + instData.push_back(chunkSize); + } + LayoutInfo layout = getDefaultSIMTLayoutInfo( + payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(), + /*scattered*/ true); // Mask operand should have 1D default layout. 
- LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(load->getContext(), 1); + LayoutInfo maskLayout = + getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize); // Propagate the new layout to the tensor descriptor operand. if (isa(load.getSourceType())) @@ -717,8 +866,10 @@ void LayoutInfoPropagation::visitCreateDescOp( // Need the layout of the descriptor to propagate to the operands. if (!descLayout.isAssigned()) return; + auto uArch = getUArch(getChipStr(createDesc).value_or("")); // For offset operand propagate 1D default layout. - LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1); + LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1, + uArch->getSubgroupSize()); propagateIfChanged(operands[1], operands[1]->meet(layout)); } @@ -735,18 +886,30 @@ void LayoutInfoPropagation::visitStoreScatterOp( storeScatter.emitWarning("Not propagating, non-vector payload supplied."); return; } + auto uArch = getUArch(getChipStr(storeScatter).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + auto payloadShape = payloadTy.getShape(); if (payloadShape.size() > 1) assert( - payloadShape[0] == xegpu::targetinfo::subgroupSize && + payloadShape[0] == subgroupSize && "Expected the first dimension of 2D tensor descriptor to be equal to " "subgroup size."); - LayoutInfo payloadLayout = - getDefaultSIMTLayoutInfo(payloadTy, /*scattered=*/true); + SmallVector instData{subgroupSize}; + if (auto chunkSize = storeScatter.getChunkSize().value_or(0); chunkSize > 1) + instData.push_back(chunkSize); + else if (auto dstTdescTy = + dyn_cast(storeScatter.getDestType())) { + if (dstTdescTy.getChunkSizeAsInt() > 1) + instData.push_back(chunkSize); + } + LayoutInfo payloadLayout = getDefaultSIMTLayoutInfo( + payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(), + /*scattered=*/true); LayoutInfo maskLayout = - getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1); + getDefaultSIMTLayoutInfo(storeScatter->getContext(), 
1, subgroupSize); // Propagate the payload operand layout propagateIfChanged(operands[0], operands[0]->meet(payloadLayout)); // Propagate the destination (if tdesc) operand layout @@ -1023,9 +1186,13 @@ void XeGPUPropagateLayoutPass::runOnOperation() { LayoutInfo layout = analysis.getLayoutInfo(val); if (!layout.isAssigned()) return {}; + xegpu::DistributeLayoutAttr layoutAttr = + cast(layout.get()); + if (this->layoutKind == "lane") + layoutAttr = layoutAttr.dropInstData(); if (layout.isSliceLayout()) - return cast(layout.get()); - return cast(layout.get()); + return cast(layoutAttr); + return cast(layoutAttr); }; mlir::OpBuilder builder(&getContext()); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index d09dc196c0bf7..5a3b27ec6108e 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -11,10 +11,10 @@ #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" -#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h" #include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" +#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" @@ -159,17 +159,18 @@ static bool requirePacked(const xegpu::LayoutAttr layout) { /// Helper function to check if the layout requires a transpose effect. static bool requireTranspose(const xegpu::LayoutAttr layout, - const std::string &chipStr) { + const xegpu::uArch::uArch *uArch) { // Return false for unsupported targets. // TODO: Add more support or move to target info. 
- if (chipStr != "pvc" && chipStr != "bmg") + if (uArch->getName().equals_insensitive("pvc") && + uArch->getName().equals_insensitive("bmg")) return false; if (!layout) return false; auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); if (laneLayout.size() != 2) return false; - return laneLayout[0] == xegpu::targetinfo::subgroupSize && laneLayout[1] == 1; + return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1; } /// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body @@ -199,6 +200,11 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, PatternRewriter &rewriter) const override { + auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or("")); + if (!uArch) + return rewriter.notifyMatchFailure( + gpuFuncOp, "Subgroup distribution requires target attribute attached " + "to set the warp size"); // If the function only contains a single void return, skip. if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) { return isa(op) && !op.getNumOperands(); @@ -230,7 +236,7 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern { ArrayRef gpuFuncResultType = gpuFuncOp.getFunctionType().getResults(); auto warpOp = gpu::WarpExecuteOnLane0Op::create( rewriter, laneId.getLoc(), gpuFuncResultType, laneId, - xegpu::targetinfo::subgroupSize, newGpuFunc.getArguments(), + uArch->getSubgroupSize(), newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes()); Block &warpBodyBlock = warpOp.getBodyRegion().front(); // Replace the ReturnOp of the original gpu function with a YieldOp. 
@@ -495,14 +501,14 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { warpOp, "warp result is not a xegpu::LoadNd op"); auto loadOp = operand->get().getDefiningOp(); + auto uArch = getUArch(xegpu::getChipStr(loadOp).value_or("")); + if (!uArch) + return rewriter.notifyMatchFailure( + loadOp, "xegpu::LoadNdOp require target attribute attached to " + "determine transpose " + "requirement"); // Chip information is required to decide if the layout requires transpose // effect. - auto chipStr = xegpu::getChipStr(loadOp); - if (!chipStr) - return rewriter.notifyMatchFailure( - loadOp, - "xegpu::LoadNdOp require chip information to determine transpose " - "requirement"); // Expecting offsets to be present. SmallVector offsets = loadOp.getMixedOffsets(); if (offsets.empty()) @@ -556,7 +562,7 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { // Set the packed attribute if the layout requires it. newLoadOp.setPacked(requirePacked(layout)); // Set the transpose attribute if the layout requires it. 
- if (requireTranspose(layout, chipStr.value())) + if (requireTranspose(layout, uArch)) newLoadOp.setTranspose( DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0})); Value distributedVal = newWarpOp.getResult(operandIdx); diff --git a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir index d289d73e863c7..2780212d2917f 100644 --- a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir +++ b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s gpu.module @test { gpu.func @empty() { diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir new file mode 100644 index 0000000000000..58461b8be52c4 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir @@ -0,0 +1,128 @@ +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=inst" -split-input-file %s | FileCheck %s + +// CHECK-LABEL: func.func @dpas_f16( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] 
{layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> +// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +gpu.module @test { + +func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + return +} +} + +// ----- +gpu.module @test_kernel { + gpu.func @elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) { + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %m = arith.muli %block_id_x, %c32 : index + + %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> + %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> + %c_tdesc = 
xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> + + %out:3 = scf.for %k = %c0 to %c1024 step %c32 + iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) + -> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) { + //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout} : + //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout> -> vector<16x32xf16> + %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> + %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> + + //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout} : vector<16x32xf16> + %c = arith.addf %a, %b : vector<16x32xf16> + + //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #xegpu.layout>> + xegpu.store_nd %c, %arg2: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16> + + //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x32xf16, #xegpu.layout> + %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> + %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> + %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> + scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc + : !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16> + } + gpu.return + } +} + +// ----- +gpu.module @test_kernel { + gpu.func @elementwise_with_inst_data_12(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) { + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %m = arith.muli %block_id_x, %c32 : index + + %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> + %b_tdesc = 
xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> + %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> + + %out:3 = scf.for %k = %c0 to %c1024 step %c32 + iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) + -> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) { + //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout} : + //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout> -> vector<12x32xf16> + %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> + %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> + + //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout} : vector<12x32xf16> + %c = arith.addf %a, %b : vector<12x32xf16> + + //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16, #xegpu.layout>> + xegpu.store_nd %c, %arg2: vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16> + + //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<12x32xf16, #xegpu.layout> + %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> + %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> + %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> + scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc + : !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16> + } + gpu.return + } +} + +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @scatter_ops_chunksize( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { +// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> +// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> +// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 
8 : i64}> +// CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> +// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +func.func @scatter_ops_chunksize(%src: memref<256xf16>) { + %1 = arith.constant dense<1>: vector<16xi1> + %offset = arith.constant dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> + : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> + xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> + : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + return +} +} diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 30f785ded975a..543e119d81d88 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -1,5 +1,6 @@ -// RUN: mlir-opt -xegpu-propagate-layout -split-input-file %s | FileCheck %s +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout -split-input-file %s | FileCheck %s +gpu.module @test { // CHECK-LABEL: func.func @dpas_f16( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> @@ -25,8 +26,10 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @dpas_i8( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) { // CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_result_0 = #xegpu.layout, %arg1: vector<32x16xi8>, %arg2: memre 
xegpu.store_nd %0, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @load_with_transpose_effect( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{transpose = array}> {layout_result_0 = #xegpu.layout} : @@ -55,8 +60,10 @@ func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_transpose( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %{{.*}} = vector.transpose %{{.*}}, [1, 0] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> to vector<16x16xf16> @@ -73,8 +80,10 @@ func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, % xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @extf_truncf( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) -> vector<8x16xf32> { @@ -88,8 +97,10 @@ func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor %4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> return %4 : vector<8x16xf32> } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @load_gather_with_chunksize( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} @@ -113,8 +124,10 @@ func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256 
xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @load_gather_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} @@ -132,8 +145,9 @@ func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf xegpu.store_nd %1, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @store_scatter_with_chunksize( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<128xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %{{.*}} : memref<128xf32>, vector<16xindex> -> @@ -148,8 +162,9 @@ func.func @store_scatter_with_chunksize(%arg0: memref<128xf32>) { xegpu.store %cst, %0, %cst_0 : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @store_scatter_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, @@ -161,8 +176,9 @@ func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { xegpu.store %arg0, %0, %cst_0 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @scatter_ops_chunksize( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> @@ -179,8 +195,9 @@ func.func @scatter_ops_chunksize(%src: memref<256xf16>) { : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @scatter_ops( // 
CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> @@ -195,8 +212,9 @@ func.func @scatter_ops(%src: memref<256xf16>) { xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( // CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout> -> vector<8x16xi16> @@ -219,8 +237,9 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1 xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i32_to_f16( // CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> @@ -239,8 +258,9 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8 xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_i32( // CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout> -> vector<8x32xi16> @@ -255,8 +275,9 @@ func.func @vector_bitcast_i16_to_i32(%arg0: memref<8x32xi16>, %arg1: memref<8x16 xegpu.store_nd %3, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_require_cross_lane_shuffle( // CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32> // CHECK: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout} @@ -270,9 +291,10 @@ func.func @vector_bitcast_require_cross_lane_shuffle(%arg0: memref<8x16xi32>, %a 
xegpu.store_nd %3, %1 : vector<8x32xi16>, !xegpu.tensor_desc<8x32xi16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @binary_op_one_use( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, @@ -291,8 +313,9 @@ func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu. xegpu.store_nd %4, %arg2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @binary_op_multiple_uses( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, @@ -312,8 +335,9 @@ func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: ! xegpu.store_nd %2, %arg3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @for_op( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x128xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<128x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> @@ -353,8 +377,9 @@ func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: me xegpu.store_nd %2#2, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @if_single_use( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, @@ -381,8 +406,9 @@ func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tens xegpu.store_nd %2, %arg3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @if_multiple_uses( // 
CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, @@ -411,8 +437,9 @@ func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t xegpu.store_nd %1, %arg4 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_outer_reduction( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout>) { // CHECK: %{{.*}} = vector.multi_reduction , %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout} [0] : vector<16x16xf32> to vector<16xf32> @@ -422,8 +449,9 @@ func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_inner_reduction( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout>) { // CHECK: %{{.*}} = vector.multi_reduction , %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout} [1] : vector<16x16xf32> to vector<16xf32> @@ -433,8 +461,9 @@ func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @update_nd_offset_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> @@ -448,8 +477,9 @@ func.func @update_nd_offset_1d(%arg0: memref<256xf32>){ xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @update_nd_offset_2d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) { // CHECK: %[[T0:.*]] 
= xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout> @@ -463,8 +493,9 @@ func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){ xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @prefetch_2d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> @@ -475,8 +506,9 @@ func.func @prefetch_2d(%arg0: memref<256x256xf16>){ xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @prefetch_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> @@ -487,8 +519,9 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){ xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @scf_while_and_condition( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: %{{.*}}:3 = scf.while ({{.*}}) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout>) @@ -520,8 +553,9 @@ func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32 } return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { @@ -541,8 +575,9 @@ func.func @vector_shape_cast_1d_to_2d_dim1_distributed(%arg0: !xegpu.tensor_desc 
xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { @@ -563,3 +598,4 @@ func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } +} From 4fc6b2cf04c21c5607793d4852876db11f6f4cf5 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 31 Oct 2025 09:42:07 -0700 Subject: [PATCH 360/539] [ADT] Remove ArrayRef(std::nullopt_t) (#165831) ArrayRef(std::nullopt_t) has been deprecated since: commit 2529de5c935ad59e5f76d15890f857bf42817bc9 Author: Kazu Hirata Date: Fri Jun 27 01:03:02 2025 -0700 Note that we've made at lease one release, llvmorg-21.1, with this deprecation. --- llvm/include/llvm/ADT/ArrayRef.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/include/llvm/ADT/ArrayRef.h b/llvm/include/llvm/ADT/ArrayRef.h index 448d10013d371..450f4d04c97fc 100644 --- a/llvm/include/llvm/ADT/ArrayRef.h +++ b/llvm/include/llvm/ADT/ArrayRef.h @@ -66,10 +66,6 @@ namespace llvm { /// Construct an empty ArrayRef. /*implicit*/ ArrayRef() = default; - /// Construct an empty ArrayRef from std::nullopt. - /*implicit*/ LLVM_DEPRECATED("Use {} or ArrayRef() instead", "{}") - ArrayRef(std::nullopt_t) {} - /// Construct an ArrayRef from a single element. /*implicit*/ ArrayRef(const T &OneElt LLVM_LIFETIME_BOUND) : Data(&OneElt), Length(1) {} From d016b537f03fb1e5e240c5a222ed336a2a3d5a45 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 31 Oct 2025 09:42:15 -0700 Subject: [PATCH 361/539] [MC] Remove SMRange(std::nullopt_t) (#165832) This patch removes SMRange(std::nullopt_t) to reduce the number of uses of std::nullopt outside the context of std::optional. 
Since there are only a handful of uses, this patch removes the constructor without going through deprecation. The use of std::nullopt here has its root in llvm::None, which was used as a convenient way to indicate "nothing" before we migrated llvm::Optional to std::optional. --- llvm/include/llvm/MC/MCParser/MCAsmParser.h | 13 +++++-------- llvm/include/llvm/Support/SMLoc.h | 2 -- llvm/lib/FileCheck/FileCheckImpl.h | 2 +- llvm/lib/MC/MCParser/AsmParser.cpp | 10 ++++------ llvm/lib/MC/MCParser/MasmParser.cpp | 10 ++++------ llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 6 +++--- llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 6 +++--- llvm/utils/TableGen/AsmMatcherEmitter.cpp | 2 +- 8 files changed, 21 insertions(+), 30 deletions(-) diff --git a/llvm/include/llvm/MC/MCParser/MCAsmParser.h b/llvm/include/llvm/MC/MCParser/MCAsmParser.h index e3f44a08db641..5d74b76592df9 100644 --- a/llvm/include/llvm/MC/MCParser/MCAsmParser.h +++ b/llvm/include/llvm/MC/MCParser/MCAsmParser.h @@ -209,28 +209,25 @@ class LLVM_ABI MCAsmParser { MCInstPrinter *IP, MCAsmParserSemaCallback &SI) = 0; /// Emit a note at the location \p L, with the message \p Msg. - virtual void Note(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) = 0; + virtual void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) = 0; /// Emit a warning at the location \p L, with the message \p Msg. /// /// \return The return value is true, if warnings are fatal. - virtual bool Warning(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) = 0; + virtual bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) = 0; /// Return an error at the location \p L, with the message \p Msg. This /// may be modified before being emitted. /// /// \return The return value is always true, as an idiomatic convenience to /// clients. 
- bool Error(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt); + bool Error(SMLoc L, const Twine &Msg, SMRange Range = {}); /// Emit an error at the location \p L, with the message \p Msg. /// /// \return The return value is always true, as an idiomatic convenience to /// clients. - virtual bool printError(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) = 0; + virtual bool printError(SMLoc L, const Twine &Msg, SMRange Range = {}) = 0; bool hasPendingError() { return !PendingErrors.empty(); } @@ -255,7 +252,7 @@ class LLVM_ABI MCAsmParser { const AsmToken &getTok() const; /// Report an error at the current lexer location. - bool TokError(const Twine &Msg, SMRange Range = std::nullopt); + bool TokError(const Twine &Msg, SMRange Range = {}); bool parseTokenLoc(SMLoc &Loc); bool parseToken(AsmToken::TokenKind T, const Twine &Msg = "unexpected token"); diff --git a/llvm/include/llvm/Support/SMLoc.h b/llvm/include/llvm/Support/SMLoc.h index c80969b1d83dc..b7ae6e488cde9 100644 --- a/llvm/include/llvm/Support/SMLoc.h +++ b/llvm/include/llvm/Support/SMLoc.h @@ -15,7 +15,6 @@ #define LLVM_SUPPORT_SMLOC_H #include -#include namespace llvm { @@ -50,7 +49,6 @@ class SMRange { SMLoc Start, End; SMRange() = default; - SMRange(std::nullopt_t) {} SMRange(SMLoc St, SMLoc En) : Start(St), End(En) { assert(Start.isValid() == End.isValid() && "Start and End should either both be valid or both be invalid!"); diff --git a/llvm/lib/FileCheck/FileCheckImpl.h b/llvm/lib/FileCheck/FileCheckImpl.h index a08502e4497e3..5851cfc4b5d5c 100644 --- a/llvm/lib/FileCheck/FileCheckImpl.h +++ b/llvm/lib/FileCheck/FileCheckImpl.h @@ -528,7 +528,7 @@ class ErrorDiagnostic : public ErrorInfo { SMRange getRange() const { return Range; } static Error get(const SourceMgr &SM, SMLoc Loc, const Twine &ErrMsg, - SMRange Range = std::nullopt) { + SMRange Range = {}) { return make_error( SM.GetMessage(Loc, SourceMgr::DK_Error, ErrMsg), Range); } diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp 
b/llvm/lib/MC/MCParser/AsmParser.cpp index dd1bc2be5feb4..3c9ab8e108ddd 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -228,11 +228,9 @@ class AsmParser : public MCAsmParser { AssemblerDialect = i; } - void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) override; - bool Warning(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) override; - bool printError(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) override; + void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) override; + bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) override; + bool printError(SMLoc L, const Twine &Msg, SMRange Range = {}) override; const AsmToken &Lex() override; @@ -312,7 +310,7 @@ class AsmParser : public MCAsmParser { void printMacroInstantiations(); void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg, - SMRange Range = std::nullopt) const { + SMRange Range = {}) const { ArrayRef Ranges(Range); SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges); } diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index 8a8f11122673f..34629548872c0 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -483,11 +483,9 @@ class MasmParser : public MCAsmParser { AssemblerDialect = i; } - void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) override; - bool Warning(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) override; - bool printError(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) override; + void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) override; + bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) override; + bool printError(SMLoc L, const Twine &Msg, SMRange Range = {}) override; enum ExpandKind { ExpandMacros, DoNotExpandMacros }; const AsmToken &Lex(ExpandKind ExpandNextToken); @@ -592,7 +590,7 @@ class MasmParser : public MCAsmParser { bool 
expandStatement(SMLoc Loc); void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg, - SMRange Range = std::nullopt) const { + SMRange Range = {}) const { ArrayRef Ranges(Range); SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges); } diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index f60660b12baca..1bb670d195a98 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -426,15 +426,15 @@ class ARMAsmParser : public MCTargetAsmParser { VPTState.CurPosition = ~0U; } - void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) { + void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) { return getParser().Note(L, Msg, Range); } - bool Warning(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) { + bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) { return getParser().Warning(L, Msg, Range); } - bool Error(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) { + bool Error(SMLoc L, const Twine &Msg, SMRange Range = {}) { return getParser().Error(L, Msg, Range); } diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 127ee67517aea..b7ea6729cac79 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1121,7 +1121,7 @@ class X86AsmParser : public MCTargetAsmParser { void setTypeInfo(AsmTypeInfo Type) { CurType = Type; } }; - bool Error(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt, + bool Error(SMLoc L, const Twine &Msg, SMRange Range = {}, bool MatchingInlineAsm = false) { MCAsmParser &Parser = getParser(); if (MatchingInlineAsm) { @@ -4322,7 +4322,7 @@ bool X86AsmParser::matchAndEmitATTInstruction( SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { X86Operand &Op = static_cast(*Operands[0]); - SMRange 
EmptyRange = std::nullopt; + SMRange EmptyRange; // In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode // when matching the instruction. if (ForcedDataPrefix == X86::Is32Bit) @@ -4548,7 +4548,7 @@ bool X86AsmParser::matchAndEmitIntelInstruction( SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { X86Operand &Op = static_cast(*Operands[0]); - SMRange EmptyRange = std::nullopt; + SMRange EmptyRange; // Find one unsized memory operand, if present. X86Operand *UnsizedMemOp = nullptr; for (const auto &Op : Operands) { diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp index e1f2f06d755f1..9f18a11c236c0 100644 --- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp +++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp @@ -4164,7 +4164,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " MII.getDeprecatedInfo(Inst, getSTI(), Info)) {\n"; OS << " SMLoc Loc = ((" << Target.getName() << "Operand &)*Operands[0]).getStartLoc();\n"; - OS << " getParser().Warning(Loc, Info, std::nullopt);\n"; + OS << " getParser().Warning(Loc, Info, {});\n"; OS << " }\n"; } From 692eb48951b8c886588ce134c8ba51113151c0dc Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 31 Oct 2025 09:42:23 -0700 Subject: [PATCH 362/539] [SPIRV] Remove a redundant cast (NFC) (#165833) FinalFlags is already of type unsigned. 
--- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index f7cdfcb65623b..db036a55ee6c6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -613,8 +613,7 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, << FinalFlags << "\n"; MachineInstr *OrigMINonConst = const_cast(OrigMI); MachineOperand &OrigFlagsOp = OrigMINonConst->getOperand(2); - OrigFlagsOp = - MachineOperand::CreateImm(static_cast(FinalFlags)); + OrigFlagsOp = MachineOperand::CreateImm(FinalFlags); return; // Merge done, so we found a duplicate; don't add it to MAI.MS } } From c259fc92e618feb13cd9b8b47cce2ce2380220a1 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 31 Oct 2025 09:42:31 -0700 Subject: [PATCH 363/539] [llvm] Proofread several *.rst files (#165835) This patch mechanically replaces: - "i.e." with "i.e.," - "e.g." with "e.g.," --- llvm/docs/CodeGenerator.rst | 40 +++++----- llvm/docs/LangRef.rst | 136 ++++++++++++++++---------------- llvm/docs/ProgrammersManual.rst | 26 +++--- 3 files changed, 101 insertions(+), 101 deletions(-) diff --git a/llvm/docs/CodeGenerator.rst b/llvm/docs/CodeGenerator.rst index fc704a3cdd51f..a74f16d7e9477 100644 --- a/llvm/docs/CodeGenerator.rst +++ b/llvm/docs/CodeGenerator.rst @@ -498,7 +498,7 @@ The ``MachineBasicBlock`` class The ``MachineBasicBlock`` class contains a list of machine instructions (:raw-html:`` `MachineInstr`_ :raw-html:`` instances). It roughly corresponds to the LLVM code input to the instruction selector, but there can be -a one-to-many mapping (i.e. one LLVM basic block can map to multiple machine +a one-to-many mapping (i.e., one LLVM basic block can map to multiple machine basic blocks). 
The ``MachineBasicBlock`` class has a "``getBasicBlock``" method, which returns the LLVM basic block that it comes from. @@ -522,7 +522,7 @@ LLVM code generator can model sequences of instructions as MachineInstr bundles. A MI bundle can model a VLIW group / pack which contains an arbitrary number of parallel instructions. It can also be used to model a sequential list of instructions (potentially with data dependencies) that cannot be legally -separated (e.g. ARM Thumb2 IT blocks). +separated (e.g., ARM Thumb2 IT blocks). Conceptually a MI bundle is a MI with a number of other MIs nested within: @@ -583,8 +583,8 @@ Packing / bundling of MachineInstrs for VLIW architectures should generally be done as part of the register allocation super-pass. More specifically, the pass which determines what MIs should be bundled together should be done after code generator exits SSA form -(i.e. after two-address pass, PHI elimination, and copy coalescing). -Such bundles should be finalized (i.e. adding BUNDLE MIs and input and +(i.e., after two-address pass, PHI elimination, and copy coalescing). +Such bundles should be finalized (i.e., adding BUNDLE MIs and input and output register MachineOperands) after virtual registers have been rewritten into physical registers. This eliminates the need to add virtual register operands to BUNDLE instructions which would @@ -615,7 +615,7 @@ The ``MCStreamer`` API ---------------------- MCStreamer is best thought of as an assembler API. It is an abstract API which -is *implemented* in different ways (e.g. to output a ``.s`` file, output an ELF ``.o`` +is *implemented* in different ways (e.g., to output a ``.s`` file, output an ELF ``.o`` file, etc) but whose API corresponds directly to what you see in a ``.s`` file. MCStreamer has one method per directive, such as EmitLabel, EmitSymbolAttribute, switchSection, emitValue (for .byte, .word), etc, which directly correspond to @@ -631,7 +631,7 @@ directives through MCStreamer. 
On the implementation side of MCStreamer, there are two major implementations: one for writing out a ``.s`` file (MCAsmStreamer), and one for writing out a ``.o`` file (MCObjectStreamer). MCAsmStreamer is a straightforward implementation -that prints out a directive for each method (e.g. ``EmitValue -> .byte``), but +that prints out a directive for each method (e.g., ``EmitValue -> .byte``), but MCObjectStreamer implements a full assembler. For target-specific directives, the MCStreamer has a MCTargetStreamer instance. @@ -681,7 +681,7 @@ The ``MCSection`` class ----------------------- The ``MCSection`` class represents an object-file specific section. It is -subclassed by object file specific implementations (e.g. ``MCSectionMachO``, +subclassed by object file specific implementations (e.g., ``MCSectionMachO``, ``MCSectionCOFF``, ``MCSectionELF``) and these are created and uniqued by MCContext. The MCStreamer has a notion of the current section, which can be changed with the SwitchToSection method (which corresponds to a ".section" @@ -696,7 +696,7 @@ The ``MCInst`` class is a target-independent representation of an instruction. It is a simple class (much more so than `MachineInstr`_) that holds a target-specific opcode and a vector of MCOperands. MCOperand, in turn, is a simple discriminated union of three cases: 1) a simple immediate, 2) a target -register ID, 3) a symbolic expression (e.g. "``Lfoo-Lbar+42``") as an MCExpr. +register ID, 3) a symbolic expression (e.g., "``Lfoo-Lbar+42``") as an MCExpr. MCInst is the common currency used to represent machine instructions at the MC layer. It is the type used by the instruction encoder, the instruction printer, @@ -711,9 +711,9 @@ The MC layer's object writers support a variety of object formats. Because of target-specific aspects of object formats each target only supports a subset of the formats supported by the MC layer. Most targets support emitting ELF objects. 
Other vendor-specific objects are generally supported only on targets -that are supported by that vendor (i.e. MachO is only supported on targets +that are supported by that vendor (i.e., MachO is only supported on targets supported by Darwin, and XCOFF is only supported on targets that support AIX). -Additionally some targets have their own object formats (i.e. DirectX, SPIR-V +Additionally some targets have their own object formats (i.e., DirectX, SPIR-V and WebAssembly). The table below captures a snapshot of object file support in LLVM: @@ -769,7 +769,7 @@ Introduction to SelectionDAGs The SelectionDAG provides an abstraction for code representation in a way that is amenable to instruction selection using automatic techniques -(e.g. dynamic-programming based optimal pattern matching selectors). It is also +(e.g., dynamic-programming based optimal pattern matching selectors). It is also well-suited to other phases of code generation; in particular, instruction scheduling (SelectionDAG's are very close to scheduling DAGs post-selection). Additionally, the SelectionDAG provides a host representation where a large @@ -898,7 +898,7 @@ Initial SelectionDAG Construction The initial SelectionDAG is na\ :raw-html:`ï`\ vely peephole expanded from the LLVM input by the ``SelectionDAGBuilder`` class. The intent of this pass is to expose as much low-level, target-specific details to the SelectionDAG as -possible. This pass is mostly hard-coded (e.g. an LLVM ``add`` turns into an +possible. This pass is mostly hard-coded (e.g., an LLVM ``add`` turns into an ``SDNode add`` while a ``getelementptr`` is expanded into the obvious arithmetic). This pass requires target-specific hooks to lower calls, returns, varargs, etc. For these features, the :raw-html:`` `TargetLowering`_ @@ -944,7 +944,7 @@ The Legalize phase is in charge of converting a DAG to only use the operations that are natively supported by the target. 
Targets often have weird constraints, such as not supporting every operation on -every supported data type (e.g. X86 does not support byte conditional moves and +every supported data type (e.g., X86 does not support byte conditional moves and PowerPC does not support sign-extending loads from a 16-bit memory location). Legalize takes care of this by open-coding another sequence of operations to emulate the operation ("expansion"), by promoting one type to a larger type that @@ -995,7 +995,7 @@ SelectionDAG Optimization Phase: the DAG Combiner The SelectionDAG optimization phase is run multiple times for code generation, immediately after the DAG is built and once after each legalization. The first -run of the pass allows the initial code to be cleaned up (e.g. performing +run of the pass allows the initial code to be cleaned up (e.g., performing optimizations that depend on knowing that the operators have restricted type inputs). Subsequent runs of the pass clean up the messy code generated by the Legalize passes, which allows Legalize to be very simple (it can focus on making @@ -1120,10 +1120,10 @@ for your target. It has the following strengths: 16-bits of the immediate). * When using the 'Pat' class to map a pattern to an instruction that has one - or more complex operands (like e.g. `X86 addressing mode`_), the pattern may + or more complex operands (like e.g., `X86 addressing mode`_), the pattern may either specify the operand as a whole using a ``ComplexPattern``, or else it may specify the components of the complex operand separately. The latter is - done e.g. for pre-increment instructions by the PowerPC back end: + done e.g., for pre-increment instructions by the PowerPC back end: :: @@ -1145,13 +1145,13 @@ While it has many strengths, the system currently has some limitations, primarily because it is a work in progress and is not yet finished: * Overall, there is no way to define or match SelectionDAG nodes that define - multiple values (e.g. 
``SMUL_LOHI``, ``LOAD``, ``CALL``, etc). This is the + multiple values (e.g., ``SMUL_LOHI``, ``LOAD``, ``CALL``, etc). This is the biggest reason that you currently still *have to* write custom C++ code for your instruction selector. * There is no great way to support matching complex addressing modes yet. In the future, we will extend pattern fragments to allow them to define multiple - values (e.g. the four operands of the `X86 addressing mode`_, which are + values (e.g., the four operands of the `X86 addressing mode`_, which are currently matched with custom C++ code). In addition, we'll extend fragments so that a fragment can match multiple different patterns. @@ -1175,7 +1175,7 @@ SelectionDAG Scheduling and Formation Phase The scheduling phase takes the DAG of target instructions from the selection phase and assigns an order. The scheduler can pick an order depending on -various constraints of the machines (i.e. order for minimal register pressure or +various constraints of the machines (i.e., order for minimal register pressure or try to cover instruction latencies). Once an order is established, the DAG is converted to a list of :raw-html:`` `MachineInstr`_\s :raw-html:`` and the SelectionDAG is destroyed. @@ -1615,7 +1615,7 @@ Since the MC layer works at the level of abstraction of object files, it doesn't have a notion of functions, global variables etc. Instead, it thinks about labels, directives, and instructions. A key class used at this time is the MCStreamer class. This is an abstract API that is implemented in different ways -(e.g. to output a ``.s`` file, output an ELF ``.o`` file, etc) that is effectively an +(e.g., to output a ``.s`` file, output an ELF ``.o`` file, etc) that is effectively an "assembler API". MCStreamer has one method per directive, such as EmitLabel, EmitSymbolAttribute, switchSection, etc, which directly correspond to assembly level directives. 
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 1c6823be44dcb..54c7d0fdfbf18 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -159,7 +159,7 @@ There are two kinds of escapes. * ``\\`` represents a single ``\`` character. * ``\`` followed by two hexadecimal characters (0-9, a-f, or A-F) - represents the byte with the given value (e.g. ``\00`` represents a + represents the byte with the given value (e.g., ``\00`` represents a null byte). To represent a ``"`` character, use ``\22``. (``\"`` will end the string @@ -168,7 +168,7 @@ with a trailing ``\``.) Newlines do not terminate string constants; strings can span multiple lines. -The interpretation of string constants (e.g. their character encoding) +The interpretation of string constants (e.g., their character encoding) depends on context. @@ -330,7 +330,7 @@ added in the future: the function (as does normal C). "``fastcc``" - The fast calling convention This calling convention attempts to make calls as fast as possible - (e.g. by passing things in registers). This calling convention + (e.g., by passing things in registers). This calling convention allows the target to use whatever tricks it wants to produce fast code for the target, without having to conform to an externally specified ABI (Application Binary Interface). `Tail calls can only @@ -465,7 +465,7 @@ added in the future: This calling convention doesn't preserve any general registers. So all general registers are caller saved registers. It also uses all general registers to pass arguments. This attribute doesn't impact non-general - purpose registers (e.g. floating point registers, on X86 XMMs/YMMs). + purpose registers (e.g., floating point registers, on X86 XMMs/YMMs). Non-general purpose registers still follow the standard C calling convention. Currently it is for x86_64 and AArch64 only. 
"``cxx_fast_tlscc``" - The `CXX_FAST_TLS` calling convention for access functions @@ -700,7 +700,7 @@ Unstable pointer representation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Pointers in this address space have an *unspecified* bitwise representation -(i.e. not backed by a fixed integer). The bitwise pattern of such pointers is +(i.e., not backed by a fixed integer). The bitwise pattern of such pointers is allowed to change in a target-specific way. For example, this could be a pointer type used with copying garbage collection where the garbage collector could update the pointer at any time in the collection sweep. @@ -903,7 +903,7 @@ size is unknown at compile time. They are allowed in structs to facilitate intrinsics returning multiple values. Generally, structs containing scalable vectors are not considered "sized" and cannot be used in loads, stores, allocas, or GEPs. The only exception to this rule is for structs that contain scalable -vectors of the same type (e.g. ``{, }`` +vectors of the same type (e.g., ``{, }`` contains the same type while ``{, }`` doesn't). These kinds of structs (we may call them homogeneous scalable vector structs) are considered sized and can be used in loads, stores, allocas, but @@ -1221,7 +1221,7 @@ sections. Note that certain IR constructs like global variables and functions may create COMDATs in the object file in addition to any which are specified using COMDAT IR. This arises when the code generator is configured to emit globals -in individual sections (e.g. when `-data-sections` or `-function-sections` +in individual sections (e.g., when `-data-sections` or `-function-sections` is supplied to `llc`). .. _namedmetadatastructure: @@ -1722,7 +1722,7 @@ Currently, only the following parameter attributes are defined: The function parameter marked with this attribute is the alignment in bytes of the newly allocated block returned by this function. The returned value must either have the specified alignment or be the null pointer. 
The return value MAY be more aligned - than the requested alignment, but not less aligned. Invalid (e.g. non-power-of-2) + than the requested alignment, but not less aligned. Invalid (e.g., non-power-of-2) alignments are permitted for the allocalign parameter, so long as the returned pointer is null. This attribute may only be applied to integer parameters. @@ -1989,7 +1989,7 @@ functions will use the same set of attributes. In the degenerate case of a group will capture the important command line flags used to build that file. An attribute group is a module-level object. To use an attribute group, an -object references the attribute group's ID (e.g. ``#37``). An object may refer +object references the attribute group's ID (e.g., ``#37``). An object may refer to more than one attribute group. In that situation, the attributes from the different groups are merged. @@ -2222,7 +2222,7 @@ For example: - ``errnomem``: This refers to accesses to the ``errno`` variable. - The default access kind (specified without a location prefix) applies to all locations that haven't been specified explicitly, including those that - don't currently have a dedicated location kind (e.g. accesses to globals + don't currently have a dedicated location kind (e.g., accesses to globals or captured pointers). If the ``memory`` attribute is not specified, then ``memory(readwrite)`` @@ -2713,7 +2713,7 @@ For example: ``mustprogress`` This attribute indicates that the function is required to return, unwind, - or interact with the environment in an observable way e.g. via a volatile + or interact with the environment in an observable way e.g., via a volatile memory access, I/O, or other synchronization. The ``mustprogress`` attribute is intended to model the requirements of the first section of [intro.progress] of the C++ Standard. As a consequence, a loop in a @@ -2851,7 +2851,7 @@ are grouped into a single :ref:`attribute group `. 
with `__attribute__((no_sanitize("memtag")))`, `__attribute__((disable_sanitizer_instrumentation))`, or included in the `-fsanitize-ignorelist` file. The AArch64 Globals Tagging pass may remove - this attribute when it's not possible to tag the global (e.g. it's a TLS + this attribute when it's not possible to tag the global (e.g., it's a TLS variable). ``sanitize_address_dyninit`` This attribute indicates that the global variable, when instrumented with @@ -3076,7 +3076,7 @@ the behavior is undefined, unless one of the following exceptions applies: * ``dereferenceable()`` operand bundles only guarantee the pointer is dereferenceable at the point of the assumption. The pointer may not be - dereferenceable at later pointers, e.g. because it could have been freed. + dereferenceable at later pointers, e.g., because it could have been freed. In addition to allowing operand bundles encoding function and parameter attributes, an assume operand bundle may also encode a ``separate_storage`` @@ -3270,7 +3270,7 @@ as follows: address space 0. Note: variable declarations without an address space are always created in address space 0, this property only affects the default value to be used - when creating globals without additional contextual information (e.g. in + when creating globals without additional contextual information (e.g., in LLVM passes). .. _alloca_addrspace: @@ -3282,7 +3282,7 @@ as follows: This specifies the properties of a pointer in address space ``as``. The ```` parameter specifies the size of the bitwise representation. For :ref:`non-integral pointers ` the representation size may - be larger than the address width of the underlying address space (e.g. to + be larger than the address width of the underlying address space (e.g., to accommodate additional metadata). The alignment requirements are specified via the ```` and ````\erred alignments parameters. @@ -3478,7 +3478,7 @@ variables) may *not* change their size. 
(``realloc``-style operations do not change the size of an existing allocated object; instead, they create a new allocated object. Even if the object is at the same location as the old one, old pointers cannot be used to access this new object.) However, allocated objects -can also be created by means not recognized by LLVM, e.g. by directly calling +can also be created by means not recognized by LLVM, e.g., by directly calling ``mmap``. Those allocated objects are allowed to grow to the right (i.e., keeping the same base address, but increasing their size) while maintaining the validity of existing pointers, as long as they always satisfy the properties @@ -3632,7 +3632,7 @@ through the return value only: } However, we always consider direct inspection of the pointer address -(e.g. using ``ptrtoint``) to be location-independent. The following example +(e.g., using ``ptrtoint``) to be location-independent. The following example is *not* considered a return-only capture, even though the ``ptrtoint`` ultimately only contributes to the return value: @@ -4145,7 +4145,7 @@ output, given the original flags. ``a * (c / b)`` can be rewritten into ``a / (b / c)``. ``contract`` - Allow floating-point contraction (e.g. fusing a multiply followed by an + Allow floating-point contraction (e.g., fusing a multiply followed by an addition into a fused multiply-and-add). This does not enable reassociation to form arbitrary contractions. For example, ``(a*b) + (c*d) + e`` can not be transformed into ``(a*b) + ((c*d) + e)`` to create two fma operations. @@ -4440,7 +4440,7 @@ the default globals address space and ``addrspace("P")`` the program address space. The representation of pointers can be different for each address space and does -not necessarily need to be a plain integer address (e.g. for +not necessarily need to be a plain integer address (e.g., for :ref:`non-integral pointers `). 
In addition to a representation bits size, pointers in each address space also have an index size which defines the bitwidth of indexing operations as well as the size of `integer addresses` @@ -4750,7 +4750,7 @@ is inserted as defined by the DataLayout string in the module, which is required to match what the underlying code generator expects. Structures can either be "literal" or "identified". A literal structure -is defined inline with other types (e.g. ``[2 x {i32, i32}]``) whereas +is defined inline with other types (e.g., ``[2 x {i32, i32}]``) whereas identified types are always defined at the top level with a name. Literal types are uniqued by their contents and can never be recursive or opaque since there is no way to write one. Identified types can be @@ -4791,7 +4791,7 @@ Simple Constants Standard integers (such as '4') are constants of the :ref:`integer ` type. They can be either decimal or hexadecimal. Decimal integers can be prefixed with - to represent - negative integers, e.g. '``-1234``'. Hexadecimal integers must be + negative integers, e.g., '``-1234``'. Hexadecimal integers must be prefixed with either u or s to indicate whether they are unsigned or signed respectively. e.g '``u0x8000``' gives 32768, whilst '``s0x8000``' gives -32768. @@ -4801,7 +4801,7 @@ Simple Constants zeros. So '``s0x0001``' of type '``i16``' will be -1, not 1. **Floating-point constants** Floating-point constants use standard decimal notation (e.g. - 123.421), exponential notation (e.g. 1.23421e+2), or a more precise + 123.421), exponential notation (e.g., 1.23421e+2), or a more precise hexadecimal notation (see below). The assembler requires the exact decimal value of a floating-point constant. For example, the assembler accepts 1.25 but rejects 1.3 because 1.3 is a repeating @@ -4883,7 +4883,7 @@ constants and smaller complex constants. 
The string '``zeroinitializer``' can be used to zero initialize a value to zero of *any* type, including scalar and :ref:`aggregate ` types. This is often used to avoid - having to print large zero initializers (e.g. for large arrays) and + having to print large zero initializers (e.g., for large arrays) and is always exactly equivalent to using explicit zero initializers. **Metadata node** A metadata node is a constant tuple without types. For example: @@ -5286,7 +5286,7 @@ Constant Expressions Constant expressions are used to allow expressions involving other constants to be used as constants. Constant expressions may be of any :ref:`first class ` type and may involve any LLVM operation -that does not have side effects (e.g. load and call are not supported). +that does not have side effects (e.g., load and call are not supported). The following is the syntax for constant expressions: ``trunc (CST to TYPE)`` @@ -5472,7 +5472,7 @@ There are also three different categories of constraint codes: Output constraints """""""""""""""""" -Output constraints are specified by an "``=``" prefix (e.g. "``=r``"). This +Output constraints are specified by an "``=``" prefix (e.g., "``=r``"). This indicates that the assembly will write to this operand, and the operand will then be made available as a return value of the ``asm`` expression. Output constraints do not consume an argument from the call instruction. (Except, see @@ -5480,10 +5480,10 @@ below about indirect outputs). Normally, it is expected that no output locations are written to by the assembly expression until *all* of the inputs have been read. As such, LLVM may assign -the same register to an output and an input. If this is not safe (e.g. if the +the same register to an output and an input. If this is not safe (e.g., if the assembly contains two instructions, where the first writes to one output, and the second reads an input and writes to a second output), then the "``&``" -modifier must be used (e.g. 
"``=&r``") to specify that the output is an +modifier must be used (e.g., "``=&r``") to specify that the output is an "early-clobber" output. Marking an output as "early-clobber" ensures that LLVM will not use the same register for any inputs (other than an input tied to this output). @@ -5523,17 +5523,17 @@ However, this feature is often not as useful as you might think. Firstly, the registers are *not* guaranteed to be consecutive. So, on those architectures that have instructions which operate on multiple consecutive -instructions, this is not an appropriate way to support them. (e.g. the 32-bit +instructions, this is not an appropriate way to support them. (e.g., the 32-bit SparcV8 has a 64-bit load, which instruction takes a single 32-bit register. The hardware then loads into both the named register, and the next register. This feature of inline asm would not be useful to support that.) A few of the targets provide a template string modifier allowing explicit access -to the second register of a two-register operand (e.g. MIPS ``L``, ``M``, and +to the second register of a two-register operand (e.g., MIPS ``L``, ``M``, and ``D``). On such an architecture, you can actually access the second allocated register (yet, still, not any subsequent ones). But, in that case, you're still probably better off simply splitting the value into two separate operands, for -clarity. (e.g. see the description of the ``A`` constraint on X86, which, +clarity. (e.g., see the description of the ``A`` constraint on X86, which, despite existing only for use with this feature, is not really a good idea to use) @@ -5549,11 +5549,11 @@ rather than producing a return value. An indirect output constraint is an "output" only in that the asm is expected to write to the contents of the input memory location, instead of just read from it). -This is most typically used for memory constraint, e.g. 
"``=*m``", to pass the +This is most typically used for memory constraint, e.g., "``=*m``", to pass the address of a variable as a value. It is also possible to use an indirect *register* constraint, but only on output -(e.g. "``=*r``"). This will cause LLVM to allocate a register for an output +(e.g., "``=*r``"). This will cause LLVM to allocate a register for an output value normally, and then, separately emit a store to the address provided as input, after the provided inline asm. (It's not clear what value this functionality provides, compared to writing the store explicitly after the asm @@ -5570,7 +5570,7 @@ Clobber constraints A clobber constraint is indicated by a "``~``" prefix. A clobber does not consume an input operand, nor generate an output. Clobbers cannot use any of the general constraint code letters -- they may use only explicit register -constraints, e.g. "``~{eax}``". The one exception is that a clobber string of +constraints, e.g., "``~{eax}``". The one exception is that a clobber string of "``~{memory}``" indicates that the assembly writes to arbitrary undeclared memory locations -- not only the memory pointed to by a declared indirect output. @@ -5594,9 +5594,9 @@ Constraint Codes """""""""""""""" After a potential prefix comes constraint code, or codes. -A Constraint Code is either a single letter (e.g. "``r``"), a "``^``" character -followed by two letters (e.g. "``^wc``"), or "``{``" register-name "``}``" -(e.g. "``{eax}``"). +A Constraint Code is either a single letter (e.g., "``r``"), a "``^``" character +followed by two letters (e.g., "``^wc``"), or "``{``" register-name "``}``" +(e.g., "``{eax}``"). The one and two letter constraint codes are typically chosen to be the same as GCC's constraint codes. @@ -5973,11 +5973,11 @@ Target-independent: - ``a``: Print a memory reference. Targets might customize the output. - ``c``: Print an immediate integer constant unadorned, without - the target-specific immediate punctuation (e.g. 
no ``$`` prefix). + the target-specific immediate punctuation (e.g., no ``$`` prefix). - ``n``: Negate and print immediate integer constant unadorned, without the - target-specific immediate punctuation (e.g. no ``$`` prefix). + target-specific immediate punctuation (e.g., no ``$`` prefix). - ``l``: Print as an unadorned label, without the target-specific label - punctuation (e.g. no ``$`` prefix). + punctuation (e.g., no ``$`` prefix). AArch64: @@ -5998,7 +5998,7 @@ ARM: register). - ``P``: No effect. - ``q``: No effect. -- ``y``: Print a VFP single-precision register as an indexed double (e.g. print +- ``y``: Print a VFP single-precision register as an indexed double (e.g., print as ``d4[1]`` instead of ``s9``) - ``B``: Bitwise invert and print an immediate integer constant without ``#`` prefix. @@ -6114,18 +6114,18 @@ X86: - ``c``: Print an unadorned integer or symbol name. (The latter is target-specific behavior for this typically target-independent modifier). - ``A``: Print a register name with a '``*``' before it. -- ``b``: Print an 8-bit register name (e.g. ``al``); do nothing on a memory +- ``b``: Print an 8-bit register name (e.g., ``al``); do nothing on a memory operand. -- ``h``: Print the upper 8-bit register name (e.g. ``ah``); do nothing on a +- ``h``: Print the upper 8-bit register name (e.g., ``ah``); do nothing on a memory operand. -- ``w``: Print the 16-bit register name (e.g. ``ax``); do nothing on a memory +- ``w``: Print the 16-bit register name (e.g., ``ax``); do nothing on a memory operand. -- ``k``: Print the 32-bit register name (e.g. ``eax``); do nothing on a memory +- ``k``: Print the 32-bit register name (e.g., ``eax``); do nothing on a memory operand. -- ``q``: Print the 64-bit register name (e.g. ``rax``), if 64-bit registers are +- ``q``: Print the 64-bit register name (e.g., ``rax``), if 64-bit registers are available, otherwise the 32-bit register name; do nothing on a memory operand. 
- ``n``: Negate and print an unadorned integer, or, for operands other than an - immediate integer (e.g. a relocatable symbol expression), print a '-' before + immediate integer (e.g., a relocatable symbol expression), print a '-' before the operand. (The behavior for relocatable symbol expressions is a target-specific behavior for this typically target-independent modifier) - ``H``: Print a memory reference with additional offset +8. @@ -6883,7 +6883,7 @@ See :ref:`diexpression` for details. .. note:: ``DIExpression``\s are always printed and parsed inline; they can never be - referenced by an ID (e.g. ``!1``). + referenced by an ID (e.g., ``!1``). Some examples of expressions: @@ -8469,8 +8469,8 @@ that was typically cold and one allocating memory that was typically not cold. The format of the metadata describing a context specific profile (e.g. ``!1`` and ``!3`` above) requires a first operand that is a metadata node describing the context, followed by a list of string metadata tags describing -the profile behavior (e.g. ``cold`` and ``notcold``) above. The metadata nodes -describing the context (e.g. ``!2`` and ``!4`` above) are unique ids +the profile behavior (e.g., ``cold`` and ``notcold``) above. The metadata nodes +describing the context (e.g., ``!2`` and ``!4`` above) are unique ids corresponding to callsites, which can be matched to associated IR calls via :ref:`callsite metadata`. In practice these ids are formed via a hash of the callsite's debug info, and the associated call may be in a @@ -8946,7 +8946,7 @@ in syntax by a caret ('``^``'). The summary is parsed into a bitcode output, along with the Module IR, via the "``llvm-as``" tool. Tools that parse the Module IR for the purposes -of optimization (e.g. "``clang -x ir``" and "``opt``"), will ignore the +of optimization (e.g., "``clang -x ir``" and "``opt``"), will ignore the summary entries (just as they currently ignore summary entries in a bitcode input file). 
@@ -9176,7 +9176,7 @@ The optional ``Refs`` field looks like: refs: ((Ref)[, (Ref)]*) where each ``Ref`` contains a reference to the summary id of the referenced -value (e.g. ``^1``). +value (e.g., ``^1``). .. _typeidinfo_summary: @@ -10385,7 +10385,7 @@ bit width of the result. Because LLVM integers use a two's complement representation, and the result is the same width as the operands, this instruction returns the correct result for both signed and unsigned integers. If a full product -(e.g. ``i32`` * ``i32`` -> ``i64``) is needed, the operands should be +(e.g., ``i32`` * ``i32`` -> ``i64``) is needed, the operands should be sign-extended or zero-extended as appropriate to the width of the full product. @@ -11378,7 +11378,7 @@ allocation on any convenient boundary compatible with the type. '``type``' may be any sized type. Structs containing scalable vectors cannot be used in allocas unless all -fields are the same scalable vector type (e.g. ``{, +fields are the same scalable vector type (e.g., ``{, }`` contains the same type while ``{, }`` doesn't). @@ -12766,7 +12766,7 @@ pointer then a truncation is done. If ``value`` is smaller than the size of a pointer then a zero extension is done. If they are the same size, nothing is done (*no-op cast*). The behavior is equivalent to a ``bitcast``, however, the resulting value is not -guaranteed to be dereferenceable (e.g. if the result type is a +guaranteed to be dereferenceable (e.g., if the result type is a :ref:`non-integral pointers `). Example: @@ -14697,7 +14697,7 @@ C++ object with a non-trivial destructor. ``llvm.seh.scope.begin`` is used to m the start of the region; it is always called with ``invoke``, with the unwind block being the desired unwind destination for any potentially-throwing instructions within the region. `llvm.seh.scope.end` is used to mark when the scope ends -and the EH cleanup is no longer required (e.g. 
because the destructor is being +and the EH cleanup is no longer required (e.g., because the destructor is being called). .. _int_read_register: @@ -14737,7 +14737,7 @@ return the current value of the register, where possible. The where possible. A call to '``llvm.read_volatile_register``' is assumed to have side-effects -and possibly return a different value each time (e.g. for a timer register). +and possibly return a different value each time (e.g., for a timer register). This is useful to implement named register global variables that need to always be mapped to a specific register, as is common practice on @@ -15008,9 +15008,9 @@ flushes the instruction cache. Semantics: """""""""" -On platforms with coherent instruction and data caches (e.g. x86), this +On platforms with coherent instruction and data caches (e.g., x86), this intrinsic is a nop. On platforms with non-coherent instruction and data -cache (e.g. ARM, MIPS), the intrinsic is lowered either to appropriate +cache (e.g., ARM, MIPS), the intrinsic is lowered either to appropriate instructions or a system call, if cache flushing requires special privileges. @@ -15462,7 +15462,7 @@ A call to '``llvm.call.preallocated.arg``' must have a call site ``preallocated`` attribute. The type of the ``preallocated`` attribute must match the type used by the ``preallocated`` attribute of the corresponding argument at the preallocated call. The type is used in the case that an -``llvm.call.preallocated.setup`` does not have a corresponding call (e.g. due +``llvm.call.preallocated.setup`` does not have a corresponding call (e.g., due to DCE), where otherwise we cannot know how large the arguments are. It is undefined behavior if this is called with a token from an @@ -16656,7 +16656,7 @@ for large input values. .. note:: Currently, the default lowering of this intrinsic relies on the ``sincospi[f|l]`` - functions being available in the target's runtime (e.g. libc). 
+ functions being available in the target's runtime (e.g., libc). 
 
 When specified with the fast-math-flag 'afn', the result may be
 approximated using a less accurate calculation.
@@ -19719,7 +19719,7 @@ Arguments:
 """"""""""
 
 The integer operand is the loop trip count of the hardware-loop, and thus
-not e.g. the loop back-edge taken count.
+not, e.g., the loop back-edge taken count.
 
 Semantics:
 """"""""""
@@ -19758,7 +19758,7 @@ Arguments:
 """"""""""
 
 The integer operand is the loop trip count of the hardware-loop, and thus
-not e.g. the loop back-edge taken count.
+not, e.g., the loop back-edge taken count.
 
 Semantics:
 """"""""""
@@ -19794,7 +19794,7 @@ Arguments:
 """"""""""
 
 The integer operand is the loop trip count of the hardware-loop, and thus
-not e.g. the loop back-edge taken count.
+not, e.g., the loop back-edge taken count.
 
 Semantics:
 """"""""""
@@ -19832,7 +19832,7 @@ Arguments:
 """"""""""
 
 The integer operand is the loop trip count of the hardware-loop, and thus
-not e.g. the loop back-edge taken count.
+not, e.g., the loop back-edge taken count.
 
 Semantics:
 """"""""""
@@ -20768,7 +20768,7 @@ of the result's type, while maintaining the same element type.
 
 Semantics:
 """"""""""
 
-Other than the reduction operator (e.g. add) the way in which the concatenated
+Other than the reduction operator (e.g., add) the way in which the concatenated
 arguments is reduced is entirely unspecified. By their nature these intrinsics
 are not expected to be useful in isolation but instead implement the first
 phase of an overall reduction operation.
@@ -24286,7 +24286,7 @@ The arguments are scalar types to accommodate scalable vector types, for
 which it is unknown what the type of the step vector needs to be that
 enumerate its lanes without overflow.
 
-This mask ``%m`` can e.g. be used in masked load/store instructions. These
+This mask ``%m`` can, e.g., be used in masked load/store instructions. These
 intrinsics provide a hint to the backend. 
I.e., for a vector loop, the back-edge taken count of the original scalar loop is explicit as the second argument. @@ -27966,7 +27966,7 @@ The quiet comparison operation performed by if either argument is a SNAN. The signaling comparison operation performed by '``llvm.experimental.constrained.fcmps``' will raise an exception if either argument is a NAN (QNAN or SNAN). Such an exception -does not preclude a result being produced (e.g. exception might only +does not preclude a result being produced (e.g., exception might only set a flag), therefore the distinction between ordered and unordered comparisons is also relevant for the '``llvm.experimental.constrained.fcmps``' intrinsic. @@ -29983,7 +29983,7 @@ Semantics: On some platforms, the value returned by this intrinsic remains unchanged between loads in the same thread. On other platforms, it returns the same -global variable value, if any, e.g. ``@__stack_chk_guard``. +global variable value, if any, e.g., ``@__stack_chk_guard``. Currently some platforms have IR-level customized stack guard loading (e.g. X86 Linux) that is not handled by ``llvm.stackguard()``, while they should be diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst index d99b5843c2133..7b7a1ce8740f5 100644 --- a/llvm/docs/ProgrammersManual.rst +++ b/llvm/docs/ProgrammersManual.rst @@ -1043,7 +1043,7 @@ compared to ``end`` and found to be unequal (in particular, this marks the error as checked throughout the body of a range-based for loop), enabling early exit from the loop without redundant error checking. -Instances of the fallible iterator interface (e.g. FallibleChildIterator above) +Instances of the fallible iterator interface (e.g., FallibleChildIterator above) are wrapped using the ``make_fallible_itr`` and ``make_fallible_end`` functions. E.g.: @@ -1640,7 +1640,7 @@ dynamically smaller than N, no malloc is performed. 
This can be a big win in cases where the malloc/free call is far more expensive than the code that fiddles around with the elements. -This is good for vectors that are "usually small" (e.g. the number of +This is good for vectors that are "usually small" (e.g., the number of predecessors/successors of a block is usually less than 8). On the other hand, this makes the size of the ``SmallVector`` itself large, so you don't want to allocate lots of them (doing so will waste a lot of space). As such, @@ -1684,7 +1684,7 @@ to keep ``sizeof(SmallVector)`` around 64 bytes). .. code-block:: c++ - // DISCOURAGED: Clients cannot pass e.g. raw arrays. + // DISCOURAGED: Clients cannot pass e.g., raw arrays. hardcodedContiguousStorage(const SmallVectorImpl &In); // ENCOURAGED: Clients can pass any contiguous storage of Foo. allowsAnyContiguousStorage(ArrayRef In); @@ -1695,7 +1695,7 @@ to keep ``sizeof(SmallVector)`` around 64 bytes). allowsAnyContiguousStorage(Vec); // Works. } - // DISCOURAGED: Clients cannot pass e.g. SmallVector. + // DISCOURAGED: Clients cannot pass e.g., SmallVector. hardcodedSmallSize(SmallVector &Out); // ENCOURAGED: Clients can pass any SmallVector. allowsAnySmallSize(SmallVectorImpl &Out); @@ -2064,7 +2064,7 @@ so it can be embedded into heap data structures and returned by-value. On the other hand, ``std::string`` is highly inefficient for inline editing (e.g. concatenating a bunch of stuff together) and because it is provided by the standard library, its performance characteristics depend a lot of the host -standard library (e.g. libc++ and MSVC provide a highly optimized string class, +standard library (e.g., libc++ and MSVC provide a highly optimized string class, GCC contains a really slow implementation). The major disadvantage of ``std::string`` is that almost every operation that makes @@ -2198,7 +2198,7 @@ physical registers, virtual registers, or numbered basic blocks. 
``SparseMultiSet`` is useful for algorithms that need very fast clear/find/insert/erase of the entire collection, and iteration over sets of elements sharing a key. It is often a more efficient choice than using composite -data structures (e.g. vector-of-vectors, map-of-vectors). It is not intended for +data structures (e.g., vector-of-vectors, map-of-vectors). It is not intended for building composite data structures. .. _dss_FoldingSet: @@ -2268,7 +2268,7 @@ iteration. The difference between ``SetVector`` and other sets is that the order of iteration is guaranteed to match the order of insertion into the ``SetVector``. This property is really important for things like sets of pointers. Because pointer values -are non-deterministic (e.g. vary across runs of the program on different +are non-deterministic (e.g., vary across runs of the program on different machines), iterating over the pointers in the set will not be in a well-defined order. @@ -2473,7 +2473,7 @@ pair in the map, etc. ``std::map`` is most useful when your keys or values are very large, if you need to iterate over the collection in sorted order, or if you need stable iterators -into the map (i.e. they don't get invalidated if an insertion or deletion of +into the map (i.e., they don't get invalidated if an insertion or deletion of another element takes place). .. _dss_mapvector: @@ -2542,7 +2542,7 @@ There are several bit storage containers, and choosing when to use each is relatively straightforward. One additional option is ``std::vector``: we discourage its use for two -reasons 1) the implementation in many common compilers (e.g. commonly +reasons 1) the implementation in many common compilers (e.g., commonly available versions of GCC) is extremely inefficient and 2) the C++ standards committee is likely to deprecate this container and/or change it significantly somehow. In any case, please don't use it. @@ -2557,7 +2557,7 @@ It supports individual bit setting/testing, as well as set operations. 
The set operations take time O(size of bitvector), but operations are performed one word at a time, instead of one bit at a time. This makes the ``BitVector`` very fast for set operations compared to other containers. Use the ``BitVector`` when you expect -the number of set bits to be high (i.e. a dense set). +the number of set bits to be high (i.e., a dense set). .. _dss_smallbitvector: @@ -3305,7 +3305,7 @@ naming value definitions. The symbol table can provide a name for any Value_. Note that the ``SymbolTable`` class should not be directly accessed by most clients. It should only be used when iteration over the symbol table names themselves are required, which is very special purpose. Note that not all LLVM -Value_\ s have names, and those without names (i.e. they have an empty name) do +Value_\ s have names, and those without names (i.e., they have an empty name) do not exist in the symbol table. Symbol tables support iteration over the values in the symbol table with @@ -3871,7 +3871,7 @@ Important Public Members of the ``Instruction`` class * ``bool mayWriteToMemory()`` - Returns true if the instruction writes to memory, i.e. it is a ``call``, + Returns true if the instruction writes to memory, i.e., it is a ``call``, ``free``, ``invoke``, or ``store``. * ``unsigned getOpcode()`` @@ -3881,7 +3881,7 @@ Important Public Members of the ``Instruction`` class * ``Instruction *clone() const`` Returns another instance of the specified instruction, identical in all ways - to the original except that the instruction has no parent (i.e. it's not + to the original except that the instruction has no parent (i.e., it's not embedded into a BasicBlock_), and it has no name. .. _Constant: From 5f07758976a1e6517b1e6f862a567dd453821cdd Mon Sep 17 00:00:00 2001 From: Sarah Spall Date: Fri, 31 Oct 2025 09:49:32 -0700 Subject: [PATCH 364/539] [HLSL] Add NativeInt16Type langopt to control whether short type is supported. Enabled by default for all but HLSL. 
(#165584) Add a new langopt NativeInt16Type to control support for 16 bit integers. Enable by default for all languages but HLSL. HLSL defines uint16_t and int16_t as a typedef of short. If -enable-16bit-types is not used, the typedefs don't exist so int16_t and uint16_t can't be used. However, short was still allowed. This change will produce an error 'unknown type name short' if -enable-16bit-types isn't used. Update failing tests. Add new test. Closes #81779 --- clang/include/clang/Basic/LangOptions.def | 1 + clang/include/clang/Driver/Options.td | 7 ++- clang/lib/Driver/ToolChains/Clang.cpp | 1 + clang/lib/Driver/ToolChains/HLSL.cpp | 9 ++++ clang/lib/Frontend/CompilerInvocation.cpp | 11 +++-- clang/lib/Frontend/InitPreprocessor.cpp | 2 +- clang/lib/Parse/ParseDecl.cpp | 7 +++ clang/test/AST/HLSL/packoffset.hlsl | 2 +- .../test/AST/HLSL/vk.spec-constant.usage.hlsl | 2 +- .../BasicFeatures/StructElementwiseCast.hlsl | 2 +- .../BasicFeatures/frem_modulo.hlsl | 4 +- .../test/CodeGenHLSL/HLSLControlFlowHint.hlsl | 4 +- .../CodeGenHLSL/Operators/logical-not.hlsl | 2 +- clang/test/CodeGenHLSL/basic_types.hlsl | 4 +- .../builtins/WaveActiveAllTrue.hlsl | 4 +- .../builtins/WaveActiveAnyTrue.hlsl | 4 +- .../CodeGenHLSL/builtins/WaveReadLaneAt.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/abs.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/acos.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/all.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/any.hlsl | 8 ++-- clang/test/CodeGenHLSL/builtins/asfloat.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/asin.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/asint.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/asint16.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/asuint.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/asuint16.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/atan.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/atan2.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/ceil.hlsl | 2 +- .../CodeGenHLSL/builtins/clamp-builtin.hlsl | 2 +- 
.../CodeGenHLSL/builtins/clamp-overloads.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/clamp.hlsl | 4 +- .../CodeGenHLSL/builtins/clip-builtin.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/clip.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/cos.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/cosh.hlsl | 2 +- .../test/CodeGenHLSL/builtins/countbits.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/cross.hlsl | 4 +- .../CodeGenHLSL/builtins/degrees-builtin.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/degrees.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/distance.hlsl | 4 +- .../CodeGenHLSL/builtins/dot-builtin.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/dot.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/dot2add.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/dst.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/exp.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/exp2.hlsl | 2 +- .../CodeGenHLSL/builtins/faceforward.hlsl | 4 +- .../CodeGenHLSL/builtins/firstbithigh.hlsl | 4 +- .../CodeGenHLSL/builtins/firstbitlow.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/floor.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/fmod.hlsl | 4 +- .../CodeGenHLSL/builtins/frac-builtin.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/frac.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/isinf.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/isnan.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/ldexp.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/length.hlsl | 4 +- .../CodeGenHLSL/builtins/lerp-builtin.hlsl | 2 +- .../CodeGenHLSL/builtins/lerp-overloads.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/lerp.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/lit.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/log.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/log10.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/log2.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/mad.hlsl | 4 +- .../CodeGenHLSL/builtins/max-overloads.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/max.hlsl | 2 +- .../CodeGenHLSL/builtins/min-overloads.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/min.hlsl | 2 +- 
.../builtins/normalize-builtin.hlsl | 2 +- .../test/CodeGenHLSL/builtins/normalize.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/pow.hlsl | 2 +- .../CodeGenHLSL/builtins/radians-builtin.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/radians.hlsl | 4 +- .../CodeGenHLSL/builtins/rcp-builtin.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/rcp.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/reflect.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/refract.hlsl | 4 +- .../CodeGenHLSL/builtins/reversebits.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/round.hlsl | 2 +- .../CodeGenHLSL/builtins/rsqrt-builtin.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/rsqrt.hlsl | 4 +- .../builtins/saturate-builtin.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/saturate.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/sign.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/sin.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/sinh.hlsl | 2 +- .../test/CodeGenHLSL/builtins/smoothstep.hlsl | 4 +- .../CodeGenHLSL/builtins/splitdouble.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/sqrt.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/step.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/tan.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/tanh.hlsl | 2 +- .../builtins/transpose-builtin.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/trunc.hlsl | 2 +- .../test/CodeGenHLSL/enable-16bit-types.hlsl | 2 +- clang/test/CodeGenHLSL/float3.hlsl | 2 +- clang/test/CodeGenHLSL/no_int_promotion.hlsl | 2 +- ...erOrderedStructuredBuffer-elementtype.hlsl | 2 +- .../StructuredBuffers-elementtype.hlsl | 16 +++---- .../resources/TypedBuffers-elementtype.hlsl | 8 ++-- clang/test/CodeGenHLSL/resources/cbuffer.hlsl | 2 +- .../vk-features/vk.spec-constant.hlsl | 2 +- clang/test/Driver/dxc_enable16bittypes.hlsl | 7 +++ .../enable_16bit_types_validation_spirv.hlsl | 8 ++-- .../Preprocessor/predefined-macros-hlsl.hlsl | 2 +- .../SemaHLSL/BuiltIns/AddUint64-errors.hlsl | 2 +- clang/test/SemaHLSL/BuiltIns/all-errors.hlsl | 2 +- clang/test/SemaHLSL/BuiltIns/any-errors.hlsl | 2 +- 
.../SemaHLSL/BuiltIns/asfloat-errors.hlsl | 2 +- .../test/SemaHLSL/BuiltIns/asint-errors.hlsl | 2 +- .../SemaHLSL/BuiltIns/asint16-errors.hlsl | 2 +- .../test/SemaHLSL/BuiltIns/asuint-errors.hlsl | 2 +- .../SemaHLSL/BuiltIns/asuint16-errors.hlsl | 2 +- .../SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl | 6 +-- .../test/SemaHLSL/BuiltIns/clamp-errors.hlsl | 2 +- clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl | 2 +- .../SemaHLSL/BuiltIns/countbits-errors.hlsl | 2 +- .../test/SemaHLSL/BuiltIns/cross-errors.hlsl | 2 +- .../SemaHLSL/BuiltIns/distance-errors.hlsl | 2 +- clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl | 2 +- .../SemaHLSL/BuiltIns/dot2add-errors.hlsl | 2 +- clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl | 6 +-- .../SemaHLSL/BuiltIns/faceforward-errors.hlsl | 2 +- .../BuiltIns/firstbithigh-errors.hlsl | 2 +- .../SemaHLSL/BuiltIns/firstbitlow-errors.hlsl | 2 +- clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl | 2 +- clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl | 2 +- .../BuiltIns/half-float-only-errors.hlsl | 44 +++++++++---------- .../BuiltIns/half-float-only-errors2.hlsl | 6 +-- .../test/SemaHLSL/BuiltIns/isinf-errors.hlsl | 2 +- .../test/SemaHLSL/BuiltIns/isnan-errors.hlsl | 2 +- .../test/SemaHLSL/BuiltIns/ldexp-errors.hlsl | 2 +- .../test/SemaHLSL/BuiltIns/length-errors.hlsl | 2 +- clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl | 2 +- clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl | 2 +- .../BuiltIns/matrix-basic_types-errors.hlsl | 2 +- .../SemaHLSL/BuiltIns/max-errors-16bit.hlsl | 6 +-- .../SemaHLSL/BuiltIns/min-errors-16bit.hlsl | 6 +-- .../SemaHLSL/BuiltIns/normalize-errors.hlsl | 2 +- .../SemaHLSL/BuiltIns/radians-errors.hlsl | 2 +- clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl | 2 +- .../SemaHLSL/BuiltIns/reflect-errors.hlsl | 2 +- .../SemaHLSL/BuiltIns/refract-errors.hlsl | 2 +- .../SemaHLSL/BuiltIns/reversebits-errors.hlsl | 2 +- .../test/SemaHLSL/BuiltIns/round-errors.hlsl | 2 +- .../test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl | 2 +- 
.../SemaHLSL/BuiltIns/saturate-errors.hlsl | 2 +- clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl | 2 +- .../SemaHLSL/BuiltIns/smoothstep-errors.hlsl | 2 +- .../SemaHLSL/BuiltIns/splitdouble-errors.hlsl | 2 +- clang/test/SemaHLSL/BuiltIns/step-errors.hlsl | 2 +- .../test/SemaHLSL/Operators/logical-not.hlsl | 2 +- .../SemaHLSL/Types/Arithmetic/half_size.hlsl | 4 +- clang/test/SemaHLSL/Types/short-errors.hlsl | 21 +++++++++ clang/test/SemaHLSL/Types/typedefs.hlsl | 4 +- .../SemaHLSL/VectorOverloadResolution.hlsl | 4 +- 159 files changed, 299 insertions(+), 241 deletions(-) create mode 100644 clang/test/Driver/dxc_enable16bittypes.hlsl create mode 100644 clang/test/SemaHLSL/Types/short-errors.hlsl diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 8d6b8a14740ce..d3cca82b4bdff 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -216,6 +216,7 @@ LANGOPT(OpenCLGenericAddressSpace, 1, 0, NotCompatible, "OpenCL generic keyword" LANGOPT(OpenCLPipes , 1, 0, NotCompatible, "OpenCL pipes language constructs and built-ins") LANGOPT(NativeHalfType , 1, 0, NotCompatible, "Native half type support") LANGOPT(NativeHalfArgsAndReturns, 1, 0, NotCompatible, "Native half args and returns") +LANGOPT(NativeInt16Type , 1, 1, NotCompatible, "Native int 16 type support") LANGOPT(CUDA , 1, 0, NotCompatible, "CUDA") LANGOPT(HIP , 1, 0, NotCompatible, "HIP") LANGOPT(OpenMP , 32, 0, NotCompatible, "OpenMP support and version of OpenMP (31, 40 or 45)") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 7f33f31eeea67..6e1c9425d8d75 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -8626,6 +8626,11 @@ def fobjc_subscripting_legacy_runtime : Flag<["-"], "fobjc-subscripting-legacy-r def vtordisp_mode_EQ : Joined<["-"], "vtordisp-mode=">, HelpText<"Control vtordisp placement on win32 targets">, 
MarshallingInfoInt, "1">; +def fnative_int16_type : Flag<["-"], "fnative-int16-type">, + HelpText<"Use 16 bit integer types">, + // This option is implied unless we are in HLSL lang mode + ImpliedByAnyOf<[!strconcat("!", hlsl.KeyPath)]>, + MarshallingInfoFlag>; def fnative_half_type: Flag<["-"], "fnative-half-type">, HelpText<"Use the native half type for __fp16 instead of promoting to float">, MarshallingInfoFlag>, @@ -9518,7 +9523,7 @@ def emit_pristine_llvm : DXCFlag<"emit-pristine-llvm">, HelpText<"Emit pristine LLVM IR from the frontend by not running any LLVM passes at all." "Same as -S + -emit-llvm + -disable-llvm-passes.">; def fcgl : DXCFlag<"fcgl">, Alias; -def enable_16bit_types : DXCFlag<"enable-16bit-types">, Alias, +def enable_16bit_types : DXCFlag<"enable-16bit-types">, HelpText<"Enable 16-bit types and disable min precision types." "Available in HLSL 2018 and shader model 6.2.">; def fdx_rootsignature_version : diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 4e8f63ea49480..d3ab6f1261ad6 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -3708,6 +3708,7 @@ static void RenderHLSLOptions(const ArgList &Args, ArgStringList &CmdArgs, options::OPT_emit_obj, options::OPT_disable_llvm_passes, options::OPT_fnative_half_type, + options::OPT_fnative_int16_type, options::OPT_hlsl_entrypoint, options::OPT_fdx_rootsignature_define, options::OPT_fdx_rootsignature_version, diff --git a/clang/lib/Driver/ToolChains/HLSL.cpp b/clang/lib/Driver/ToolChains/HLSL.cpp index 20a320ea233d4..8d3fba7137c7c 100644 --- a/clang/lib/Driver/ToolChains/HLSL.cpp +++ b/clang/lib/Driver/ToolChains/HLSL.cpp @@ -498,6 +498,15 @@ HLSLToolChain::TranslateArgs(const DerivedArgList &Args, StringRef BoundArch, continue; } + if (A->getOption().getID() == options::OPT_enable_16bit_types) { + // Translate -enable-16bit-types into -fnative-half-type and + // -fnative-int16-type + 
DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_fnative_half_type)); + DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_fnative_int16_type)); + A->claim(); + continue; + } + DAL->append(A); } diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index bd36eb4ecf9da..1951e7f747487 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4600,7 +4600,8 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, // Validate that if fnative-half-type is given, that // the language standard is at least hlsl2018, and that // the target shader model is at least 6.2. - if (Args.getLastArg(OPT_fnative_half_type)) { + if (Args.getLastArg(OPT_fnative_half_type) || + Args.getLastArg(OPT_fnative_int16_type)) { const LangStandard &Std = LangStandard::getLangStandardForKind(Opts.LangStd); if (!(Opts.LangStd >= LangStandard::lang_hlsl2018 && @@ -4614,12 +4615,16 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, Diags.Report(diag::err_drv_hlsl_bad_shader_unsupported) << VulkanEnv << T.getOSName() << T.str(); } - if (Args.getLastArg(OPT_fnative_half_type)) { + if (Args.getLastArg(OPT_fnative_half_type) || + Args.getLastArg(OPT_fnative_int16_type)) { + const char *Str = Args.getLastArg(OPT_fnative_half_type) + ? 
"-fnative-half-type" + : "-fnative-int16-type"; const LangStandard &Std = LangStandard::getLangStandardForKind(Opts.LangStd); if (!(Opts.LangStd >= LangStandard::lang_hlsl2018)) Diags.Report(diag::err_drv_hlsl_16bit_types_unsupported) - << "-fnative-half-type" << false << Std.getName(); + << Str << false << Std.getName(); } } else { llvm_unreachable("expected DXIL or SPIR-V target"); diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 47f1d5a6b636c..8602be1d8a173 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -399,7 +399,7 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI, Builder.defineMacro("__HLSL_202y", Twine((unsigned)LangOptions::HLSLLangStd::HLSL_202y)); - if (LangOpts.NativeHalfType) + if (LangOpts.NativeHalfType && LangOpts.NativeInt16Type) Builder.defineMacro("__HLSL_ENABLE_16_BIT", "1"); // Shader target information diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index e4b158e4a6248..7e4a164e34eda 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -4248,6 +4248,13 @@ void Parser::ParseDeclarationSpecifiers( // type-specifier case tok::kw_short: + if (!getLangOpts().NativeInt16Type) { + Diag(Tok, diag::err_unknown_typename) << Tok.getName(); + DS.SetTypeSpecError(); + DS.SetRangeEnd(Tok.getLocation()); + ConsumeToken(); + goto DoneWithDeclSpec; + } isInvalid = DS.SetTypeSpecWidth(TypeSpecifierWidth::Short, Loc, PrevSpec, DiagID, Policy); break; diff --git a/clang/test/AST/HLSL/packoffset.hlsl b/clang/test/AST/HLSL/packoffset.hlsl index 4d18a9ca631f1..05b927279e198 100644 --- a/clang/test/AST/HLSL/packoffset.hlsl +++ b/clang/test/AST/HLSL/packoffset.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.3-library -S -finclude-default-header -fnative-half-type -ast-dump -x hlsl %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.3-library 
-S -finclude-default-header -fnative-half-type -fnative-int16-type -ast-dump -x hlsl %s | FileCheck %s // CHECK: HLSLBufferDecl {{.*}} cbuffer A diff --git a/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl b/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl index 733c4e2ee5a36..5654974b26d2d 100644 --- a/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl +++ b/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute -x hlsl -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -fnative-int16-type -triple spirv-unknown-vulkan-compute -x hlsl -ast-dump -o - %s | FileCheck %s // CHECK: VarDecl {{.*}} bool_const 'const hlsl_private bool' static cinit // CHECK-NEXT: CallExpr {{.*}} 'bool' diff --git a/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl index 4e29994afd27e..bd9a62f4db359 100644 --- a/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl +++ b/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s struct S { int X; diff --git a/clang/test/CodeGenHLSL/BasicFeatures/frem_modulo.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/frem_modulo.hlsl index edc28c5c80b51..393efcc360d08 100644 --- a/clang/test/CodeGenHLSL/BasicFeatures/frem_modulo.hlsl +++ b/clang/test/CodeGenHLSL/BasicFeatures/frem_modulo.hlsl @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type 
-fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s half2 half_vec_mod_by_int(half2 p1) { diff --git a/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl b/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl index aa13b27581850..6737cd3ee78ba 100644 --- a/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl +++ b/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s // CHECK: define {{.*}} i32 {{.*}}test_branch{{.*}}(i32 {{.*}} [[VALD:%.*]]) // CHECK: [[PARAM:%.*]] = load i32, ptr [[VALD]].addr, align 4 diff --git a/clang/test/CodeGenHLSL/Operators/logical-not.hlsl b/clang/test/CodeGenHLSL/Operators/logical-not.hlsl index 0f9d0677d8610..d5130ab88ea64 100644 --- a/clang/test/CodeGenHLSL/Operators/logical-not.hlsl +++ b/clang/test/CodeGenHLSL/Operators/logical-not.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -disable-llvm-passes -emit-llvm -finclude-default-header -fnative-half-type -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -disable-llvm-passes -emit-llvm -finclude-default-header -fnative-half-type -fnative-int16-type 
-o - %s | FileCheck %s // CHECK-LABEL: case1 // CHECK: [[ToBool:%.*]] = icmp ne <2 x i32> {{.*}}, zeroinitializer diff --git a/clang/test/CodeGenHLSL/basic_types.hlsl b/clang/test/CodeGenHLSL/basic_types.hlsl index 37fb5195e9768..8836126934957 100644 --- a/clang/test/CodeGenHLSL/basic_types.hlsl +++ b/clang/test/CodeGenHLSL/basic_types.hlsl @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - -DNAMESPACED| FileCheck %s diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl index df530a9cee561..f499fc97f43fc 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \ +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \ // RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \ +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \ // RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl index 
87bb1dee01905..3655cdb443fa9 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \ +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \ // RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \ +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \ // RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV diff --git a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl index 8c787a42618ac..da6cbc40a79bb 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -triple \ +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \ // RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -triple \ +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \ // RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV diff --git a/clang/test/CodeGenHLSL/builtins/abs.hlsl b/clang/test/CodeGenHLSL/builtins/abs.hlsl index 6abe2f816c844..45cc907c0ada9 100644 --- a/clang/test/CodeGenHLSL/builtins/abs.hlsl +++ b/clang/test/CodeGenHLSL/builtins/abs.hlsl @@ -1,5 +1,5 @@ // RUN: 
%clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/builtins/acos.hlsl b/clang/test/CodeGenHLSL/builtins/acos.hlsl index 8152339a34e87..f710d1f738a48 100644 --- a/clang/test/CodeGenHLSL/builtins/acos.hlsl +++ b/clang/test/CodeGenHLSL/builtins/acos.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ diff --git a/clang/test/CodeGenHLSL/builtins/all.hlsl b/clang/test/CodeGenHLSL/builtins/all.hlsl index 391fad0ef33f5..bfa3b903d66a8 100644 --- a/clang/test/CodeGenHLSL/builtins/all.hlsl +++ b/clang/test/CodeGenHLSL/builtins/all.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="hidden spir_func noundef" -DTARGET=spv @@ -8,7 +8,7 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK \ // RUN: -DFNATTRS="hidden spir_func noundef" -DTARGET=spv // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type 
-fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="hidden noundef" -DTARGET=dx diff --git a/clang/test/CodeGenHLSL/builtins/any.hlsl b/clang/test/CodeGenHLSL/builtins/any.hlsl index e4837876e2693..fa2cd2698b392 100644 --- a/clang/test/CodeGenHLSL/builtins/any.hlsl +++ b/clang/test/CodeGenHLSL/builtins/any.hlsl @@ -1,19 +1,19 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="hidden spir_func noundef" -DTARGET=spv // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-int16-type -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK \ // RUN: -DFNATTRS="hidden spir_func noundef" -DTARGET=spv // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="hidden noundef" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-int16-type -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK \ // RUN: -DFNATTRS="hidden noundef" -DTARGET=dx diff --git a/clang/test/CodeGenHLSL/builtins/asfloat.hlsl b/clang/test/CodeGenHLSL/builtins/asfloat.hlsl index 59fc15fa60b1e..72802e8ef09be 100644 --- 
a/clang/test/CodeGenHLSL/builtins/asfloat.hlsl +++ b/clang/test/CodeGenHLSL/builtins/asfloat.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s // CHECK: define {{.*}}test_uint{{.*}}(i32 {{.*}} [[VAL:%.*]]){{.*}} // CHECK: bitcast i32 [[VAL]] to float diff --git a/clang/test/CodeGenHLSL/builtins/asin.hlsl b/clang/test/CodeGenHLSL/builtins/asin.hlsl index 16efbba79670e..ccf704834116c 100644 --- a/clang/test/CodeGenHLSL/builtins/asin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/asin.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ diff --git a/clang/test/CodeGenHLSL/builtins/asint.hlsl b/clang/test/CodeGenHLSL/builtins/asint.hlsl index e1d80df5015c9..587d2bdc657d8 100644 --- a/clang/test/CodeGenHLSL/builtins/asint.hlsl +++ b/clang/test/CodeGenHLSL/builtins/asint.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s // CHECK: define {{.*}}test_int{{.*}}(i32 {{.*}} [[VAL:%.*]]){{.*}} // CHECK-NOT: bitcast diff --git a/clang/test/CodeGenHLSL/builtins/asint16.hlsl b/clang/test/CodeGenHLSL/builtins/asint16.hlsl index 8a1513012fd99..fd2cb8d10ee6b 100644 --- 
a/clang/test/CodeGenHLSL/builtins/asint16.hlsl +++ b/clang/test/CodeGenHLSL/builtins/asint16.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s //CHECK-LABEL: define {{.*}}test_ints //CHECK-SAME: {{.*}}(i16 {{.*}} [[VAL:%.*]]){{.*}} diff --git a/clang/test/CodeGenHLSL/builtins/asuint.hlsl b/clang/test/CodeGenHLSL/builtins/asuint.hlsl index 252a434ccce0d..5fd1e62d66ddb 100644 --- a/clang/test/CodeGenHLSL/builtins/asuint.hlsl +++ b/clang/test/CodeGenHLSL/builtins/asuint.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s // CHECK: define {{.*}}test_uint{{.*}}(i32 {{.*}} [[VAL:%.*]]){{.*}} // CHECK-NOT: bitcast diff --git a/clang/test/CodeGenHLSL/builtins/asuint16.hlsl b/clang/test/CodeGenHLSL/builtins/asuint16.hlsl index 6d44377df2ffb..31e151e210d7e 100644 --- a/clang/test/CodeGenHLSL/builtins/asuint16.hlsl +++ b/clang/test/CodeGenHLSL/builtins/asuint16.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s //CHECK-LABEL: define {{.*}}test_ints //CHECK-SAME: {{.*}}(i16 {{.*}} [[VAL:%.*]]){{.*}} diff --git a/clang/test/CodeGenHLSL/builtins/atan.hlsl b/clang/test/CodeGenHLSL/builtins/atan.hlsl index 
437835a863703..91fe139ddf05b 100644 --- a/clang/test/CodeGenHLSL/builtins/atan.hlsl +++ b/clang/test/CodeGenHLSL/builtins/atan.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ diff --git a/clang/test/CodeGenHLSL/builtins/atan2.hlsl b/clang/test/CodeGenHLSL/builtins/atan2.hlsl index 6c93f57be6b3d..512b44a5780db 100644 --- a/clang/test/CodeGenHLSL/builtins/atan2.hlsl +++ b/clang/test/CodeGenHLSL/builtins/atan2.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ diff --git a/clang/test/CodeGenHLSL/builtins/ceil.hlsl b/clang/test/CodeGenHLSL/builtins/ceil.hlsl index 1a9c630b60e57..d87d56edd9443 100644 --- a/clang/test/CodeGenHLSL/builtins/ceil.hlsl +++ b/clang/test/CodeGenHLSL/builtins/ceil.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl index 
356836b40e9c0..56a2b090bdeaf 100644 --- a/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK-LABEL: builtin_clamp_half diff --git a/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl index eaedfb419c195..8044047c5ef40 100644 --- a/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)" // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ @@ -7,7 +7,7 @@ // RUN: -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)" // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple spirv-unknown-vulkan-compute %s \ -// RUN: -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)" // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple spirv-unknown-vulkan-compute %s \ diff --git a/clang/test/CodeGenHLSL/builtins/clamp.hlsl 
b/clang/test/CodeGenHLSL/builtins/clamp.hlsl index 58db4423799be..10570e9b6ddb4 100644 --- a/clang/test/CodeGenHLSL/builtins/clamp.hlsl +++ b/clang/test/CodeGenHLSL/builtins/clamp.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)" // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ @@ -7,7 +7,7 @@ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF \ // RUN: -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)" // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)" // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \ diff --git a/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl index aaeb2f026449b..0baf0db9bd0b6 100644 --- a/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK: define hidden void 
@{{.*}}builtin_clip_float{{.*}}(float {{.*}} [[P0:%.*]]) // CHECK: [[LOAD:%.*]] = load float, ptr [[P0]].addr, align 4 diff --git a/clang/test/CodeGenHLSL/builtins/clip.hlsl b/clang/test/CodeGenHLSL/builtins/clip.hlsl index e067828c38bf6..bb21f084deba5 100644 --- a/clang/test/CodeGenHLSL/builtins/clip.hlsl +++ b/clang/test/CodeGenHLSL/builtins/clip.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-pixel %s -fnative-half-type -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -finclude-default-header -triple spirv-vulkan-pixel %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefix=SPIRV +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-pixel %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -triple spirv-vulkan-pixel %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefix=SPIRV void test_scalar(float Buf) { diff --git a/clang/test/CodeGenHLSL/builtins/cos.hlsl b/clang/test/CodeGenHLSL/builtins/cos.hlsl index 79f9e1e6fbec2..1f8970096a349 100644 --- a/clang/test/CodeGenHLSL/builtins/cos.hlsl +++ b/clang/test/CodeGenHLSL/builtins/cos.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/builtins/cosh.hlsl b/clang/test/CodeGenHLSL/builtins/cosh.hlsl index 07c64206412db..80474d459fcbd 100644 --- a/clang/test/CodeGenHLSL/builtins/cosh.hlsl +++ b/clang/test/CodeGenHLSL/builtins/cosh.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl 
-triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ diff --git a/clang/test/CodeGenHLSL/builtins/countbits.hlsl b/clang/test/CodeGenHLSL/builtins/countbits.hlsl index 218d8dcd10f8d..87524ae58a0d6 100644 --- a/clang/test/CodeGenHLSL/builtins/countbits.hlsl +++ b/clang/test/CodeGenHLSL/builtins/countbits.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s #ifdef __HLSL_ENABLE_16_BIT diff --git a/clang/test/CodeGenHLSL/builtins/cross.hlsl b/clang/test/CodeGenHLSL/builtins/cross.hlsl index 873cb6db30425..e53b34bb9dc42 100644 --- a/clang/test/CodeGenHLSL/builtins/cross.hlsl +++ b/clang/test/CodeGenHLSL/builtins/cross.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx @@ -8,7 +8,7 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ // RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: 
--check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv diff --git a/clang/test/CodeGenHLSL/builtins/degrees-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/degrees-builtin.hlsl index 2e639f5577d20..3098ed242a492 100644 --- a/clang/test/CodeGenHLSL/builtins/degrees-builtin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/degrees-builtin.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK-LABEL: builtin_degrees_half diff --git a/clang/test/CodeGenHLSL/builtins/degrees.hlsl b/clang/test/CodeGenHLSL/builtins/degrees.hlsl index f0fb12855e5f6..645e44eba3d95 100644 --- a/clang/test/CodeGenHLSL/builtins/degrees.hlsl +++ b/clang/test/CodeGenHLSL/builtins/degrees.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx @@ -8,7 +8,7 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ // RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv diff --git 
a/clang/test/CodeGenHLSL/builtins/distance.hlsl b/clang/test/CodeGenHLSL/builtins/distance.hlsl index 0c24fbb9f1859..bf015415a7d2f 100644 --- a/clang/test/CodeGenHLSL/builtins/distance.hlsl +++ b/clang/test/CodeGenHLSL/builtins/distance.hlsl @@ -1,9 +1,9 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -O1 -o - | FileCheck %s // RUN: %clang_cc1 -finclude-default-header -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK // CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh( diff --git a/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl index 716704a1bfdad..cbbf38aba3504 100644 --- a/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK-LABEL: builtin_dot_half diff --git a/clang/test/CodeGenHLSL/builtins/dot.hlsl b/clang/test/CodeGenHLSL/builtins/dot.hlsl index c1fdb0740adc3..a496842281d6d 100644 --- a/clang/test/CodeGenHLSL/builtins/dot.hlsl +++ b/clang/test/CodeGenHLSL/builtins/dot.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s 
-fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,DXCHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ @@ -7,7 +7,7 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,DXCHECK,NO_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,SPVCHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ diff --git a/clang/test/CodeGenHLSL/builtins/dot2add.hlsl b/clang/test/CodeGenHLSL/builtins/dot2add.hlsl index e80ffba2bcfdb..3165c24f2a60e 100644 --- a/clang/test/CodeGenHLSL/builtins/dot2add.hlsl +++ b/clang/test/CodeGenHLSL/builtins/dot2add.hlsl @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \ +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \ // RUN: dxil-pc-shadermodel6.4-compute %s -emit-llvm -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \ +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \ // RUN: spirv-pc-vulkan-compute %s -emit-llvm -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV diff --git a/clang/test/CodeGenHLSL/builtins/dst.hlsl b/clang/test/CodeGenHLSL/builtins/dst.hlsl index a0840c66e5da9..d8292d31fba7c 100644 --- a/clang/test/CodeGenHLSL/builtins/dst.hlsl +++ b/clang/test/CodeGenHLSL/builtins/dst.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: 
%clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK-LABEL: define {{.*}} <4 x float> @{{[A-Za-z1-9_]+}}dst_impl{{[A-Za-z1-9_]*}}( diff --git a/clang/test/CodeGenHLSL/builtins/exp.hlsl b/clang/test/CodeGenHLSL/builtins/exp.hlsl index 5a8f60528a84c..d50ef021eecb8 100644 --- a/clang/test/CodeGenHLSL/builtins/exp.hlsl +++ b/clang/test/CodeGenHLSL/builtins/exp.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/builtins/exp2.hlsl b/clang/test/CodeGenHLSL/builtins/exp2.hlsl index a9bbcb0d9bff9..ed8cfcf47b04b 100644 --- a/clang/test/CodeGenHLSL/builtins/exp2.hlsl +++ b/clang/test/CodeGenHLSL/builtins/exp2.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/builtins/faceforward.hlsl b/clang/test/CodeGenHLSL/builtins/faceforward.hlsl index d2ece57aba4ae..70459d81685a1 100644 --- a/clang/test/CodeGenHLSL/builtins/faceforward.hlsl +++ b/clang/test/CodeGenHLSL/builtins/faceforward.hlsl @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -finclude-default-header -triple \ -// RUN: 
dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -o - | FileCheck %s // RUN: %clang_cc1 -finclude-default-header -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -o - | FileCheck %s --check-prefix=SPVCHECK // CHECK-LABEL: test_faceforward_half diff --git a/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl b/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl index a71b1878f8b55..368d652a6f779 100644 --- a/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl +++ b/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s -DTARGET=spv diff --git a/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl index 007db0c9c2ad5..a1d2a1b31c99a 100644 --- a/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl +++ b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s 
-fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s -DTARGET=spv diff --git a/clang/test/CodeGenHLSL/builtins/floor.hlsl b/clang/test/CodeGenHLSL/builtins/floor.hlsl index b3ff58317981a..4763e54f92b8e 100644 --- a/clang/test/CodeGenHLSL/builtins/floor.hlsl +++ b/clang/test/CodeGenHLSL/builtins/floor.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/builtins/fmod.hlsl b/clang/test/CodeGenHLSL/builtins/fmod.hlsl index cc91c0b67f6cc..527eb6020469e 100644 --- a/clang/test/CodeGenHLSL/builtins/fmod.hlsl +++ b/clang/test/CodeGenHLSL/builtins/fmod.hlsl @@ -3,7 +3,7 @@ // ---------- Native Half support test ----------- // // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -o - | FileCheck %s -DFNATTRS="hidden noundef nofpclass(nan inf)" \ // RUN: -DTYPE=half -DINT_TYPE=f16 --check-prefixes=DXCHECK @@ -21,7 +21,7 @@ // ---------- Native Half support test ----------- // // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -o - | FileCheck %s \ // RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTYPE=half diff --git 
a/clang/test/CodeGenHLSL/builtins/frac-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/frac-builtin.hlsl index 9f144f470ed90..e41fd856c6a42 100644 --- a/clang/test/CodeGenHLSL/builtins/frac-builtin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/frac-builtin.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK-LABEL: builtin_frac_half diff --git a/clang/test/CodeGenHLSL/builtins/frac.hlsl b/clang/test/CodeGenHLSL/builtins/frac.hlsl index d8397407cd013..3b61c482e86ad 100644 --- a/clang/test/CodeGenHLSL/builtins/frac.hlsl +++ b/clang/test/CodeGenHLSL/builtins/frac.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx @@ -8,7 +8,7 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ // RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv diff --git a/clang/test/CodeGenHLSL/builtins/isinf.hlsl b/clang/test/CodeGenHLSL/builtins/isinf.hlsl index dc869a64a65b7..b778df38bc9b6 100644 --- 
a/clang/test/CodeGenHLSL/builtins/isinf.hlsl +++ b/clang/test/CodeGenHLSL/builtins/isinf.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,DXCHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ @@ -7,7 +7,7 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,DXCHECK,NO_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,SPVCHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ diff --git a/clang/test/CodeGenHLSL/builtins/isnan.hlsl b/clang/test/CodeGenHLSL/builtins/isnan.hlsl index ce7dbe1aedea4..cca3863557229 100644 --- a/clang/test/CodeGenHLSL/builtins/isnan.hlsl +++ b/clang/test/CodeGenHLSL/builtins/isnan.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,DXCHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ @@ -7,7 +7,7 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,DXCHECK,NO_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: 
--check-prefixes=CHECK,SPVCHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ diff --git a/clang/test/CodeGenHLSL/builtins/ldexp.hlsl b/clang/test/CodeGenHLSL/builtins/ldexp.hlsl index f8fa06c39f2a1..012adc588ddfa 100644 --- a/clang/test/CodeGenHLSL/builtins/ldexp.hlsl +++ b/clang/test/CodeGenHLSL/builtins/ldexp.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) half @_ZN4hlsl8__detail10ldexp_implIDhEET_S2_S2_ // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn half @llvm.exp2.f16(half %{{.*}}) diff --git a/clang/test/CodeGenHLSL/builtins/length.hlsl b/clang/test/CodeGenHLSL/builtins/length.hlsl index 9297c35abfd16..95edb20dacdac 100644 --- a/clang/test/CodeGenHLSL/builtins/length.hlsl +++ b/clang/test/CodeGenHLSL/builtins/length.hlsl @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -O1 -o - | FileCheck %s // RUN: %clang_cc1 -finclude-default-header -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK diff --git a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl index 96bcf2b49bf25..cb8634c9234e3 100644 --- 
a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK-LABEL: builtin_lerp_half diff --git a/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl index 3b13e43873c77..20f758b18218e 100644 --- a/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx -// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s 
--check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // CHECK: define [[FNATTRS]] float @_Z16test_lerp_doubled( diff --git a/clang/test/CodeGenHLSL/builtins/lerp.hlsl b/clang/test/CodeGenHLSL/builtins/lerp.hlsl index d7a7113de4878..02cf14c0e1772 100644 --- a/clang/test/CodeGenHLSL/builtins/lerp.hlsl +++ b/clang/test/CodeGenHLSL/builtins/lerp.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx @@ -8,7 +8,7 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ // RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv diff --git a/clang/test/CodeGenHLSL/builtins/lit.hlsl b/clang/test/CodeGenHLSL/builtins/lit.hlsl index 44b3e96ef88bf..c0b109a75906b 100644 --- a/clang/test/CodeGenHLSL/builtins/lit.hlsl +++ b/clang/test/CodeGenHLSL/builtins/lit.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl 
-triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s // CHECK-LABEL: test_lit_half // CHECK: %cmp.i = fcmp reassoc nnan ninf nsz arcp afn olt half %{{.*}}, 0xH0000 diff --git a/clang/test/CodeGenHLSL/builtins/log.hlsl b/clang/test/CodeGenHLSL/builtins/log.hlsl index 0136c1a052ed4..20e62120b64a6 100644 --- a/clang/test/CodeGenHLSL/builtins/log.hlsl +++ b/clang/test/CodeGenHLSL/builtins/log.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/builtins/log10.hlsl b/clang/test/CodeGenHLSL/builtins/log10.hlsl index 6a75444143b18..feeccf7cd7ab3 100644 --- a/clang/test/CodeGenHLSL/builtins/log10.hlsl +++ b/clang/test/CodeGenHLSL/builtins/log10.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/builtins/log2.hlsl b/clang/test/CodeGenHLSL/builtins/log2.hlsl index 84d73c1810890..a57fc44e09b70 100644 --- a/clang/test/CodeGenHLSL/builtins/log2.hlsl +++ b/clang/test/CodeGenHLSL/builtins/log2.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm 
-disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/builtins/mad.hlsl b/clang/test/CodeGenHLSL/builtins/mad.hlsl index e764e20748d58..1116c1419997d 100644 --- a/clang/test/CodeGenHLSL/builtins/mad.hlsl +++ b/clang/test/CodeGenHLSL/builtins/mad.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,DXIL_CHECK,DXIL_NATIVE_HALF,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ @@ -7,7 +7,7 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,DXIL_CHECK,NO_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF,SPIR_NATIVE_HALF,SPIR_CHECK // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ diff --git a/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl index cd7013ba75825..a5ef87a822dd5 100644 --- a/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -o 
- | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF diff --git a/clang/test/CodeGenHLSL/builtins/max.hlsl b/clang/test/CodeGenHLSL/builtins/max.hlsl index fab53a160c856..9c621e62b5336 100644 --- a/clang/test/CodeGenHLSL/builtins/max.hlsl +++ b/clang/test/CodeGenHLSL/builtins/max.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl index f81fa128ce9c7..c0e06b0d204b3 100644 --- a/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF diff --git a/clang/test/CodeGenHLSL/builtins/min.hlsl b/clang/test/CodeGenHLSL/builtins/min.hlsl index b3e8fedff9b1b..44d2063229cdb 100644 --- a/clang/test/CodeGenHLSL/builtins/min.hlsl +++ b/clang/test/CodeGenHLSL/builtins/min.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple 
dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/builtins/normalize-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/normalize-builtin.hlsl index 3db64604a1319..46bfb44c9b2a1 100644 --- a/clang/test/CodeGenHLSL/builtins/normalize-builtin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/normalize-builtin.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK-LABEL: builtin_normalize_half diff --git a/clang/test/CodeGenHLSL/builtins/normalize.hlsl b/clang/test/CodeGenHLSL/builtins/normalize.hlsl index 85937346ead65..bbea11a8b432f 100644 --- a/clang/test/CodeGenHLSL/builtins/normalize.hlsl +++ b/clang/test/CodeGenHLSL/builtins/normalize.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx @@ -8,7 +8,7 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ // RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s 
-fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv diff --git a/clang/test/CodeGenHLSL/builtins/pow.hlsl b/clang/test/CodeGenHLSL/builtins/pow.hlsl index fcde755e15fcc..b11ded8c1d173 100644 --- a/clang/test/CodeGenHLSL/builtins/pow.hlsl +++ b/clang/test/CodeGenHLSL/builtins/pow.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/builtins/radians-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/radians-builtin.hlsl index 0c86357d5ecad..1f7e19055ee6b 100644 --- a/clang/test/CodeGenHLSL/builtins/radians-builtin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/radians-builtin.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK-LABEL: builtin_radians_half diff --git a/clang/test/CodeGenHLSL/builtins/radians.hlsl b/clang/test/CodeGenHLSL/builtins/radians.hlsl index f281747fbf298..6521606a25c05 100644 --- a/clang/test/CodeGenHLSL/builtins/radians.hlsl +++ b/clang/test/CodeGenHLSL/builtins/radians.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: 
dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)" @@ -8,7 +8,7 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ // RUN: -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)" // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DTARGET=spv -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" diff --git a/clang/test/CodeGenHLSL/builtins/rcp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/rcp-builtin.hlsl index d81a49b8c6048..2cc38203bd060 100644 --- a/clang/test/CodeGenHLSL/builtins/rcp-builtin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/rcp-builtin.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK-LABEL: builtin_rcp_half diff --git a/clang/test/CodeGenHLSL/builtins/rcp.hlsl b/clang/test/CodeGenHLSL/builtins/rcp.hlsl index cdfaa3c5f1ee3..c9c47c737114d 100644 --- a/clang/test/CodeGenHLSL/builtins/rcp.hlsl +++ b/clang/test/CodeGenHLSL/builtins/rcp.hlsl @@ -1,12 +1,12 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm 
-disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,DXIL_CHECK,DXIL_NATIVE_HALF,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,DXIL_CHECK,NO_HALF,DXIL_NO_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF,SPIR_NATIVE_HALF,SPIR_CHECK // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ diff --git a/clang/test/CodeGenHLSL/builtins/reflect.hlsl b/clang/test/CodeGenHLSL/builtins/reflect.hlsl index 65fefd801ffed..feb5a5b2ea78f 100644 --- a/clang/test/CodeGenHLSL/builtins/reflect.hlsl +++ b/clang/test/CodeGenHLSL/builtins/reflect.hlsl @@ -1,9 +1,9 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -finclude-default-header -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -O1 -o - | FileCheck %s // RUN: %clang_cc1 -finclude-default-header -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK // CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh( diff --git a/clang/test/CodeGenHLSL/builtins/refract.hlsl b/clang/test/CodeGenHLSL/builtins/refract.hlsl index eda256451ee2b..ffeb2a78b2517 100644 --- a/clang/test/CodeGenHLSL/builtins/refract.hlsl +++ b/clang/test/CodeGenHLSL/builtins/refract.hlsl @@ -1,8 +1,8 @@ // RUN: %clang_cc1 
-finclude-default-header -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -o - | FileCheck %s // RUN: %clang_cc1 -finclude-default-header -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -o - | FileCheck %s --check-prefix=SPVCHECK // CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_refract_halfDhDhDh( diff --git a/clang/test/CodeGenHLSL/builtins/reversebits.hlsl b/clang/test/CodeGenHLSL/builtins/reversebits.hlsl index 91375c8f4eb8f..5fd8de9c95df8 100644 --- a/clang/test/CodeGenHLSL/builtins/reversebits.hlsl +++ b/clang/test/CodeGenHLSL/builtins/reversebits.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s #ifdef __HLSL_ENABLE_16_BIT diff --git a/clang/test/CodeGenHLSL/builtins/round.hlsl b/clang/test/CodeGenHLSL/builtins/round.hlsl index 755f2e86fb116..0d4afee6ba9a8 100644 --- a/clang/test/CodeGenHLSL/builtins/round.hlsl +++ b/clang/test/CodeGenHLSL/builtins/round.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/builtins/rsqrt-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/rsqrt-builtin.hlsl index 
43ad9d0d0b844..d45f8cbbb5cf1 100644 --- a/clang/test/CodeGenHLSL/builtins/rsqrt-builtin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/rsqrt-builtin.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK-LABEL: builtin_rsqrt_half diff --git a/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl index 9c398fd6f06cb..de2a222ae78d1 100644 --- a/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl +++ b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx @@ -8,7 +8,7 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ // RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv diff --git a/clang/test/CodeGenHLSL/builtins/saturate-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/saturate-builtin.hlsl index 7dbba72f3abb5..c407362c1c85f 100644 --- a/clang/test/CodeGenHLSL/builtins/saturate-builtin.hlsl +++ 
b/clang/test/CodeGenHLSL/builtins/saturate-builtin.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK-LABEL: builtin_saturate_half diff --git a/clang/test/CodeGenHLSL/builtins/saturate.hlsl b/clang/test/CodeGenHLSL/builtins/saturate.hlsl index 3304073d9b501..c583013d4b245 100644 --- a/clang/test/CodeGenHLSL/builtins/saturate.hlsl +++ b/clang/test/CodeGenHLSL/builtins/saturate.hlsl @@ -1,12 +1,12 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -Dtar=dx // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -Dtar=dx // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -Dtar=spv // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/builtins/sign.hlsl b/clang/test/CodeGenHLSL/builtins/sign.hlsl index cbdb929388934..ef8f7168b1002 100644 --- a/clang/test/CodeGenHLSL/builtins/sign.hlsl +++ b/clang/test/CodeGenHLSL/builtins/sign.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple 
\ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DTARGET=dx -DFNATTRS="hidden noundef" @@ -8,7 +8,7 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ // RUN: -DTARGET=dx -DFNATTRS="hidden noundef" // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DTARGET=spv -DFNATTRS="hidden spir_func noundef" diff --git a/clang/test/CodeGenHLSL/builtins/sin.hlsl b/clang/test/CodeGenHLSL/builtins/sin.hlsl index 9bbe97997aa33..5a900972c7ac9 100644 --- a/clang/test/CodeGenHLSL/builtins/sin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/sin.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/builtins/sinh.hlsl b/clang/test/CodeGenHLSL/builtins/sinh.hlsl index d55d60515418c..ab0f814ecd694 100644 --- a/clang/test/CodeGenHLSL/builtins/sinh.hlsl +++ b/clang/test/CodeGenHLSL/builtins/sinh.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | 
FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ diff --git a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl index bef64ce77d470..dcf9013045c07 100644 --- a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl +++ b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl @@ -1,9 +1,9 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -O1 -o - | FileCheck %s // RUN: %clang_cc1 -finclude-default-header -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK // CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh( diff --git a/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl b/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl index aeb2b79e90291..53f4f6aa2cb5f 100644 --- a/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl +++ b/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -emit-llvm -O0 -o - | FileCheck %s --check-prefix=SPIRV +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -fnative-int16-type 
-emit-llvm -O0 -o - | FileCheck %s --check-prefix=SPIRV diff --git a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl index 31839f6bc177d..ce77459c77c41 100644 --- a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl +++ b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/builtins/step.hlsl b/clang/test/CodeGenHLSL/builtins/step.hlsl index 6f6588a026a45..5061f8126d7e2 100644 --- a/clang/test/CodeGenHLSL/builtins/step.hlsl +++ b/clang/test/CodeGenHLSL/builtins/step.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx @@ -8,7 +8,7 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ // RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ // RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv diff --git a/clang/test/CodeGenHLSL/builtins/tan.hlsl 
b/clang/test/CodeGenHLSL/builtins/tan.hlsl index c8c948624a613..2a108bf97bd1f 100644 --- a/clang/test/CodeGenHLSL/builtins/tan.hlsl +++ b/clang/test/CodeGenHLSL/builtins/tan.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ diff --git a/clang/test/CodeGenHLSL/builtins/tanh.hlsl b/clang/test/CodeGenHLSL/builtins/tanh.hlsl index f947c7f53b110..91345caad84c9 100644 --- a/clang/test/CodeGenHLSL/builtins/tanh.hlsl +++ b/clang/test/CodeGenHLSL/builtins/tanh.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ diff --git a/clang/test/CodeGenHLSL/builtins/transpose-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/transpose-builtin.hlsl index 86aa7cd6985dd..ef282fc355b23 100644 --- a/clang/test/CodeGenHLSL/builtins/transpose-builtin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/transpose-builtin.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s // NOTE: This test is only to confirm we can do codgen with the matrix alias. 
diff --git a/clang/test/CodeGenHLSL/builtins/trunc.hlsl b/clang/test/CodeGenHLSL/builtins/trunc.hlsl index c1c6ee4119f0d..58cc78ed03596 100644 --- a/clang/test/CodeGenHLSL/builtins/trunc.hlsl +++ b/clang/test/CodeGenHLSL/builtins/trunc.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ -// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ diff --git a/clang/test/CodeGenHLSL/enable-16bit-types.hlsl b/clang/test/CodeGenHLSL/enable-16bit-types.hlsl index 690404c4fde24..9e92eb04ada5b 100644 --- a/clang/test/CodeGenHLSL/enable-16bit-types.hlsl +++ b/clang/test/CodeGenHLSL/enable-16bit-types.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.3-unknown-shadermodel6.3-library \ +// RUN: %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.3-unknown-shadermodel6.3-library \ // RUN: -finclude-default-header -emit-llvm -o - %s 2>&1 | FileCheck %s --check-prefix=FLAG // RUN: %clang_cc1 -std=hlsl202x -triple dxilv1.3-unknown-shadermodel6.3-library \ // RUN: -finclude-default-header -emit-llvm -o - %s 2>&1 | FileCheck %s --check-prefix=NOFLAG diff --git a/clang/test/CodeGenHLSL/float3.hlsl b/clang/test/CodeGenHLSL/float3.hlsl index 4f03464586bf0..4abd18713e718 100644 --- a/clang/test/CodeGenHLSL/float3.hlsl +++ b/clang/test/CodeGenHLSL/float3.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s // Make sure float3 is not 
changed into float4. diff --git a/clang/test/CodeGenHLSL/no_int_promotion.hlsl b/clang/test/CodeGenHLSL/no_int_promotion.hlsl index b4ffcb477f1ba..adea165c1c864 100644 --- a/clang/test/CodeGenHLSL/no_int_promotion.hlsl +++ b/clang/test/CodeGenHLSL/no_int_promotion.hlsl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -D__HLSL_ENABLE_16_BIT \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s // FIXME: add test for char/int8_t/uint8_t when these types are supported in HLSL. diff --git a/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl index c97ad4237000f..843f14474a23f 100644 --- a/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL struct MyStruct { float4 a; diff --git a/clang/test/CodeGenHLSL/resources/StructuredBuffers-elementtype.hlsl b/clang/test/CodeGenHLSL/resources/StructuredBuffers-elementtype.hlsl index 2b286bde88468..43f2e9cb7f333 100644 --- a/clang/test/CodeGenHLSL/resources/StructuredBuffers-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/resources/StructuredBuffers-elementtype.hlsl @@ -1,25 +1,25 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type 
-fnative-int16-type \ // RUN: -emit-llvm -o - -DRESOURCE=StructuredBuffer %s | FileCheck %s -DRESOURCE=StructuredBuffer -check-prefixes=DXIL-RO -// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type \ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -o - -DRESOURCE=StructuredBuffer %s | FileCheck %s -DRESOURCE=StructuredBuffer -check-prefixes=SPV-RO -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -o - -DRESOURCE=RWStructuredBuffer %s | FileCheck %s -DRESOURCE=RWStructuredBuffer -check-prefixes=DXIL-RW -// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type \ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -o - -DRESOURCE=RWStructuredBuffer %s | FileCheck %s -DRESOURCE=RWStructuredBuffer -check-prefixes=SPV-RW -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -o - -DRESOURCE=AppendStructuredBuffer %s | FileCheck %s -DRESOURCE=AppendStructuredBuffer -check-prefixes=DXIL-RW -// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type \ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -o - -DRESOURCE=AppendStructuredBuffer %s | FileCheck %s -DRESOURCE=AppendStructuredBuffer -check-prefixes=SPV-RW -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute 
-finclude-default-header -fnative-half-type \ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -o - -DRESOURCE=ConsumeStructuredBuffer %s | FileCheck %s -DRESOURCE=ConsumeStructuredBuffer -check-prefixes=DXIL-RW -// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type \ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -o - -DRESOURCE=ConsumeStructuredBuffer %s | FileCheck %s -DRESOURCE=ConsumeStructuredBuffer -check-prefixes=SPV-RW // DXIL-RO: %"class.hlsl::[[RESOURCE]]" = type { target("dx.RawBuffer", i16, 0, 0) } diff --git a/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl b/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl index d3dba8a69cc72..7d59bc5fed5ea 100644 --- a/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl @@ -1,13 +1,13 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -o - -DRESOURCE=Buffer %s | FileCheck %s -DRESOURCE=Buffer -DRW=0 -check-prefixes=DXIL -// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type \ +// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -o - -DRESOURCE=Buffer %s | FileCheck %s -DRESOURCE=Buffer -DRW=1 -check-prefixes=SPV-RO -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \ // RUN: 
-emit-llvm -o - -DRESOURCE=RWBuffer %s | FileCheck %s -DRESOURCE=RWBuffer -DRW=1 -check-prefixes=DXIL -// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type \ +// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -o - -DRESOURCE=RWBuffer %s | FileCheck %s -DRESOURCE=RWBuffer --DRW=2 -check-prefixes=SPV-RW // DXIL: %"class.hlsl::[[RESOURCE]]" = type { target("dx.TypedBuffer", i16, [[RW]], 0, 1) } diff --git a/clang/test/CodeGenHLSL/resources/cbuffer.hlsl b/clang/test/CodeGenHLSL/resources/cbuffer.hlsl index 8dcff5dad9d13..c8efe0d64c985 100644 --- a/clang/test/CodeGenHLSL/resources/cbuffer.hlsl +++ b/clang/test/CodeGenHLSL/resources/cbuffer.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-compute -fnative-half-type -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-compute -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s // CHECK: %__cblayout_CBScalars = type <{ float, double, half, i64, i32, i16, i32, i64 }> // CHECK: %__cblayout_CBVectors = type <{ <3 x float>, <3 x double>, <2 x half>, <3 x i64>, <4 x i32>, <3 x i16>, <3 x i64> }> diff --git a/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl b/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl index 15c54beb03d38..3f7c59916316d 100644 --- a/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl +++ b/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ +// RUN: spirv-unknown-vulkan-compute %s 
-fnative-int16-type -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s [[vk::constant_id(1)]] diff --git a/clang/test/Driver/dxc_enable16bittypes.hlsl b/clang/test/Driver/dxc_enable16bittypes.hlsl new file mode 100644 index 0000000000000..4cd1d2fd402b3 --- /dev/null +++ b/clang/test/Driver/dxc_enable16bittypes.hlsl @@ -0,0 +1,7 @@ +// RUN: %clang_dxc -enable-16bit-types -T lib_6_7 %s -### %s 2>&1 | FileCheck %s + +// Make sure enable-16bit-types flag translates into '-fnative-half-type' and '-fnative-int16-type' +// CHECK: "-fnative-half-type" +// CHECK-SAME: "-fnative-int16-type" + +// expected-no-diagnostics diff --git a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl index f37d00503fe57..6a507b0990df5 100644 --- a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl +++ b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl @@ -1,7 +1,9 @@ -// RUN: not %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2016 -fnative-half-type -emit-llvm -disable-llvm-passes -o - %s 2>&1 | FileCheck %s --check-prefix=SPIRV -// RUN: %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2021 -fnative-half-type -emit-llvm -disable-llvm-passes -o - %s 2>&1 | FileCheck %s --check-prefix=valid +// RUN: not %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2016 -fnative-half-type -emit-llvm -disable-llvm-passes -o - %s 2>&1 | FileCheck %s --check-prefix=SPIRV-HALF +// RUN: not %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2016 -fnative-int16-type -emit-llvm -disable-llvm-passes -o - %s 2>&1 | FileCheck %s
--check-prefix=SPIRV-INT +// RUN: %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2021 -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - %s 2>&1 | FileCheck %s --check-prefix=valid -// SPIRV: error: '-fnative-half-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016' +// SPIRV-HALF: error: '-fnative-half-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016' +// SPIRV-INT: error: '-fnative-int16-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016' // valid: "spirv-unknown-vulkan-library" // valid: define hidden spir_func void @{{.*main.*}}() #0 { diff --git a/clang/test/Preprocessor/predefined-macros-hlsl.hlsl b/clang/test/Preprocessor/predefined-macros-hlsl.hlsl index 26bda6b7be167..f10c79cc9c2d4 100644 --- a/clang/test/Preprocessor/predefined-macros-hlsl.hlsl +++ b/clang/test/Preprocessor/predefined-macros-hlsl.hlsl @@ -7,7 +7,7 @@ // RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-mesh | FileCheck -match-full-lines %s --check-prefixes=CHECK,MESH,NOHALF // RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-pixel | FileCheck -match-full-lines %s --check-prefixes=CHECK,PIXEL,NOHALF // RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-vertex | FileCheck -match-full-lines %s --check-prefixes=CHECK,VERTEX,NOHALF -// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.3-vertex -fnative-half-type | FileCheck -match-full-lines %s --check-prefixes=CHECK,VERTEX,HALF +// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.3-vertex -fnative-half-type -fnative-int16-type | FileCheck -match-full-lines %s --check-prefixes=CHECK,VERTEX,HALF // RUN: %clang_cc1 %s -E -dM -o - -triple spirv-unknown-vulkan-compute | FileCheck -match-full-lines %s --check-prefixes=CHECK,COMPUTE,NOHALF,SPIRV diff --git 
a/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl index b4ef0550bf88a..553db49231ae0 100644 --- a/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify uint2 test_too_few_arg() { return __builtin_hlsl_adduint64(); diff --git a/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl index 4afd799f8539e..5e00428de0c82 100644 --- a/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected bool test_too_few_arg() { return __builtin_hlsl_all(); diff --git a/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl index e42fd97b40219..6210c998d8e2d 100644 --- a/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected bool 
test_too_few_arg() { return __builtin_hlsl_any(); diff --git a/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl index f5f223943b4cd..9872f39ebcfba 100644 --- a/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify float4 test_float_too_many_arg(float p0, float p1) { diff --git a/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl index 815a0c35cb04c..52f2cd224a13c 100644 --- a/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify int4 test_asint_too_many_arg(float p0, float p1) { diff --git a/clang/test/SemaHLSL/BuiltIns/asint16-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asint16-errors.hlsl index fee1c2eb87b11..5f3d5c9772d84 100644 --- a/clang/test/SemaHLSL/BuiltIns/asint16-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/asint16-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -verify int16_t4 test_asint16_too_many_arg(uint16_t p0, uint16_t p1) diff --git a/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl index 9d0c206a3b3ad..3bb6cc0094926 100644 --- 
a/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify uint4 test_asuint_too_many_arg(float p0, float p1) { diff --git a/clang/test/SemaHLSL/BuiltIns/asuint16-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asuint16-errors.hlsl index 024fd406fe8ef..709d2067d9df2 100644 --- a/clang/test/SemaHLSL/BuiltIns/asuint16-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/asuint16-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -verify uint16_t test_asuint16_less_argument() { diff --git a/clang/test/SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl b/clang/test/SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl index 7a6341659493b..40910bc9108ed 100644 --- a/clang/test/SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl @@ -1,8 +1,8 @@ -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ // RUN: -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=half -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ // RUN: -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=int16_t -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple 
dxilv1.0-unknown-shadermodel6.0-compute \ +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ // RUN: -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=uint16_t // check we error on 16 bit type if shader model is too old diff --git a/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl index 93e37075773f5..bbe567b6d6ac1 100644 --- a/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note float2 test_no_second_arg(float2 p0) { return __builtin_hlsl_elementwise_clamp(p0); diff --git a/clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl index 2cb401601f7eb..f47468897312c 100644 --- a/clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify void test_arg_missing() { diff --git a/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl index 5704165e1a450..8949324ec69f6 100644 --- a/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type 
-emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected double test_int_builtin(double p0) { diff --git a/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl index 4f73dad79f21f..2c3e8d1560c87 100644 --- a/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -disable-llvm-passes -verify void test_too_few_arg() { diff --git a/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl index e7521c7251432..4ec1bcef2b6fc 100644 --- a/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify float test_no_second_arg(float2 p0) { return distance(p0); diff --git a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl index 606194692931f..f514a04eb9f49 100644 --- a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify 
-verify-ignore-unexpected=note +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note float test_no_second_arg(float2 p0) { return __builtin_hlsl_dot(p0); diff --git a/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl index 5933faeae2aac..84333ba08b9b8 100644 --- a/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify float test_too_few_arg() { return dot2add(); diff --git a/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl index 1435232cbfbc5..f0076ac4e5881 100644 --- a/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp2 -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp10 +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type 
-emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp2 +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp10 float test_too_few_arg() { return TEST_FUNC(); // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} diff --git a/clang/test/SemaHLSL/BuiltIns/faceforward-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/faceforward-errors.hlsl index 469d55995f966..01261a00295b1 100644 --- a/clang/test/SemaHLSL/BuiltIns/faceforward-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/faceforward-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify float test_double_inputs(double p0, double p1, double p2) { return faceforward(p0, p1, p2); diff --git a/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl index 8badaf0b99a20..f99e606fc6562 100644 --- a/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s 
-fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected int test_too_few_arg() { return firstbithigh(); diff --git a/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl index b12afe65a863e..37090796577fc 100644 --- a/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected int test_too_few_arg() { return firstbitlow(); diff --git a/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl index fc931139e523d..eceac9be8d7d1 100644 --- a/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify float test_no_second_arg(float2 p0) { return fmod(p0); diff --git a/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl index 1e277186f22c4..cdf2b61c45207 100644 --- a/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 
-finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify float test_too_few_arg() { return __builtin_hlsl_elementwise_frac(); diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl index bf044797c3acb..e9cc0ed338e3e 100644 --- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl @@ -1,25 +1,25 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_acos -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_asin -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_ceil -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cos -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cosh -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only 
-disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp2 -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp10 -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_floor -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log2 -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log10 -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sin -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sinh -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sqrt -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_roundeven -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tan -// RUN: %clang_cc1 
-finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tanh -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_trunc -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_degrees -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_radians +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_acos +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_asin +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_ceil +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cos +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify 
-DTEST_FUNC=__builtin_elementwise_cosh +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp2 +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp10 +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_floor +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log2 +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log10 +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sin +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sinh +// RUN: %clang_cc1 -finclude-default-header -triple 
dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sqrt +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_roundeven +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tan +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tanh +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_trunc +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_degrees +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_radians double test_double_builtin(double p0) { return TEST_FUNC(p0); diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl index c264617558261..9e10e1afa9385 100644 --- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify 
-DTEST_FUNC=__builtin_elementwise_atan2 -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_fmod -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_pow +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan2 +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_fmod +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_pow double test_double_builtin(double p0, double p1) { return TEST_FUNC(p0, p1); diff --git a/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl index 8d14df91f1409..a32bc9628a295 100644 --- a/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify bool test_too_few_arg() { return __builtin_hlsl_elementwise_isinf(); diff --git a/clang/test/SemaHLSL/BuiltIns/isnan-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/isnan-errors.hlsl index a6be28117af4f..625c415f91de2 100644 --- a/clang/test/SemaHLSL/BuiltIns/isnan-errors.hlsl +++ 
b/clang/test/SemaHLSL/BuiltIns/isnan-errors.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify bool test_too_few_arg() { return __builtin_hlsl_elementwise_isnan(); diff --git a/clang/test/SemaHLSL/BuiltIns/ldexp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/ldexp-errors.hlsl index 0bc7f7e40f5d3..fa146a5bce525 100644 --- a/clang/test/SemaHLSL/BuiltIns/ldexp-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/ldexp-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify float test_double_inputs(double p0, double p1) { return ldexp(p0, p1); diff --git a/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl index 3aaafa37e8e82..8c5c9a4a0d22a 100644 --- a/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify void test_too_few_arg() { diff --git a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl index 9592d8766dada..22720a4a37d02 100644 --- a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl 
@@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note float2 test_no_second_arg(float2 p0) { return __builtin_hlsl_lerp(p0); diff --git a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl index 5dec0f68d71fa..0e9dda7055f98 100644 --- a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note float2 test_no_second_arg(float2 p0) { return __builtin_hlsl_mad(p0); diff --git a/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl index 5ad1d6aefde38..6a6f14b52cb16 100644 --- a/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl @@ -3,7 +3,7 @@ uint64_t5x5 mat; // expected-error@-1 {{unknown type name 'uint64_t5x5'}} -// Note: this one only fails because -fnative-half-type is not set +// Note: this one only fails because -fnative-half-type -fnative-int16-type is not set uint16_t4x4 mat2; // expected-error@-1 {{unknown type name 'uint16_t4x4'}} diff --git a/clang/test/SemaHLSL/BuiltIns/max-errors-16bit.hlsl b/clang/test/SemaHLSL/BuiltIns/max-errors-16bit.hlsl index 32a4bbd42e5ec..71c14efa60b0f 100644 --- 
a/clang/test/SemaHLSL/BuiltIns/max-errors-16bit.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/max-errors-16bit.hlsl @@ -1,8 +1,8 @@ -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ // RUN: -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=half -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ // RUN: -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=int16_t -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ // RUN: -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=uint16_t // check we error on 16 bit type if shader model is too old diff --git a/clang/test/SemaHLSL/BuiltIns/min-errors-16bit.hlsl b/clang/test/SemaHLSL/BuiltIns/min-errors-16bit.hlsl index eb0066835689a..c2cffa18892d5 100644 --- a/clang/test/SemaHLSL/BuiltIns/min-errors-16bit.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/min-errors-16bit.hlsl @@ -1,8 +1,8 @@ -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ // RUN: -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=half -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ // RUN: 
-finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=int16_t -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \ // RUN: -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=uint16_t // check we error on 16 bit type if shader model is too old diff --git a/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl index 6ec32257a370f..377c2d5e41a73 100644 --- a/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -disable-llvm-passes -verify void test_too_few_arg() { diff --git a/clang/test/SemaHLSL/BuiltIns/radians-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/radians-errors.hlsl index dbffce226b54e..70e5b671bb3c9 100644 --- a/clang/test/SemaHLSL/BuiltIns/radians-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/radians-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify float test_too_few_arg() { return __builtin_hlsl_elementwise_radians(); diff --git a/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl index 01876240e82d0..79076b4815a6e 100644 --- a/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl @@ -1,4 +1,4 @@ 
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify float test_too_few_arg() { return __builtin_hlsl_elementwise_rcp(); diff --git a/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl index 9934a3e525d38..b0ae770f49f20 100644 --- a/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify float test_no_second_arg(float2 p0) { return reflect(p0); diff --git a/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl index 6cb3e56c20f0e..fce41a4a46d38 100644 --- a/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify float test_no_second_arg(float3 p0) { return refract(p0); diff --git a/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl index 1ac275beba642..5b33b89cb8eb8 100644 --- a/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl @@ 
-1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify double2 test_int_builtin(double2 p0) { diff --git a/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl index 45f86450b37c2..54feed35379d7 100644 --- a/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected float test_too_few_arg() { return __builtin_elementwise_round(); diff --git a/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl index 1f81c51207bc3..cedfcca35225e 100644 --- a/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify float test_too_few_arg() { return __builtin_hlsl_elementwise_rsqrt(); diff --git a/clang/test/SemaHLSL/BuiltIns/saturate-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/saturate-errors.hlsl index 721b28f86f950..4054ebfb3f649 100644 --- a/clang/test/SemaHLSL/BuiltIns/saturate-errors.hlsl +++ 
b/clang/test/SemaHLSL/BuiltIns/saturate-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -Werror +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -Werror float2 test_no_arg() { return saturate(); diff --git a/clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl index b67725fc77e52..68583d10d1287 100644 --- a/clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected bool test_too_few_arg() { return __builtin_hlsl_elementwise_sign(); diff --git a/clang/test/SemaHLSL/BuiltIns/smoothstep-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/smoothstep-errors.hlsl index e5e902d6ab887..4c6bea8f02411 100644 --- a/clang/test/SemaHLSL/BuiltIns/smoothstep-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/smoothstep-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify float test_no_second_arg(float2 p0) { return smoothstep(p0); diff --git a/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl 
b/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl index 312230a2d6aff..e2ef0f796c166 100644 --- a/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify void test_no_second_arg(double D) { __builtin_hlsl_elementwise_splitdouble(D); diff --git a/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl index 5346f217b83aa..993450a17ebfb 100644 --- a/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -disable-llvm-passes -verify void test_too_few_arg() { diff --git a/clang/test/SemaHLSL/Operators/logical-not.hlsl b/clang/test/SemaHLSL/Operators/logical-not.hlsl index d06ca3982be05..bd1a4be84c47f 100644 --- a/clang/test/SemaHLSL/Operators/logical-not.hlsl +++ b/clang/test/SemaHLSL/Operators/logical-not.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -ast-dump -ast-dump-filter=case | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -ast-dump -ast-dump-filter=case | FileCheck %s // CHECK-LABEL: FunctionDecl {{.*}} used case1 'uint32_t2 (uint32_t2)' // CHECK-NEXT: ParmVarDecl {{.*}} used b 'uint32_t2':'vector' diff --git a/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl b/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl index 
7de4674699930..22e18769a2fe4 100644 --- a/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl +++ b/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -verify %s -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -verify -fnative-half-type %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -verify -fnative-half-type -fnative-int16-type %s // RUN: %clang_cc1 -triple spirv-linux-vulkan-library -verify %s -// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -verify -fnative-half-type %s +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -verify -fnative-half-type -fnative-int16-type %s // expected-no-diagnostics #ifdef __HLSL_ENABLE_16_BIT diff --git a/clang/test/SemaHLSL/Types/short-errors.hlsl b/clang/test/SemaHLSL/Types/short-errors.hlsl new file mode 100644 index 0000000000000..93250084e300b --- /dev/null +++ b/clang/test/SemaHLSL/Types/short-errors.hlsl @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -finclude-default-header -triple spirv-pc-vulkan1.3-compute -verify %s +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.8-compute -verify %s + +void asArg(inout short F) { F + 1;} +// expected-error@-1 {{unknown type name short}} + +export void asVarDecl() { + short A = 1; + // expected-error@-1 {{unknown type name short}} + fn(A); +} + +export short asReturnType() { +// expected-error@-1 {{unknown type name short}} + return 1; +} + +struct S { + short A; + // expected-error@-1 {{unknown type name short}} +}; diff --git a/clang/test/SemaHLSL/Types/typedefs.hlsl b/clang/test/SemaHLSL/Types/typedefs.hlsl index fd72b1ae8a47f..c9c8ff2fc02de 100644 --- a/clang/test/SemaHLSL/Types/typedefs.hlsl +++ b/clang/test/SemaHLSL/Types/typedefs.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -finclude-default-header -verify -fnative-half-type %s -// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -finclude-default-header -verify 
-fnative-half-type %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -finclude-default-header -verify -fnative-half-type -fnative-int16-type %s +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -finclude-default-header -verify -fnative-half-type -fnative-int16-type %s // expected-no-diagnostics #define SizeCheck(Ty, SizeInBits) \ diff --git a/clang/test/SemaHLSL/VectorOverloadResolution.hlsl b/clang/test/SemaHLSL/VectorOverloadResolution.hlsl index b320abdd81182..756dcb4034e4e 100644 --- a/clang/test/SemaHLSL/VectorOverloadResolution.hlsl +++ b/clang/test/SemaHLSL/VectorOverloadResolution.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.6-library -S -fnative-half-type -finclude-default-header -o - -ast-dump %s | FileCheck %s -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECKIR +// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.6-library -S -fnative-half-type -fnative-int16-type -finclude-default-header -o - -ast-dump %s | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECKIR void Fn(double2 D); void Fn(half2 H); From feb2549cbf01feb12f1017df79a1a01cd7951231 Mon Sep 17 00:00:00 2001 From: gbMattN Date: Fri, 31 Oct 2025 16:51:55 +0000 Subject: [PATCH 365/539] [TySan] Add option to outline instrumentation (#120582) Added a command line option to use function calls rather than inline checks for TySan instrumentation. 
--- compiler-rt/lib/tysan/tysan.cpp | 121 ++- compiler-rt/lib/tysan/tysan_platform.h | 16 +- compiler-rt/test/tysan/basic.c | 8 +- .../test/tysan/simple_verify_outlines.c | 22 + .../test/tysan/struct-offset-outline.c | 32 + .../Instrumentation/TypeSanitizer.cpp | 143 +++- .../TypeSanitizer/basic_outlined.ll | 68 ++ .../TypeSanitizer/basic_verify_outlined.ll | 736 ++++++++++++++++++ .../TypeSanitizer/globals_outlined.ll | 24 + 9 files changed, 1136 insertions(+), 34 deletions(-) create mode 100644 compiler-rt/test/tysan/simple_verify_outlines.c create mode 100644 compiler-rt/test/tysan/struct-offset-outline.c create mode 100644 llvm/test/Instrumentation/TypeSanitizer/basic_outlined.ll create mode 100644 llvm/test/Instrumentation/TypeSanitizer/basic_verify_outlined.ll create mode 100644 llvm/test/Instrumentation/TypeSanitizer/globals_outlined.ll diff --git a/compiler-rt/lib/tysan/tysan.cpp b/compiler-rt/lib/tysan/tysan.cpp index 4fa8166986d76..1c67adeba0fc5 100644 --- a/compiler-rt/lib/tysan/tysan.cpp +++ b/compiler-rt/lib/tysan/tysan.cpp @@ -22,6 +22,7 @@ #include "tysan/tysan.h" +#include #include using namespace __sanitizer; @@ -254,10 +255,68 @@ static void reportError(void *Addr, int Size, tysan_type_descriptor *TD, } } +ALWAYS_INLINE +static void SetShadowType(tysan_type_descriptor *td, + tysan_type_descriptor **shadowData, + uint64_t AccessSize) { + *shadowData = td; + uint64_t shadowDataInt = (uint64_t)shadowData; + + for (uint64_t i = 1; i < AccessSize; ++i) { + int64_t dataOffset = i << PtrShift(); + int64_t *badShadowData = (int64_t *)(shadowDataInt + dataOffset); + int64_t badTD = int64_t(i) * -1; + *badShadowData = badTD; + } +} + +ALWAYS_INLINE +static bool GetNotAllBadTD(uint64_t ShadowDataInt, uint64_t AccessSize) { + bool notAllBadTD = false; + for (uint64_t i = 1; i < AccessSize; ++i) { + int64_t **unkShadowData = (int64_t **)(ShadowDataInt + (i << PtrShift())); + int64_t *ILdTD = *unkShadowData; + notAllBadTD = notAllBadTD || (ILdTD != nullptr); + 
} + return notAllBadTD; +} + +ALWAYS_INLINE +static bool GetNotAllUnkTD(uint64_t ShadowDataInt, uint64_t AccessSize) { + bool notAllBadTD = false; + for (uint64_t i = 1; i < AccessSize; ++i) { + int64_t *badShadowData = (int64_t *)(ShadowDataInt + (i << PtrShift())); + int64_t ILdTD = *badShadowData; + notAllBadTD = notAllBadTD || (ILdTD >= 0); + } + return notAllBadTD; +} + extern "C" SANITIZER_INTERFACE_ATTRIBUTE void -__tysan_check(void *addr, int size, tysan_type_descriptor *td, int flags) { - GET_CALLER_PC_BP_SP; +__tysan_instrument_mem_inst(char *dest, char *src, uint64_t size, + bool needsMemMove) { + tysan_type_descriptor **destShadowDataPtr = shadow_for(dest); + + if (!src) { + internal_memset((char *)destShadowDataPtr, 0, size << PtrShift()); + return; + } + + uint64_t srcInt = (uint64_t)src; + uint64_t srcShadowInt = ((srcInt & AppMask()) << PtrShift()) + ShadowAddr(); + uint64_t *srcShadow = (uint64_t *)srcShadowInt; + if (needsMemMove) { + internal_memmove((char *)destShadowDataPtr, srcShadow, size << PtrShift()); + } else { + internal_memcpy((char *)destShadowDataPtr, srcShadow, size << PtrShift()); + } +} + +ALWAYS_INLINE +static void __tysan_check_internal(void *addr, int size, + tysan_type_descriptor *td, int flags, + uptr pc, uptr bp, uptr sp) { bool IsRead = flags & 1; bool IsWrite = flags & 2; const char *AccessStr; @@ -300,6 +359,64 @@ __tysan_check(void *addr, int size, tysan_type_descriptor *td, int flags) { } } +extern "C" SANITIZER_INTERFACE_ATTRIBUTE void +__tysan_check(void *addr, int size, tysan_type_descriptor *td, int flags) { + GET_CALLER_PC_BP_SP; + __tysan_check_internal(addr, size, td, flags, pc, bp, sp); +} + +extern "C" SANITIZER_INTERFACE_ATTRIBUTE void +__tysan_instrument_with_shadow_update(void *ptr, tysan_type_descriptor *td, + bool sanitizeFunction, + uint64_t accessSize, int flags) { + tysan_type_descriptor **shadowData = shadow_for(ptr); + tysan_type_descriptor *loadedTD = *shadowData; + bool shadowIsNull = loadedTD == 
nullptr; + + // TODO, sanitizeFunction is known at compile time, so maybe this is split + // into two different functions + if (sanitizeFunction) { + + if (td != loadedTD) { + + // We now know that the types did not match (we're on the slow path). If + // the type is unknown, then set it. + if (shadowIsNull) { + // We're about to set the type. Make sure that all bytes in the value + // are also of unknown type. + bool isAllUnknownTD = GetNotAllUnkTD((uint64_t)shadowData, accessSize); + if (isAllUnknownTD) { + GET_CALLER_PC_BP_SP; + __tysan_check_internal(ptr, accessSize, td, flags, pc, bp, sp); + } + SetShadowType(td, shadowData, accessSize); + } else { + GET_CALLER_PC_BP_SP; + __tysan_check_internal(ptr, accessSize, td, flags, pc, bp, sp); + } + } else { + // We appear to have the right type. Make sure that all other bytes in + // the type are still marked as interior bytes. If not, call the runtime. + bool isNotAllBadTD = GetNotAllBadTD((uint64_t)shadowData, accessSize); + if (isNotAllBadTD) { + GET_CALLER_PC_BP_SP; + __tysan_check_internal(ptr, accessSize, td, flags, pc, bp, sp); + } + } + } else if (shadowIsNull) { + SetShadowType(td, shadowData, accessSize); + } +} + +extern "C" SANITIZER_INTERFACE_ATTRIBUTE void +__tysan_set_shadow_type(void *ptr, tysan_type_descriptor *td, + uint64_t accessSize) { + // In the mode where writes always set the type, for a write (which does + // not also read), we just set the type. 
+ tysan_type_descriptor **shadow = shadow_for(ptr); + SetShadowType(td, shadow, accessSize); +} + Flags __tysan::flags_data; SANITIZER_INTERFACE_ATTRIBUTE uptr __tysan_shadow_memory_address; diff --git a/compiler-rt/lib/tysan/tysan_platform.h b/compiler-rt/lib/tysan/tysan_platform.h index f01392885d939..19f77f0cace6b 100644 --- a/compiler-rt/lib/tysan/tysan_platform.h +++ b/compiler-rt/lib/tysan/tysan_platform.h @@ -21,24 +21,28 @@ struct Mapping { static const uptr kShadowAddr = 0x010000000000ull; static const uptr kAppAddr = 0x550000000000ull; static const uptr kAppMemMsk = ~0x780000000000ull; + static const uptr kPtrShift = 3; }; #elif defined(__aarch64__) struct Mapping39 { static const uptr kShadowAddr = 0x0800000000ull; static const uptr kAppAddr = 0x5500000000ull; static const uptr kAppMemMsk = ~0x7800000000ull; + static const uptr kPtrShift = 3; }; struct Mapping42 { static const uptr kShadowAddr = 0x10000000000ull; static const uptr kAppAddr = 0x2aa00000000ull; static const uptr kAppMemMsk = ~0x3c000000000ull; + static const uptr kPtrShift = 3; }; struct Mapping48 { static const uptr kShadowAddr = 0x0002000000000ull; static const uptr kAppAddr = 0x0aaaa00000000ull; static const uptr kAppMemMsk = ~0x0fff800000000ull; + static const uptr kPtrShift = 3; }; #define TYSAN_RUNTIME_VMA 1 #else @@ -49,7 +53,12 @@ struct Mapping48 { extern int vmaSize; #endif -enum MappingType { MAPPING_SHADOW_ADDR, MAPPING_APP_ADDR, MAPPING_APP_MASK }; +enum MappingType { + MAPPING_SHADOW_ADDR, + MAPPING_APP_ADDR, + MAPPING_APP_MASK, + MAPPING_PTR_SHIFT +}; template uptr MappingImpl(void) { switch (Type) { @@ -59,6 +68,8 @@ template uptr MappingImpl(void) { return Mapping::kAppAddr; case MAPPING_APP_MASK: return Mapping::kAppMemMsk; + case MAPPING_PTR_SHIFT: + return Mapping::kPtrShift; } } @@ -88,6 +99,9 @@ uptr AppAddr() { return MappingArchImpl(); } ALWAYS_INLINE uptr AppMask() { return MappingArchImpl(); } +ALWAYS_INLINE +uptr PtrShift() { return MappingArchImpl(); } + } // 
namespace __tysan #endif diff --git a/compiler-rt/test/tysan/basic.c b/compiler-rt/test/tysan/basic.c index 8e66e1a721383..28b94c425757e 100644 --- a/compiler-rt/test/tysan/basic.c +++ b/compiler-rt/test/tysan/basic.c @@ -1,6 +1,10 @@ -// RUN: %clang_tysan -O0 %s -o %t && %run %t 10 >%t.out.0 2>&1 +// RUN: %clang_tysan -O0 -mllvm -tysan-outline-instrumentation=false %s -o %t && %run %t 10 >%t.out.0 2>&1 // RUN: FileCheck %s < %t.out.0 -// RUN: %clang_tysan -O2 %s -o %t && %run %t 10 >%t.out 2>&1 +// RUN: %clang_tysan -O2 -mllvm -tysan-outline-instrumentation=false %s -o %t && %run %t 10 >%t.out 2>&1 +// RUN: FileCheck %s < %t.out +// RUN: %clang_tysan -O0 -mllvm -tysan-outline-instrumentation=true %s -o %t && %run %t 10 >%t.out.0 2>&1 +// RUN: FileCheck %s < %t.out.0 +// RUN: %clang_tysan -O2 -mllvm -tysan-outline-instrumentation=true %s -o %t && %run %t 10 >%t.out 2>&1 // RUN: FileCheck %s < %t.out #include diff --git a/compiler-rt/test/tysan/simple_verify_outlines.c b/compiler-rt/test/tysan/simple_verify_outlines.c new file mode 100644 index 0000000000000..0d0730edb0b99 --- /dev/null +++ b/compiler-rt/test/tysan/simple_verify_outlines.c @@ -0,0 +1,22 @@ +// RUN: %clang_tysan -mllvm -tysan-outline-instrumentation=true -mllvm -tysan-verify-outlined-instrumentation=true %s -o %t && %run %t >%t.out.0 2>&1 +// RUN: FileCheck %s < %t.out.0 + +#include + +void printInt(int *i) { printf("%d\n", *i); } + +int main() { + + float value = 5.0f; + printInt((int *)&value); + + return 0; +} + +// CHECK: ERROR: TypeSanitizer: type-aliasing-violation +// CHECK-NEXT: READ of size 4 at {{.*}} with type int accesses an existing object of type float +// CHECK-NEXT: {{#0 0x.* in printInt}} +// CHECK-EMPTY: +// CHECK-NEXT: ERROR: TypeSanitizer: type-aliasing-violation +// CHECK-NEXT: READ of size 4 at {{.*}} with type int accesses an existing object of type float +// CHECK-NEXT: {{#0 0x.* in printInt}} diff --git a/compiler-rt/test/tysan/struct-offset-outline.c 
b/compiler-rt/test/tysan/struct-offset-outline.c new file mode 100644 index 0000000000000..c84eb2762f669 --- /dev/null +++ b/compiler-rt/test/tysan/struct-offset-outline.c @@ -0,0 +1,32 @@ +// RUN: %clang_tysan -mllvm -tysan-outline-instrumentation=true -O0 %s -o %t && %run %t >%t.out 2>&1 +// RUN: FileCheck %s < %t.out +// RUN: %clang_tysan -mllvm -tysan-outline-instrumentation=true -mllvm -tysan-verify-outlined-instrumentation=true -O0 %s -o %t && %run %t >%t.out 2>&1 +// RUN: FileCheck %s --check-prefixes='CHECK,CHECK-VERIFY' < %t.out + +#include +#include + +struct X { + int i; + int j; +}; + +int foo(struct X *p, struct X *q) { + q->j = 1; + p->i = 0; + // CHECK: ERROR: TypeSanitizer: type-aliasing-violation + // CHECK-NEXT: WRITE of size 4 at {{.*}} with type int (in X at offset 0) accesses an existing object of type int (in X at offset 4) + // CHECK-NEXT: {{#0 0x.* in foo .*struct-offset-outline.c:}}[[@LINE-3]] + // CHECK-VERIFY-EMPTY: + // CHECK-VERIFY-NEXT: ERROR: TypeSanitizer: type-aliasing-violation + // CHECK-VERIFY-NEXT: WRITE of size 4 at {{.*}} with type int (in X at offset 0) accesses an existing object of type int (in X at offset 4) + // CHECK-VERIFY-NEXT: {{#0 0x.* in foo .*struct-offset-outline.c:}}[[@LINE-7]] + return q->j; +} + +int main() { + unsigned char *p = malloc(3 * sizeof(int)); + printf("%i\n", foo((struct X *)(p + sizeof(int)), (struct X *)p)); +} + +// CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp index 78d4a57ecea87..87eba5f2c5242 100644 --- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp @@ -58,6 +58,18 @@ static cl::opt cl::desc("Writes always set the type"), cl::Hidden, cl::init(false)); +static cl::opt ClOutlineInstrumentation( + "tysan-outline-instrumentation", + cl::desc("Uses function calls for all TySan instrumentation, 
reducing " + "ELF size"), + cl::Hidden, cl::init(false)); + +static cl::opt ClVerifyOutlinedInstrumentation( + "tysan-verify-outlined-instrumentation", + cl::desc("Check types twice with both inlined instrumentation and " + "function calls. This verifies that they behave the same."), + cl::Hidden, cl::init(false)); + STATISTIC(NumInstrumentedAccesses, "Number of instrumented accesses"); namespace { @@ -105,12 +117,16 @@ struct TypeSanitizer { Regex AnonNameRegex; Type *IntptrTy; uint64_t PtrShift; - IntegerType *OrdTy; + IntegerType *OrdTy, *U64Ty; /// Callbacks to run-time library are computed in initializeCallbacks. FunctionCallee TysanCheck; FunctionCallee TysanCtorFunction; + FunctionCallee TysanIntrumentMemInst; + FunctionCallee TysanInstrumentWithShadowUpdate; + FunctionCallee TysanSetShadowType; + /// Callback to set types for gloabls. Function *TysanGlobalsSetTypeFunction; }; @@ -130,6 +146,8 @@ TypeSanitizer::TypeSanitizer(Module &M) void TypeSanitizer::initializeCallbacks(Module &M) { IRBuilder<> IRB(M.getContext()); OrdTy = IRB.getInt32Ty(); + U64Ty = IRB.getInt64Ty(); + Type *BoolType = IRB.getInt1Ty(); AttributeList Attr; Attr = Attr.addFnAttribute(M.getContext(), Attribute::NoUnwind); @@ -144,6 +162,30 @@ void TypeSanitizer::initializeCallbacks(Module &M) { TysanCtorFunction = M.getOrInsertFunction(kTysanModuleCtorName, Attr, IRB.getVoidTy()); + + TysanIntrumentMemInst = M.getOrInsertFunction( + "__tysan_instrument_mem_inst", Attr, IRB.getVoidTy(), + IRB.getPtrTy(), // Pointer of data to be written to + IRB.getPtrTy(), // Pointer of data to write + U64Ty, // Size of the data in bytes + BoolType // Do we need to call memmove + ); + + TysanInstrumentWithShadowUpdate = M.getOrInsertFunction( + "__tysan_instrument_with_shadow_update", Attr, IRB.getVoidTy(), + IRB.getPtrTy(), // Pointer to data to be read + IRB.getPtrTy(), // Pointer to type descriptor + BoolType, // Do we need to type check this + U64Ty, // Size of data we access in bytes + OrdTy // Flags 
+ ); + + TysanSetShadowType = M.getOrInsertFunction( + "__tysan_set_shadow_type", Attr, IRB.getVoidTy(), + IRB.getPtrTy(), // Pointer of data to be written to + IRB.getPtrTy(), // Pointer to the new type descriptor + U64Ty // Size of data we access in bytes + ); } void TypeSanitizer::instrumentGlobals(Module &M) { @@ -587,6 +629,29 @@ bool TypeSanitizer::instrumentWithShadowUpdate( Value *TD = IRB.CreateBitCast(TDGV, IRB.getPtrTy()); + if (ClOutlineInstrumentation) { + if (!ForceSetType && (!ClWritesAlwaysSetType || IsRead)) { + // We need to check the type here. If the type is unknown, then the read + // sets the type. If the type is known, then it is checked. If the type + // doesn't match, then we call the runtime type check (which may yet + // determine that the mismatch is okay). + + Constant *Flags = + ConstantInt::get(OrdTy, (int)IsRead | (((int)IsWrite) << 1)); + + IRB.CreateCall(TysanInstrumentWithShadowUpdate, + {Ptr, TD, + SanitizeFunction ? IRB.getTrue() : IRB.getFalse(), + IRB.getInt64(AccessSize), Flags}); + } else if (ForceSetType || IsWrite) { + // In the mode where writes always set the type, for a write (which does + // not also read), we just set the type. 
+ IRB.CreateCall(TysanSetShadowType, {Ptr, TD, IRB.getInt64(AccessSize)}); + } + + return true; + } + Value *ShadowDataInt = convertToShadowDataInt(IRB, Ptr, IntptrTy, PtrShift, ShadowBase, AppMemMask); Type *Int8PtrPtrTy = PointerType::get(IRB.getContext(), 0); @@ -838,37 +903,47 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, } } - if (!ShadowBase) - ShadowBase = getShadowBase(*F); - if (!AppMemMask) - AppMemMask = getAppMemMask(*F); + if (ClOutlineInstrumentation) { + if (!Src) + Src = ConstantPointerNull::get(IRB.getPtrTy()); - Value *ShadowDataInt = IRB.CreateAdd( - IRB.CreateShl( - IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask), - PtrShift), - ShadowBase); - Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy()); - - if (!Src) { - IRB.CreateMemSet(ShadowData, IRB.getInt8(0), IRB.CreateShl(Size, PtrShift), - Align(1ull << PtrShift)); + IRB.CreateCall( + TysanIntrumentMemInst, + {Dest, Src, Size, NeedsMemMove ? IRB.getTrue() : IRB.getFalse()}); return true; - } - - Value *SrcShadowDataInt = IRB.CreateAdd( - IRB.CreateShl( - IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask), - PtrShift), - ShadowBase); - Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy()); - - if (NeedsMemMove) { - IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData, - Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); } else { - IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData, - Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + if (!ShadowBase) + ShadowBase = getShadowBase(*F); + if (!AppMemMask) + AppMemMask = getAppMemMask(*F); + + Value *ShadowDataInt = IRB.CreateAdd( + IRB.CreateShl( + IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask), + PtrShift), + ShadowBase); + Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy()); + + if (!Src) { + IRB.CreateMemSet(ShadowData, IRB.getInt8(0), + IRB.CreateShl(Size, PtrShift), 
Align(1ull << PtrShift)); + return true; + } + + Value *SrcShadowDataInt = IRB.CreateAdd( + IRB.CreateShl( + IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask), + PtrShift), + ShadowBase); + Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy()); + + if (NeedsMemMove) { + IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData, + Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + } else { + IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData, + Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + } } return true; @@ -890,6 +965,16 @@ PreservedAnalyses TypeSanitizerPass::run(Module &M, for (Function &F : M) { const TargetLibraryInfo &TLI = FAM.getResult(F); TySan.sanitizeFunction(F, TLI); + if (ClVerifyOutlinedInstrumentation && ClOutlineInstrumentation) { + // Outlined instrumentation is a new option, and so this exists to + // verify there is no difference in behaviour between the options. + // If the outlined instrumentation triggers a verification failure + // when the original inlined instrumentation does not, or vice versa, + // then there is a discrepency which should be investigated. + ClOutlineInstrumentation = false; + TySan.sanitizeFunction(F, TLI); + ClOutlineInstrumentation = true; + } } return PreservedAnalyses::none(); diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic_outlined.ll b/llvm/test/Instrumentation/TypeSanitizer/basic_outlined.ll new file mode 100644 index 0000000000000..1d118560f7580 --- /dev/null +++ b/llvm/test/Instrumentation/TypeSanitizer/basic_outlined.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals +; Test basic type sanitizer instrumentation. +; +; RUN: opt -passes='tysan' -tysan-outline-instrumentation -S %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +;. 
+; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }] +; CHECK: @__tysan_v1_Simple_20C_2b_2b_20TBAA = linkonce_odr constant { i64, i64, [16 x i8] } { i64 2, i64 0, [16 x i8] c"Simple C++ TBAA\00" }, comdat +; CHECK: @__tysan_v1_omnipotent_20char = linkonce_odr constant { i64, i64, ptr, i64, [16 x i8] } { i64 2, i64 1, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, i64 0, [16 x i8] c"omnipotent char\00" }, comdat +; CHECK: @__tysan_v1_int = linkonce_odr constant { i64, i64, ptr, i64, [4 x i8] } { i64 2, i64 1, ptr @__tysan_v1_omnipotent_20char, i64 0, [4 x i8] c"int\00" }, comdat +; CHECK: @__tysan_v1_int_o_0 = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1_int, ptr @__tysan_v1_int, i64 0 }, comdat +; CHECK: @__tysan_shadow_memory_address = external global i64 +; CHECK: @__tysan_app_memory_mask = external global i64 +; CHECK: @__tysan_v1___ZTS1x = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 2, ptr @__tysan_v1_int, i64 0, ptr @__tysan_v1_int, i64 4, [7 x i8] c"_ZTS1x\00" }, comdat +; CHECK: @__tysan_v1___ZTS1v = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 3, ptr @__tysan_v1_int, i64 8, ptr @__tysan_v1_int, i64 12, ptr @__tysan_v1___ZTS1x, i64 16, [7 x i8] c"_ZTS1v\00" }, comdat +; CHECK: @__tysan_v1___ZTS1v_o_12 = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1_int, i64 12 }, comdat +; CHECK: @llvm.used = appending global [8 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_int_o_0, ptr @__tysan_v1___ZTS1x, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1___ZTS1v_o_12], section "llvm.metadata" +;. 
+define i32 @test_load(ptr %a) sanitize_type { +; CHECK-LABEL: @test_load( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: call void @__tysan_instrument_with_shadow_update(ptr [[A:%.*]], ptr @__tysan_v1_int_o_0, i1 true, i64 4, i32 1) +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: ret i32 [[TMP1]] +; +entry: + %tmp1 = load i32, ptr %a, align 4, !tbaa !3 + ret i32 %tmp1 +} + +define void @test_store(ptr %a) sanitize_type { +; CHECK-LABEL: @test_store( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: call void @__tysan_instrument_with_shadow_update(ptr [[A:%.*]], ptr @__tysan_v1___ZTS1v_o_12, i1 true, i64 4, i32 2) +; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: ret void +; + +entry: + store i32 42, ptr %a, align 4, !tbaa !6 + ret void +} + +!0 = !{!"Simple C++ TBAA"} +!1 = !{!"omnipotent char", !0, i64 0} +!2 = !{!"int", !1, i64 0} +!3 = !{!2, !2, i64 0} +!4 = !{!"_ZTS1x", !2, i64 0, !2, i64 4} +!5 = !{!"_ZTS1v", !2, i64 8, !2, i64 12, !4, i64 16} +!6 = !{!5, !2, i64 12} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { sanitize_type } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } +;. +; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C++ TBAA"} +; CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META1]], i64 12} +; CHECK: [[META5]] = !{!"_ZTS1v", [[META1]], i64 8, [[META1]], i64 12, [[META6:![0-9]+]], i64 16} +; CHECK: [[META6]] = !{!"_ZTS1x", [[META1]], i64 0, [[META1]], i64 4} +;. 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic_verify_outlined.ll b/llvm/test/Instrumentation/TypeSanitizer/basic_verify_outlined.ll new file mode 100644 index 0000000000000..187a41ea8a825 --- /dev/null +++ b/llvm/test/Instrumentation/TypeSanitizer/basic_verify_outlined.ll @@ -0,0 +1,736 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals +; Test basic type sanitizer instrumentation. +; +; RUN: opt -passes='tysan' -S -tysan-outline-instrumentation -tysan-verify-outlined-instrumentation -S %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +;. +; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }] +; CHECK: @__tysan_v1_Simple_20C_2b_2b_20TBAA = linkonce_odr constant { i64, i64, [16 x i8] } { i64 2, i64 0, [16 x i8] c"Simple C++ TBAA\00" }, comdat +; CHECK: @__tysan_v1_omnipotent_20char = linkonce_odr constant { i64, i64, ptr, i64, [16 x i8] } { i64 2, i64 1, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, i64 0, [16 x i8] c"omnipotent char\00" }, comdat +; CHECK: @__tysan_v1_int = linkonce_odr constant { i64, i64, ptr, i64, [4 x i8] } { i64 2, i64 1, ptr @__tysan_v1_omnipotent_20char, i64 0, [4 x i8] c"int\00" }, comdat +; CHECK: @__tysan_v1_int_o_0 = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1_int, ptr @__tysan_v1_int, i64 0 }, comdat +; CHECK: @__tysan_shadow_memory_address = external global i64 +; CHECK: @__tysan_app_memory_mask = external global i64 +; CHECK: @__tysan_v1___ZTS1x = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 2, ptr @__tysan_v1_int, i64 0, ptr @__tysan_v1_int, i64 4, [7 x i8] c"_ZTS1x\00" }, comdat +; CHECK: @__tysan_v1___ZTS1v = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 3, ptr @__tysan_v1_int, i64 8, ptr @__tysan_v1_int, i64 12, ptr @__tysan_v1___ZTS1x, i64 16, [7 x i8] 
c"_ZTS1v\00" }, comdat +; CHECK: @__tysan_v1___ZTS1v_o_12 = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1_int, i64 12 }, comdat +; CHECK: @llvm.used = appending global [8 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_int_o_0, ptr @__tysan_v1___ZTS1x, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1___ZTS1v_o_12], section "llvm.metadata" +;. +define i32 @test_load(ptr %a) sanitize_type { +; CHECK-LABEL: @test_load( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[APP_MEM_MASK2:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[SHADOW_BASE1:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 ptrtoint (ptr @__tysan_app_memory_mask to i64), [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr +; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8 +; CHECK-NEXT: [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], null +; CHECK-NEXT: br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP42:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK: 0: +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null +; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]] +; CHECK: 2: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne ptr [[TMP5]], null +; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 +; CHECK-NEXT: 
[[TMP11:%.*]] = icmp ne ptr [[TMP10]], null +; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP7]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr +; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null +; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr +; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne ptr [[TMP20]], null +; CHECK-NEXT: [[TMP22:%.*]] = or i1 [[TMP17]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr +; CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8 +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne ptr [[TMP25]], null +; CHECK-NEXT: [[TMP27:%.*]] = or i1 [[TMP22]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr +; CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[TMP29]], align 8 +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne ptr [[TMP30]], null +; CHECK-NEXT: [[TMP32:%.*]] = or i1 [[TMP27]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr +; CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[TMP34]], align 8 +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne ptr [[TMP35]], null +; CHECK-NEXT: [[TMP37:%.*]] = or i1 [[TMP32]], [[TMP36]] +; CHECK-NEXT: br i1 [[TMP37]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF0]] +; CHECK: 38: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP39]] +; CHECK: 39: +; CHECK-NEXT: store ptr null, ptr [[SHADOW_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 
8 +; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_4_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[SHADOW_BYTE_4_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_5_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[SHADOW_BYTE_5_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_6_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[SHADOW_BYTE_6_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_7_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[SHADOW_BYTE_7_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR]], align 8 +; CHECK-NEXT: br label [[TMP41:%.*]] +; CHECK: 40: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP41]] +; CHECK: 41: +; CHECK-NEXT: br label [[TMP87:%.*]] +; CHECK: 42: +; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; 
CHECK-NEXT: [[TMP44:%.*]] = inttoptr i64 [[TMP43]] to ptr +; CHECK-NEXT: [[TMP45:%.*]] = load ptr, ptr [[TMP44]], align 8 +; CHECK-NEXT: [[TMP46:%.*]] = ptrtoint ptr [[TMP45]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = icmp sge i64 [[TMP46]], 0 +; CHECK-NEXT: [[TMP48:%.*]] = or i1 false, [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[TMP50:%.*]] = inttoptr i64 [[TMP49]] to ptr +; CHECK-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 8 +; CHECK-NEXT: [[TMP52:%.*]] = ptrtoint ptr [[TMP51]] to i64 +; CHECK-NEXT: [[TMP53:%.*]] = icmp sge i64 [[TMP52]], 0 +; CHECK-NEXT: [[TMP54:%.*]] = or i1 [[TMP48]], [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr +; CHECK-NEXT: [[TMP57:%.*]] = load ptr, ptr [[TMP56]], align 8 +; CHECK-NEXT: [[TMP58:%.*]] = ptrtoint ptr [[TMP57]] to i64 +; CHECK-NEXT: [[TMP59:%.*]] = icmp sge i64 [[TMP58]], 0 +; CHECK-NEXT: [[TMP60:%.*]] = or i1 [[TMP54]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[TMP62:%.*]] = inttoptr i64 [[TMP61]] to ptr +; CHECK-NEXT: [[TMP63:%.*]] = load ptr, ptr [[TMP62]], align 8 +; CHECK-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[TMP63]] to i64 +; CHECK-NEXT: [[TMP65:%.*]] = icmp sge i64 [[TMP64]], 0 +; CHECK-NEXT: [[TMP66:%.*]] = or i1 [[TMP60]], [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[TMP68:%.*]] = inttoptr i64 [[TMP67]] to ptr +; CHECK-NEXT: [[TMP69:%.*]] = load ptr, ptr [[TMP68]], align 8 +; CHECK-NEXT: [[TMP70:%.*]] = ptrtoint ptr [[TMP69]] to i64 +; CHECK-NEXT: [[TMP71:%.*]] = icmp sge i64 [[TMP70]], 0 +; CHECK-NEXT: [[TMP72:%.*]] = or i1 [[TMP66]], [[TMP71]] +; CHECK-NEXT: [[TMP73:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[TMP74:%.*]] = inttoptr i64 [[TMP73]] to ptr +; CHECK-NEXT: [[TMP75:%.*]] = load ptr, ptr [[TMP74]], align 8 +; CHECK-NEXT: [[TMP76:%.*]] = ptrtoint ptr [[TMP75]] to i64 +; 
CHECK-NEXT: [[TMP77:%.*]] = icmp sge i64 [[TMP76]], 0 +; CHECK-NEXT: [[TMP78:%.*]] = or i1 [[TMP72]], [[TMP77]] +; CHECK-NEXT: [[TMP79:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr +; CHECK-NEXT: [[TMP81:%.*]] = load ptr, ptr [[TMP80]], align 8 +; CHECK-NEXT: [[TMP82:%.*]] = ptrtoint ptr [[TMP81]] to i64 +; CHECK-NEXT: [[TMP83:%.*]] = icmp sge i64 [[TMP82]], 0 +; CHECK-NEXT: [[TMP84:%.*]] = or i1 [[TMP78]], [[TMP83]] +; CHECK-NEXT: br i1 [[TMP84]], label [[TMP85:%.*]], label [[TMP86:%.*]], !prof [[PROF0]] +; CHECK: 85: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP86]] +; CHECK: 86: +; CHECK-NEXT: br label [[TMP87]] +; CHECK: 87: +; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[APP_PTR_MASKED3:%.*]] = and i64 ptrtoint (ptr @__tysan_shadow_memory_address to i64), [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED4:%.*]] = shl i64 [[APP_PTR_MASKED3]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT5:%.*]] = add i64 [[APP_PTR_SHIFTED4]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR6:%.*]] = inttoptr i64 [[SHADOW_PTR_INT5]] to ptr +; CHECK-NEXT: [[SHADOW_DESC7:%.*]] = load ptr, ptr [[SHADOW_PTR6]], align 8 +; CHECK-NEXT: [[BAD_DESC8:%.*]] = icmp ne ptr [[SHADOW_DESC7]], null +; CHECK-NEXT: br i1 [[BAD_DESC8]], label [[TMP88:%.*]], label [[TMP130:%.*]], !prof [[PROF0]] +; CHECK: 88: +; CHECK-NEXT: [[TMP89:%.*]] = icmp eq ptr [[SHADOW_DESC7]], null +; CHECK-NEXT: br i1 [[TMP89]], label [[TMP90:%.*]], label [[TMP128:%.*]] +; CHECK: 90: +; CHECK-NEXT: [[TMP91:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[TMP92:%.*]] = inttoptr i64 [[TMP91]] to ptr +; CHECK-NEXT: [[TMP93:%.*]] = load ptr, ptr [[TMP92]], align 8 +; CHECK-NEXT: [[TMP94:%.*]] = icmp ne ptr [[TMP93]], null +; CHECK-NEXT: [[TMP95:%.*]] = or i1 false, [[TMP94]] +; CHECK-NEXT: [[TMP96:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; 
CHECK-NEXT: [[TMP97:%.*]] = inttoptr i64 [[TMP96]] to ptr +; CHECK-NEXT: [[TMP98:%.*]] = load ptr, ptr [[TMP97]], align 8 +; CHECK-NEXT: [[TMP99:%.*]] = icmp ne ptr [[TMP98]], null +; CHECK-NEXT: [[TMP100:%.*]] = or i1 [[TMP95]], [[TMP99]] +; CHECK-NEXT: [[TMP101:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[TMP102:%.*]] = inttoptr i64 [[TMP101]] to ptr +; CHECK-NEXT: [[TMP103:%.*]] = load ptr, ptr [[TMP102]], align 8 +; CHECK-NEXT: [[TMP104:%.*]] = icmp ne ptr [[TMP103]], null +; CHECK-NEXT: [[TMP105:%.*]] = or i1 [[TMP100]], [[TMP104]] +; CHECK-NEXT: [[TMP106:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[TMP107:%.*]] = inttoptr i64 [[TMP106]] to ptr +; CHECK-NEXT: [[TMP108:%.*]] = load ptr, ptr [[TMP107]], align 8 +; CHECK-NEXT: [[TMP109:%.*]] = icmp ne ptr [[TMP108]], null +; CHECK-NEXT: [[TMP110:%.*]] = or i1 [[TMP105]], [[TMP109]] +; CHECK-NEXT: [[TMP111:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[TMP112:%.*]] = inttoptr i64 [[TMP111]] to ptr +; CHECK-NEXT: [[TMP113:%.*]] = load ptr, ptr [[TMP112]], align 8 +; CHECK-NEXT: [[TMP114:%.*]] = icmp ne ptr [[TMP113]], null +; CHECK-NEXT: [[TMP115:%.*]] = or i1 [[TMP110]], [[TMP114]] +; CHECK-NEXT: [[TMP116:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[TMP117:%.*]] = inttoptr i64 [[TMP116]] to ptr +; CHECK-NEXT: [[TMP118:%.*]] = load ptr, ptr [[TMP117]], align 8 +; CHECK-NEXT: [[TMP119:%.*]] = icmp ne ptr [[TMP118]], null +; CHECK-NEXT: [[TMP120:%.*]] = or i1 [[TMP115]], [[TMP119]] +; CHECK-NEXT: [[TMP121:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[TMP122:%.*]] = inttoptr i64 [[TMP121]] to ptr +; CHECK-NEXT: [[TMP123:%.*]] = load ptr, ptr [[TMP122]], align 8 +; CHECK-NEXT: [[TMP124:%.*]] = icmp ne ptr [[TMP123]], null +; CHECK-NEXT: [[TMP125:%.*]] = or i1 [[TMP120]], [[TMP124]] +; CHECK-NEXT: br i1 [[TMP125]], label [[TMP126:%.*]], label [[TMP127:%.*]], !prof [[PROF0]] +; CHECK: 126: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, 
i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP127]] +; CHECK: 127: +; CHECK-NEXT: store ptr null, ptr [[SHADOW_PTR6]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET9:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_PTR10:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET9]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR10]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET11:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR12:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET11]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR12]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET13:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR14:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET13]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR14]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_4_OFFSET15:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[SHADOW_BYTE_4_PTR16:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET15]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR16]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_5_OFFSET17:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[SHADOW_BYTE_5_PTR18:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET17]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR18]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_6_OFFSET19:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[SHADOW_BYTE_6_PTR20:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET19]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR20]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_7_OFFSET21:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[SHADOW_BYTE_7_PTR22:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET21]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR22]], align 8 +; CHECK-NEXT: br label [[TMP129:%.*]] +; CHECK: 
128: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP129]] +; CHECK: 129: +; CHECK-NEXT: br label [[TMP175:%.*]] +; CHECK: 130: +; CHECK-NEXT: [[TMP131:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[TMP132:%.*]] = inttoptr i64 [[TMP131]] to ptr +; CHECK-NEXT: [[TMP133:%.*]] = load ptr, ptr [[TMP132]], align 8 +; CHECK-NEXT: [[TMP134:%.*]] = ptrtoint ptr [[TMP133]] to i64 +; CHECK-NEXT: [[TMP135:%.*]] = icmp sge i64 [[TMP134]], 0 +; CHECK-NEXT: [[TMP136:%.*]] = or i1 false, [[TMP135]] +; CHECK-NEXT: [[TMP137:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[TMP138:%.*]] = inttoptr i64 [[TMP137]] to ptr +; CHECK-NEXT: [[TMP139:%.*]] = load ptr, ptr [[TMP138]], align 8 +; CHECK-NEXT: [[TMP140:%.*]] = ptrtoint ptr [[TMP139]] to i64 +; CHECK-NEXT: [[TMP141:%.*]] = icmp sge i64 [[TMP140]], 0 +; CHECK-NEXT: [[TMP142:%.*]] = or i1 [[TMP136]], [[TMP141]] +; CHECK-NEXT: [[TMP143:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[TMP144:%.*]] = inttoptr i64 [[TMP143]] to ptr +; CHECK-NEXT: [[TMP145:%.*]] = load ptr, ptr [[TMP144]], align 8 +; CHECK-NEXT: [[TMP146:%.*]] = ptrtoint ptr [[TMP145]] to i64 +; CHECK-NEXT: [[TMP147:%.*]] = icmp sge i64 [[TMP146]], 0 +; CHECK-NEXT: [[TMP148:%.*]] = or i1 [[TMP142]], [[TMP147]] +; CHECK-NEXT: [[TMP149:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[TMP150:%.*]] = inttoptr i64 [[TMP149]] to ptr +; CHECK-NEXT: [[TMP151:%.*]] = load ptr, ptr [[TMP150]], align 8 +; CHECK-NEXT: [[TMP152:%.*]] = ptrtoint ptr [[TMP151]] to i64 +; CHECK-NEXT: [[TMP153:%.*]] = icmp sge i64 [[TMP152]], 0 +; CHECK-NEXT: [[TMP154:%.*]] = or i1 [[TMP148]], [[TMP153]] +; CHECK-NEXT: [[TMP155:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[TMP156:%.*]] = inttoptr i64 [[TMP155]] to ptr +; CHECK-NEXT: [[TMP157:%.*]] = load ptr, ptr [[TMP156]], align 8 +; CHECK-NEXT: [[TMP158:%.*]] = ptrtoint ptr [[TMP157]] to i64 +; CHECK-NEXT: [[TMP159:%.*]] = icmp 
sge i64 [[TMP158]], 0 +; CHECK-NEXT: [[TMP160:%.*]] = or i1 [[TMP154]], [[TMP159]] +; CHECK-NEXT: [[TMP161:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[TMP162:%.*]] = inttoptr i64 [[TMP161]] to ptr +; CHECK-NEXT: [[TMP163:%.*]] = load ptr, ptr [[TMP162]], align 8 +; CHECK-NEXT: [[TMP164:%.*]] = ptrtoint ptr [[TMP163]] to i64 +; CHECK-NEXT: [[TMP165:%.*]] = icmp sge i64 [[TMP164]], 0 +; CHECK-NEXT: [[TMP166:%.*]] = or i1 [[TMP160]], [[TMP165]] +; CHECK-NEXT: [[TMP167:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[TMP168:%.*]] = inttoptr i64 [[TMP167]] to ptr +; CHECK-NEXT: [[TMP169:%.*]] = load ptr, ptr [[TMP168]], align 8 +; CHECK-NEXT: [[TMP170:%.*]] = ptrtoint ptr [[TMP169]] to i64 +; CHECK-NEXT: [[TMP171:%.*]] = icmp sge i64 [[TMP170]], 0 +; CHECK-NEXT: [[TMP172:%.*]] = or i1 [[TMP166]], [[TMP171]] +; CHECK-NEXT: br i1 [[TMP172]], label [[TMP173:%.*]], label [[TMP174:%.*]], !prof [[PROF0]] +; CHECK: 173: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP174]] +; CHECK: 174: +; CHECK-NEXT: br label [[TMP175]] +; CHECK: 175: +; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: call void @__tysan_instrument_with_shadow_update(ptr [[A:%.*]], ptr @__tysan_v1_int_o_0, i1 true, i64 4, i32 1) +; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[APP_PTR_MASKED23:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED24:%.*]] = shl i64 [[APP_PTR_MASKED23]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT25:%.*]] = add i64 [[APP_PTR_SHIFTED24]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR26:%.*]] = inttoptr i64 [[SHADOW_PTR_INT25]] to ptr +; CHECK-NEXT: [[SHADOW_DESC27:%.*]] = load ptr, ptr [[SHADOW_PTR26]], align 8 +; CHECK-NEXT: [[BAD_DESC28:%.*]] = icmp ne ptr [[SHADOW_DESC27]], @__tysan_v1_int_o_0 +; CHECK-NEXT: br i1 [[BAD_DESC28]], label [[TMP176:%.*]], label [[TMP198:%.*]], 
!prof [[PROF0]] +; CHECK: 176: +; CHECK-NEXT: [[TMP177:%.*]] = icmp eq ptr [[SHADOW_DESC27]], null +; CHECK-NEXT: br i1 [[TMP177]], label [[TMP178:%.*]], label [[TMP196:%.*]] +; CHECK: 178: +; CHECK-NEXT: [[TMP179:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[TMP180:%.*]] = inttoptr i64 [[TMP179]] to ptr +; CHECK-NEXT: [[TMP181:%.*]] = load ptr, ptr [[TMP180]], align 8 +; CHECK-NEXT: [[TMP182:%.*]] = icmp ne ptr [[TMP181]], null +; CHECK-NEXT: [[TMP183:%.*]] = or i1 false, [[TMP182]] +; CHECK-NEXT: [[TMP184:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[TMP185:%.*]] = inttoptr i64 [[TMP184]] to ptr +; CHECK-NEXT: [[TMP186:%.*]] = load ptr, ptr [[TMP185]], align 8 +; CHECK-NEXT: [[TMP187:%.*]] = icmp ne ptr [[TMP186]], null +; CHECK-NEXT: [[TMP188:%.*]] = or i1 [[TMP183]], [[TMP187]] +; CHECK-NEXT: [[TMP189:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[TMP190:%.*]] = inttoptr i64 [[TMP189]] to ptr +; CHECK-NEXT: [[TMP191:%.*]] = load ptr, ptr [[TMP190]], align 8 +; CHECK-NEXT: [[TMP192:%.*]] = icmp ne ptr [[TMP191]], null +; CHECK-NEXT: [[TMP193:%.*]] = or i1 [[TMP188]], [[TMP192]] +; CHECK-NEXT: br i1 [[TMP193]], label [[TMP194:%.*]], label [[TMP195:%.*]], !prof [[PROF0]] +; CHECK: 194: +; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_int_o_0, i32 1) +; CHECK-NEXT: br label [[TMP195]] +; CHECK: 195: +; CHECK-NEXT: store ptr @__tysan_v1_int_o_0, ptr [[SHADOW_PTR26]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET29:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_PTR30:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET29]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR30]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET31:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR32:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET31]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR32]], align 8 +; CHECK-NEXT: 
[[SHADOW_BYTE_3_OFFSET33:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR34:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET33]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR34]], align 8 +; CHECK-NEXT: br label [[TMP197:%.*]] +; CHECK: 196: +; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_int_o_0, i32 1) +; CHECK-NEXT: br label [[TMP197]] +; CHECK: 197: +; CHECK-NEXT: br label [[TMP219:%.*]] +; CHECK: 198: +; CHECK-NEXT: [[TMP199:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[TMP200:%.*]] = inttoptr i64 [[TMP199]] to ptr +; CHECK-NEXT: [[TMP201:%.*]] = load ptr, ptr [[TMP200]], align 8 +; CHECK-NEXT: [[TMP202:%.*]] = ptrtoint ptr [[TMP201]] to i64 +; CHECK-NEXT: [[TMP203:%.*]] = icmp sge i64 [[TMP202]], 0 +; CHECK-NEXT: [[TMP204:%.*]] = or i1 false, [[TMP203]] +; CHECK-NEXT: [[TMP205:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[TMP206:%.*]] = inttoptr i64 [[TMP205]] to ptr +; CHECK-NEXT: [[TMP207:%.*]] = load ptr, ptr [[TMP206]], align 8 +; CHECK-NEXT: [[TMP208:%.*]] = ptrtoint ptr [[TMP207]] to i64 +; CHECK-NEXT: [[TMP209:%.*]] = icmp sge i64 [[TMP208]], 0 +; CHECK-NEXT: [[TMP210:%.*]] = or i1 [[TMP204]], [[TMP209]] +; CHECK-NEXT: [[TMP211:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[TMP212:%.*]] = inttoptr i64 [[TMP211]] to ptr +; CHECK-NEXT: [[TMP213:%.*]] = load ptr, ptr [[TMP212]], align 8 +; CHECK-NEXT: [[TMP214:%.*]] = ptrtoint ptr [[TMP213]] to i64 +; CHECK-NEXT: [[TMP215:%.*]] = icmp sge i64 [[TMP214]], 0 +; CHECK-NEXT: [[TMP216:%.*]] = or i1 [[TMP210]], [[TMP215]] +; CHECK-NEXT: br i1 [[TMP216]], label [[TMP217:%.*]], label [[TMP218:%.*]], !prof [[PROF0]] +; CHECK: 217: +; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_int_o_0, i32 1) +; CHECK-NEXT: br label [[TMP218]] +; CHECK: 218: +; CHECK-NEXT: br label [[TMP219]] +; CHECK: 219: +; CHECK-NEXT: [[WAA:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[TBAA1:![0-9]+]] 
+; CHECK-NEXT: ret i32 [[WAA]] +; +entry: + %WAA = load i32, ptr %a, align 4, !tbaa !3 + ret i32 %WAA +} + +define void @test_store(ptr %a) sanitize_type { +; CHECK-LABEL: @test_store( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[APP_MEM_MASK2:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[SHADOW_BASE1:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 ptrtoint (ptr @__tysan_app_memory_mask to i64), [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr +; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8 +; CHECK-NEXT: [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], null +; CHECK-NEXT: br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP42:%.*]], !prof [[PROF0]] +; CHECK: 0: +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null +; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]] +; CHECK: 2: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne ptr [[TMP5]], null +; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne ptr [[TMP10]], null +; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP7]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr +; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null +; CHECK-NEXT: [[TMP17:%.*]] = or i1 
[[TMP12]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr +; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne ptr [[TMP20]], null +; CHECK-NEXT: [[TMP22:%.*]] = or i1 [[TMP17]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr +; CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8 +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne ptr [[TMP25]], null +; CHECK-NEXT: [[TMP27:%.*]] = or i1 [[TMP22]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr +; CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[TMP29]], align 8 +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne ptr [[TMP30]], null +; CHECK-NEXT: [[TMP32:%.*]] = or i1 [[TMP27]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr +; CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[TMP34]], align 8 +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne ptr [[TMP35]], null +; CHECK-NEXT: [[TMP37:%.*]] = or i1 [[TMP32]], [[TMP36]] +; CHECK-NEXT: br i1 [[TMP37]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF0]] +; CHECK: 38: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP39]] +; CHECK: 39: +; CHECK-NEXT: store ptr null, ptr [[SHADOW_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 
to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_4_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[SHADOW_BYTE_4_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_5_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[SHADOW_BYTE_5_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_6_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[SHADOW_BYTE_6_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_7_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[SHADOW_BYTE_7_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR]], align 8 +; CHECK-NEXT: br label [[TMP41:%.*]] +; CHECK: 40: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP41]] +; CHECK: 41: +; CHECK-NEXT: br label [[TMP87:%.*]] +; CHECK: 42: +; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; CHECK-NEXT: [[TMP44:%.*]] = inttoptr i64 [[TMP43]] to ptr +; CHECK-NEXT: [[TMP45:%.*]] = load ptr, ptr [[TMP44]], align 8 +; CHECK-NEXT: [[TMP46:%.*]] = ptrtoint ptr [[TMP45]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = icmp sge i64 [[TMP46]], 0 +; CHECK-NEXT: [[TMP48:%.*]] = or i1 false, [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[TMP50:%.*]] = 
inttoptr i64 [[TMP49]] to ptr +; CHECK-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 8 +; CHECK-NEXT: [[TMP52:%.*]] = ptrtoint ptr [[TMP51]] to i64 +; CHECK-NEXT: [[TMP53:%.*]] = icmp sge i64 [[TMP52]], 0 +; CHECK-NEXT: [[TMP54:%.*]] = or i1 [[TMP48]], [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr +; CHECK-NEXT: [[TMP57:%.*]] = load ptr, ptr [[TMP56]], align 8 +; CHECK-NEXT: [[TMP58:%.*]] = ptrtoint ptr [[TMP57]] to i64 +; CHECK-NEXT: [[TMP59:%.*]] = icmp sge i64 [[TMP58]], 0 +; CHECK-NEXT: [[TMP60:%.*]] = or i1 [[TMP54]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[TMP62:%.*]] = inttoptr i64 [[TMP61]] to ptr +; CHECK-NEXT: [[TMP63:%.*]] = load ptr, ptr [[TMP62]], align 8 +; CHECK-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[TMP63]] to i64 +; CHECK-NEXT: [[TMP65:%.*]] = icmp sge i64 [[TMP64]], 0 +; CHECK-NEXT: [[TMP66:%.*]] = or i1 [[TMP60]], [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[TMP68:%.*]] = inttoptr i64 [[TMP67]] to ptr +; CHECK-NEXT: [[TMP69:%.*]] = load ptr, ptr [[TMP68]], align 8 +; CHECK-NEXT: [[TMP70:%.*]] = ptrtoint ptr [[TMP69]] to i64 +; CHECK-NEXT: [[TMP71:%.*]] = icmp sge i64 [[TMP70]], 0 +; CHECK-NEXT: [[TMP72:%.*]] = or i1 [[TMP66]], [[TMP71]] +; CHECK-NEXT: [[TMP73:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[TMP74:%.*]] = inttoptr i64 [[TMP73]] to ptr +; CHECK-NEXT: [[TMP75:%.*]] = load ptr, ptr [[TMP74]], align 8 +; CHECK-NEXT: [[TMP76:%.*]] = ptrtoint ptr [[TMP75]] to i64 +; CHECK-NEXT: [[TMP77:%.*]] = icmp sge i64 [[TMP76]], 0 +; CHECK-NEXT: [[TMP78:%.*]] = or i1 [[TMP72]], [[TMP77]] +; CHECK-NEXT: [[TMP79:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr +; CHECK-NEXT: [[TMP81:%.*]] = load ptr, ptr [[TMP80]], align 8 +; CHECK-NEXT: [[TMP82:%.*]] = ptrtoint ptr [[TMP81]] to i64 +; CHECK-NEXT: [[TMP83:%.*]] 
= icmp sge i64 [[TMP82]], 0 +; CHECK-NEXT: [[TMP84:%.*]] = or i1 [[TMP78]], [[TMP83]] +; CHECK-NEXT: br i1 [[TMP84]], label [[TMP85:%.*]], label [[TMP86:%.*]], !prof [[PROF0]] +; CHECK: 85: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP86]] +; CHECK: 86: +; CHECK-NEXT: br label [[TMP87]] +; CHECK: 87: +; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[APP_PTR_MASKED3:%.*]] = and i64 ptrtoint (ptr @__tysan_shadow_memory_address to i64), [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED4:%.*]] = shl i64 [[APP_PTR_MASKED3]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT5:%.*]] = add i64 [[APP_PTR_SHIFTED4]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR6:%.*]] = inttoptr i64 [[SHADOW_PTR_INT5]] to ptr +; CHECK-NEXT: [[SHADOW_DESC7:%.*]] = load ptr, ptr [[SHADOW_PTR6]], align 8 +; CHECK-NEXT: [[BAD_DESC8:%.*]] = icmp ne ptr [[SHADOW_DESC7]], null +; CHECK-NEXT: br i1 [[BAD_DESC8]], label [[TMP88:%.*]], label [[TMP130:%.*]], !prof [[PROF0]] +; CHECK: 88: +; CHECK-NEXT: [[TMP89:%.*]] = icmp eq ptr [[SHADOW_DESC7]], null +; CHECK-NEXT: br i1 [[TMP89]], label [[TMP90:%.*]], label [[TMP128:%.*]] +; CHECK: 90: +; CHECK-NEXT: [[TMP91:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[TMP92:%.*]] = inttoptr i64 [[TMP91]] to ptr +; CHECK-NEXT: [[TMP93:%.*]] = load ptr, ptr [[TMP92]], align 8 +; CHECK-NEXT: [[TMP94:%.*]] = icmp ne ptr [[TMP93]], null +; CHECK-NEXT: [[TMP95:%.*]] = or i1 false, [[TMP94]] +; CHECK-NEXT: [[TMP96:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[TMP97:%.*]] = inttoptr i64 [[TMP96]] to ptr +; CHECK-NEXT: [[TMP98:%.*]] = load ptr, ptr [[TMP97]], align 8 +; CHECK-NEXT: [[TMP99:%.*]] = icmp ne ptr [[TMP98]], null +; CHECK-NEXT: [[TMP100:%.*]] = or i1 [[TMP95]], [[TMP99]] +; CHECK-NEXT: [[TMP101:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[TMP102:%.*]] = inttoptr i64 [[TMP101]] to ptr +; CHECK-NEXT: [[TMP103:%.*]] = 
load ptr, ptr [[TMP102]], align 8 +; CHECK-NEXT: [[TMP104:%.*]] = icmp ne ptr [[TMP103]], null +; CHECK-NEXT: [[TMP105:%.*]] = or i1 [[TMP100]], [[TMP104]] +; CHECK-NEXT: [[TMP106:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[TMP107:%.*]] = inttoptr i64 [[TMP106]] to ptr +; CHECK-NEXT: [[TMP108:%.*]] = load ptr, ptr [[TMP107]], align 8 +; CHECK-NEXT: [[TMP109:%.*]] = icmp ne ptr [[TMP108]], null +; CHECK-NEXT: [[TMP110:%.*]] = or i1 [[TMP105]], [[TMP109]] +; CHECK-NEXT: [[TMP111:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[TMP112:%.*]] = inttoptr i64 [[TMP111]] to ptr +; CHECK-NEXT: [[TMP113:%.*]] = load ptr, ptr [[TMP112]], align 8 +; CHECK-NEXT: [[TMP114:%.*]] = icmp ne ptr [[TMP113]], null +; CHECK-NEXT: [[TMP115:%.*]] = or i1 [[TMP110]], [[TMP114]] +; CHECK-NEXT: [[TMP116:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[TMP117:%.*]] = inttoptr i64 [[TMP116]] to ptr +; CHECK-NEXT: [[TMP118:%.*]] = load ptr, ptr [[TMP117]], align 8 +; CHECK-NEXT: [[TMP119:%.*]] = icmp ne ptr [[TMP118]], null +; CHECK-NEXT: [[TMP120:%.*]] = or i1 [[TMP115]], [[TMP119]] +; CHECK-NEXT: [[TMP121:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[TMP122:%.*]] = inttoptr i64 [[TMP121]] to ptr +; CHECK-NEXT: [[TMP123:%.*]] = load ptr, ptr [[TMP122]], align 8 +; CHECK-NEXT: [[TMP124:%.*]] = icmp ne ptr [[TMP123]], null +; CHECK-NEXT: [[TMP125:%.*]] = or i1 [[TMP120]], [[TMP124]] +; CHECK-NEXT: br i1 [[TMP125]], label [[TMP126:%.*]], label [[TMP127:%.*]], !prof [[PROF0]] +; CHECK: 126: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP127]] +; CHECK: 127: +; CHECK-NEXT: store ptr null, ptr [[SHADOW_PTR6]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET9:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_PTR10:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET9]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR10]], align 8 +; 
CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET11:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR12:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET11]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR12]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET13:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR14:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET13]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR14]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_4_OFFSET15:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[SHADOW_BYTE_4_PTR16:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET15]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR16]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_5_OFFSET17:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[SHADOW_BYTE_5_PTR18:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET17]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR18]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_6_OFFSET19:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[SHADOW_BYTE_6_PTR20:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET19]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR20]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_7_OFFSET21:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[SHADOW_BYTE_7_PTR22:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET21]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR22]], align 8 +; CHECK-NEXT: br label [[TMP129:%.*]] +; CHECK: 128: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP129]] +; CHECK: 129: +; CHECK-NEXT: br label [[TMP175:%.*]] +; CHECK: 130: +; CHECK-NEXT: [[TMP131:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[TMP132:%.*]] = inttoptr i64 [[TMP131]] to ptr +; CHECK-NEXT: [[TMP133:%.*]] = load ptr, ptr [[TMP132]], 
align 8 +; CHECK-NEXT: [[TMP134:%.*]] = ptrtoint ptr [[TMP133]] to i64 +; CHECK-NEXT: [[TMP135:%.*]] = icmp sge i64 [[TMP134]], 0 +; CHECK-NEXT: [[TMP136:%.*]] = or i1 false, [[TMP135]] +; CHECK-NEXT: [[TMP137:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[TMP138:%.*]] = inttoptr i64 [[TMP137]] to ptr +; CHECK-NEXT: [[TMP139:%.*]] = load ptr, ptr [[TMP138]], align 8 +; CHECK-NEXT: [[TMP140:%.*]] = ptrtoint ptr [[TMP139]] to i64 +; CHECK-NEXT: [[TMP141:%.*]] = icmp sge i64 [[TMP140]], 0 +; CHECK-NEXT: [[TMP142:%.*]] = or i1 [[TMP136]], [[TMP141]] +; CHECK-NEXT: [[TMP143:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[TMP144:%.*]] = inttoptr i64 [[TMP143]] to ptr +; CHECK-NEXT: [[TMP145:%.*]] = load ptr, ptr [[TMP144]], align 8 +; CHECK-NEXT: [[TMP146:%.*]] = ptrtoint ptr [[TMP145]] to i64 +; CHECK-NEXT: [[TMP147:%.*]] = icmp sge i64 [[TMP146]], 0 +; CHECK-NEXT: [[TMP148:%.*]] = or i1 [[TMP142]], [[TMP147]] +; CHECK-NEXT: [[TMP149:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[TMP150:%.*]] = inttoptr i64 [[TMP149]] to ptr +; CHECK-NEXT: [[TMP151:%.*]] = load ptr, ptr [[TMP150]], align 8 +; CHECK-NEXT: [[TMP152:%.*]] = ptrtoint ptr [[TMP151]] to i64 +; CHECK-NEXT: [[TMP153:%.*]] = icmp sge i64 [[TMP152]], 0 +; CHECK-NEXT: [[TMP154:%.*]] = or i1 [[TMP148]], [[TMP153]] +; CHECK-NEXT: [[TMP155:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[TMP156:%.*]] = inttoptr i64 [[TMP155]] to ptr +; CHECK-NEXT: [[TMP157:%.*]] = load ptr, ptr [[TMP156]], align 8 +; CHECK-NEXT: [[TMP158:%.*]] = ptrtoint ptr [[TMP157]] to i64 +; CHECK-NEXT: [[TMP159:%.*]] = icmp sge i64 [[TMP158]], 0 +; CHECK-NEXT: [[TMP160:%.*]] = or i1 [[TMP154]], [[TMP159]] +; CHECK-NEXT: [[TMP161:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[TMP162:%.*]] = inttoptr i64 [[TMP161]] to ptr +; CHECK-NEXT: [[TMP163:%.*]] = load ptr, ptr [[TMP162]], align 8 +; CHECK-NEXT: [[TMP164:%.*]] = ptrtoint ptr [[TMP163]] to i64 +; CHECK-NEXT: [[TMP165:%.*]] = icmp sge i64 
[[TMP164]], 0 +; CHECK-NEXT: [[TMP166:%.*]] = or i1 [[TMP160]], [[TMP165]] +; CHECK-NEXT: [[TMP167:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[TMP168:%.*]] = inttoptr i64 [[TMP167]] to ptr +; CHECK-NEXT: [[TMP169:%.*]] = load ptr, ptr [[TMP168]], align 8 +; CHECK-NEXT: [[TMP170:%.*]] = ptrtoint ptr [[TMP169]] to i64 +; CHECK-NEXT: [[TMP171:%.*]] = icmp sge i64 [[TMP170]], 0 +; CHECK-NEXT: [[TMP172:%.*]] = or i1 [[TMP166]], [[TMP171]] +; CHECK-NEXT: br i1 [[TMP172]], label [[TMP173:%.*]], label [[TMP174:%.*]], !prof [[PROF0]] +; CHECK: 173: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP174]] +; CHECK: 174: +; CHECK-NEXT: br label [[TMP175]] +; CHECK: 175: +; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: call void @__tysan_instrument_with_shadow_update(ptr [[A:%.*]], ptr @__tysan_v1___ZTS1v_o_12, i1 true, i64 4, i32 2) +; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[APP_PTR_MASKED23:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED24:%.*]] = shl i64 [[APP_PTR_MASKED23]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT25:%.*]] = add i64 [[APP_PTR_SHIFTED24]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR26:%.*]] = inttoptr i64 [[SHADOW_PTR_INT25]] to ptr +; CHECK-NEXT: [[SHADOW_DESC27:%.*]] = load ptr, ptr [[SHADOW_PTR26]], align 8 +; CHECK-NEXT: [[BAD_DESC28:%.*]] = icmp ne ptr [[SHADOW_DESC27]], @__tysan_v1___ZTS1v_o_12 +; CHECK-NEXT: br i1 [[BAD_DESC28]], label [[TMP176:%.*]], label [[TMP198:%.*]], !prof [[PROF0]] +; CHECK: 176: +; CHECK-NEXT: [[TMP177:%.*]] = icmp eq ptr [[SHADOW_DESC27]], null +; CHECK-NEXT: br i1 [[TMP177]], label [[TMP178:%.*]], label [[TMP196:%.*]] +; CHECK: 178: +; CHECK-NEXT: [[TMP179:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[TMP180:%.*]] = inttoptr i64 [[TMP179]] to ptr +; CHECK-NEXT: [[TMP181:%.*]] = load ptr, ptr 
[[TMP180]], align 8 +; CHECK-NEXT: [[TMP182:%.*]] = icmp ne ptr [[TMP181]], null +; CHECK-NEXT: [[TMP183:%.*]] = or i1 false, [[TMP182]] +; CHECK-NEXT: [[TMP184:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[TMP185:%.*]] = inttoptr i64 [[TMP184]] to ptr +; CHECK-NEXT: [[TMP186:%.*]] = load ptr, ptr [[TMP185]], align 8 +; CHECK-NEXT: [[TMP187:%.*]] = icmp ne ptr [[TMP186]], null +; CHECK-NEXT: [[TMP188:%.*]] = or i1 [[TMP183]], [[TMP187]] +; CHECK-NEXT: [[TMP189:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[TMP190:%.*]] = inttoptr i64 [[TMP189]] to ptr +; CHECK-NEXT: [[TMP191:%.*]] = load ptr, ptr [[TMP190]], align 8 +; CHECK-NEXT: [[TMP192:%.*]] = icmp ne ptr [[TMP191]], null +; CHECK-NEXT: [[TMP193:%.*]] = or i1 [[TMP188]], [[TMP192]] +; CHECK-NEXT: br i1 [[TMP193]], label [[TMP194:%.*]], label [[TMP195:%.*]], !prof [[PROF0]] +; CHECK: 194: +; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2) +; CHECK-NEXT: br label [[TMP195]] +; CHECK: 195: +; CHECK-NEXT: store ptr @__tysan_v1___ZTS1v_o_12, ptr [[SHADOW_PTR26]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET29:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_PTR30:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET29]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR30]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET31:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR32:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET31]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR32]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET33:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR34:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET33]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR34]], align 8 +; CHECK-NEXT: br label [[TMP197:%.*]] +; CHECK: 196: +; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, 
ptr @__tysan_v1___ZTS1v_o_12, i32 2) +; CHECK-NEXT: br label [[TMP197]] +; CHECK: 197: +; CHECK-NEXT: br label [[TMP219:%.*]] +; CHECK: 198: +; CHECK-NEXT: [[TMP199:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[TMP200:%.*]] = inttoptr i64 [[TMP199]] to ptr +; CHECK-NEXT: [[TMP201:%.*]] = load ptr, ptr [[TMP200]], align 8 +; CHECK-NEXT: [[TMP202:%.*]] = ptrtoint ptr [[TMP201]] to i64 +; CHECK-NEXT: [[TMP203:%.*]] = icmp sge i64 [[TMP202]], 0 +; CHECK-NEXT: [[TMP204:%.*]] = or i1 false, [[TMP203]] +; CHECK-NEXT: [[TMP205:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[TMP206:%.*]] = inttoptr i64 [[TMP205]] to ptr +; CHECK-NEXT: [[TMP207:%.*]] = load ptr, ptr [[TMP206]], align 8 +; CHECK-NEXT: [[TMP208:%.*]] = ptrtoint ptr [[TMP207]] to i64 +; CHECK-NEXT: [[TMP209:%.*]] = icmp sge i64 [[TMP208]], 0 +; CHECK-NEXT: [[TMP210:%.*]] = or i1 [[TMP204]], [[TMP209]] +; CHECK-NEXT: [[TMP211:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[TMP212:%.*]] = inttoptr i64 [[TMP211]] to ptr +; CHECK-NEXT: [[TMP213:%.*]] = load ptr, ptr [[TMP212]], align 8 +; CHECK-NEXT: [[TMP214:%.*]] = ptrtoint ptr [[TMP213]] to i64 +; CHECK-NEXT: [[TMP215:%.*]] = icmp sge i64 [[TMP214]], 0 +; CHECK-NEXT: [[TMP216:%.*]] = or i1 [[TMP210]], [[TMP215]] +; CHECK-NEXT: br i1 [[TMP216]], label [[TMP217:%.*]], label [[TMP218:%.*]], !prof [[PROF0]] +; CHECK: 217: +; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2) +; CHECK-NEXT: br label [[TMP218]] +; CHECK: 218: +; CHECK-NEXT: br label [[TMP219]] +; CHECK: 219: +; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[TBAA5:![0-9]+]] +; CHECK-NEXT: ret void +; +entry: + store i32 42, ptr %a, align 4, !tbaa !6 + ret void +} + +!0 = !{!"Simple C++ TBAA"} +!1 = !{!"omnipotent char", !0, i64 0} +!2 = !{!"int", !1, i64 0} +!3 = !{!2, !2, i64 0} +!4 = !{!"_ZTS1x", !2, i64 0, !2, i64 4} +!5 = !{!"_ZTS1v", !2, i64 8, !2, i64 12, !4, i64 16} +!6 = !{!5, !2, i64 12} +;. 
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { sanitize_type } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 100000} +; CHECK: [[TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} +; CHECK: [[META2]] = !{!"int", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +; CHECK: [[META4]] = !{!"Simple C++ TBAA"} +; CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META2]], i64 12} +; CHECK: [[META6]] = !{!"_ZTS1v", [[META2]], i64 8, [[META2]], i64 12, [[META7:![0-9]+]], i64 16} +; CHECK: [[META7]] = !{!"_ZTS1x", [[META2]], i64 0, [[META2]], i64 4} +;. diff --git a/llvm/test/Instrumentation/TypeSanitizer/globals_outlined.ll b/llvm/test/Instrumentation/TypeSanitizer/globals_outlined.ll new file mode 100644 index 0000000000000..0bd7940467415 --- /dev/null +++ b/llvm/test/Instrumentation/TypeSanitizer/globals_outlined.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs +; RUN: opt -passes='tysan' -tysan-outline-instrumentation -S %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +@global1 = global i32 0, align 4 +@global2 = global i32 0, align 4 + + +; CHECK-LABEL: define internal void @__tysan_set_globals_types( +; CHECK-NEXT: %app.mem.mask = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: %shadow.base = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: call void @__tysan_set_shadow_type(ptr @global1, ptr @__tysan_v1_int, i64 4) +; CHECK-NEXT: call void @__tysan_set_shadow_type(ptr @global1, ptr @__tysan_v1_int, i64 4) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +!llvm.tysan.globals = !{!13, !14} + +!0 = !{!"Simple C++ TBAA"} +!1 = !{!"omnipotent char", !0, i64 0} +!2 = !{!"int", !1, i64 0} +!13 = !{ptr @global1, !2} +!14 = !{ptr @global1, !2} From 35a3135eaa4ae81be8aee4627410becee1da6092 Mon Sep 17 00:00:00 2001 From: 
Adrian Prantl Date: Fri, 31 Oct 2025 09:53:47 -0700 Subject: [PATCH 366/539] [LLDB] Fix ASAN tests on newer versions of macOS (#165883) macOS forbids injecting the ASAN runtime into system processes when SIP is enabled. That includes the just-built libLTO that the just-built clang injects into the system linker. Since we don't test the compiler here, just use the system (non-asanified) LTO library to make ASAN tests work for most users, including the bots. --- .../Python/lldbsuite/test/make/Makefile.rules | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index 09939e29e5b75..28cae54776ac8 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -322,6 +322,17 @@ ifeq (,$(filter $(OS), Windows_NT Android Darwin)) LDFLAGS += -pthread endif endif + +# macOS forbids injecting the ASAN runtime into system processes when +# SIP is enabled. That includes the just-built libLTO that the +# just-built clang injects into the system linker. Since we don't +# test the compiler here, just use the system (non-asanified) LTO +# library to make ASAN tests work for most users, including the bots. +ifeq "$(OS)" "Darwin" +ifneq "$(ASAN_OPTIONS)" "" +LD_FLAGS += -Wl,-lto_library -Wl,$(shell dirname $(shell xcrun -find clang))/../lib/libLTO.dylib +endif +endif OBJECTS = EXE ?= a.out From 5da1551e8a6e64e6010e937a722d8c7a70b8b080 Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Fri, 31 Oct 2025 09:56:55 -0700 Subject: [PATCH 367/539] [Hexagon] Optimize sfclass/dfclass compares (#165735) fclass intrinsics generate a sub-optimal code by doing a predicate transfer and compare. This patch optimizes out and directly uses the predicate. 
--- llvm/lib/Target/Hexagon/HexagonPatterns.td | 13 ++++ llvm/test/CodeGen/Hexagon/isel-fclass.ll | 86 ++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 llvm/test/CodeGen/Hexagon/isel-fclass.ll diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index 85ce9447c2028..e40dbd251b5b7 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -3434,6 +3434,19 @@ let AddedComplexity = 100 in { (C2_not (S4_stored_locked I32:$Rs, I64:$Rt))>; } +multiclass FloatClass { + let AddedComplexity = 100 in { + def: Pat<(i1 (seteq (IntOp RegPred:$Rs, u5_0ImmPred_timm:$u5), 0)), + (C2_not (MI RegPred:$Rs, u5_0ImmPred_timm:$u5))>; + def: Pat<(i1 (setne (IntOp RegPred:$Rs, u5_0ImmPred_timm:$u5), 0)), + (MI RegPred:$Rs, u5_0ImmPred_timm:$u5)>; + } +} + +defm : FloatClass; +defm : FloatClass; + def: Pat<(int_hexagon_instrprof_custom (HexagonAtPcrel tglobaladdr:$addr), u32_0ImmPred:$I), (PS_call_instrprof_custom tglobaladdr:$addr, imm:$I)>; diff --git a/llvm/test/CodeGen/Hexagon/isel-fclass.ll b/llvm/test/CodeGen/Hexagon/isel-fclass.ll new file mode 100644 index 0000000000000..96b02106fa807 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/isel-fclass.ll @@ -0,0 +1,86 @@ +; Tests lowering of sfclass/dfclass compares. 
+; Sub-optimal code +; { +; p0 = sfclass(r0,#16) +; r0 = sfadd(r0,r0) +; } +; { +; r2 = p0 +; } +; { +; if (p0.new) r0 = ##1065353216 +; p0 = cmp.eq(r2,#0) +; jumpr r31 +; } +; With the patterns added, we should be generating +; { +; p0 = sfclass(r0,#16) +; r0 = sfadd(r0,r0) +; } +; { +; if (!p0) r0 = ##1065353216 +; jumpr r31 +; } + +; RUN: llc -march=hexagon -stop-after=hexagon-isel %s -o - | FileCheck %s + +; CHECK: bb.0.entry1 +; CHECK: F2_sfclass +; CHECK-NOT: C2_cmp +; CHECK: C2_not +; CHECK: F2_sfadd +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +define float @test1(float noundef %x) { +entry1: + %0 = tail call i32 @llvm.hexagon.F2.sfclass(float %x, i32 16) + %tobool.not = icmp eq i32 %0, 0 + %add = fadd float %x, %x + %spec.select = select i1 %tobool.not, float 1.000000e+00, float %add + ret float %spec.select +} + +; CHECK: bb.0.entry2 +; CHECK: F2_sfclass +; CHECK-NOT: C2_cmp +; CHECK: F2_sfadd +define float @test2(float noundef %x) { +entry2: + %0 = tail call i32 @llvm.hexagon.F2.sfclass(float %x, i32 16) + %tobool.not = icmp eq i32 %0, 0 + %add = fadd float %x, %x + %spec.select = select i1 %tobool.not, float %add, float 1.000000e+00 + ret float %spec.select +} + +; CHECK: bb.0.entry3 +; CHECK: F2_dfclass +; CHECK-NOT: C2_cmp +; CHECK: C2_not +; CHECK: F2_dfadd +define double @test3(double noundef %x) { +entry3: + %0 = tail call i32 @llvm.hexagon.F2.dfclass(double %x, i32 16) + %tobool.not = icmp eq i32 %0, 0 + %add = fadd double %x, %x + %spec.select = select i1 %tobool.not, double 1.000000e+00, double %add + ret double %spec.select +} + +; CHECK: bb.0.entry4 +; CHECK: F2_dfclass +; CHECK-NOT: C2_cmp +; CHECK: F2_dfadd +define double @test4(double noundef %x) { +entry4: + %0 = tail call i32 @llvm.hexagon.F2.dfclass(double %x, i32 16) + %tobool.not = icmp eq i32 %0, 0 + %add = fadd double %x, %x + %spec.select = select i1 %tobool.not, double %add, double 1.000000e+00 + ret double %spec.select +} + +; Function 
Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare i32 @llvm.hexagon.F2.dfclass(double, i32 immarg) + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare i32 @llvm.hexagon.F2.sfclass(float, i32 immarg) From 5f3bd370d13d82f8cb11ff2171ceb22e351516ba Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 31 Oct 2025 10:05:14 -0700 Subject: [PATCH 368/539] [RISCV] Reduce number of GPRs needed by lowerSegmentSpillReload. (#165337) Previously, we kept VLENB unaltered in register and used a temporary register to shift it. Now we store the shifted value in the VLENB register and keep track of how much it has been shifted. If we need a smaller multiple of VLENB we can shift right. Fixes #165232. --- llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 31 ++- llvm/test/CodeGen/RISCV/rvv/pr165232.ll | 244 ++++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir | 12 +- 3 files changed, 269 insertions(+), 18 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/pr165232.ll diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index e9f43b9a71648..84bb29433fb3b 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -438,18 +438,19 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II, TypeSize VRegSize = OldLoc.getValue().divideCoefficientBy(NumRegs); Register VLENB = 0; - unsigned PreHandledNum = 0; + unsigned VLENBShift = 0; + unsigned PrevHandledNum = 0; unsigned I = 0; while (I != NumRegs) { auto [LMulHandled, RegClass, Opcode] = getSpillReloadInfo(NumRegs - I, RegEncoding, IsSpill); auto [RegNumHandled, _] = RISCVVType::decodeVLMUL(LMulHandled); bool IsLast = I + RegNumHandled == NumRegs; - if (PreHandledNum) { + if (PrevHandledNum) { Register Step; // Optimize for constant VLEN. 
if (auto VLEN = STI.getRealVLen()) { - int64_t Offset = *VLEN / 8 * PreHandledNum; + int64_t Offset = *VLEN / 8 * PrevHandledNum; Step = MRI.createVirtualRegister(&RISCV::GPRRegClass); STI.getInstrInfo()->movImm(MBB, II, DL, Step, Offset); } else { @@ -457,15 +458,21 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II, VLENB = MRI.createVirtualRegister(&RISCV::GPRRegClass); BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VLENB); } - uint32_t ShiftAmount = Log2_32(PreHandledNum); - if (ShiftAmount == 0) - Step = VLENB; - else { - Step = MRI.createVirtualRegister(&RISCV::GPRRegClass); - BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), Step) - .addReg(VLENB, getKillRegState(IsLast)) - .addImm(ShiftAmount); + uint32_t ShiftAmount = Log2_32(PrevHandledNum); + // To avoid using an extra register, we shift the VLENB register and + // remember how much it has been shifted. We can then use relative + // shifts to adjust to the desired shift amount. + if (VLENBShift > ShiftAmount) { + BuildMI(MBB, II, DL, TII->get(RISCV::SRLI), VLENB) + .addReg(VLENB, RegState::Kill) + .addImm(VLENBShift - ShiftAmount); + } else if (VLENBShift < ShiftAmount) { + BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VLENB) + .addReg(VLENB, RegState::Kill) + .addImm(ShiftAmount - VLENBShift); } + VLENBShift = ShiftAmount; + Step = VLENB; } BuildMI(MBB, II, DL, TII->get(RISCV::ADD), NewBase) @@ -489,7 +496,7 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II, if (IsSpill) MIB.addReg(Reg, RegState::Implicit); - PreHandledNum = RegNumHandled; + PrevHandledNum = RegNumHandled; RegEncoding += RegNumHandled; I += RegNumHandled; } diff --git a/llvm/test/CodeGen/RISCV/rvv/pr165232.ll b/llvm/test/CodeGen/RISCV/rvv/pr165232.ll new file mode 100644 index 0000000000000..bef53c6a5ae62 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/pr165232.ll @@ -0,0 +1,244 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: 
--version 6 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +define i1 @main(ptr %var_117, ptr %arrayinit.element3045, ptr %arrayinit.element3047, ptr %arrayinit.element3049, ptr %arrayinit.element3051, ptr %arrayinit.element3053, ptr %arrayinit.element3055, ptr %arrayinit.element3057, ptr %arrayinit.element3059, ptr %arrayinit.element3061, ptr %arrayinit.element3063, ptr %arrayinit.element3065, ptr %arrayinit.element3067, i64 %var_94_i.07698, target("riscv.vector.tuple", , 2) %0, target("riscv.vector.tuple", , 4) %1) { +; CHECK-LABEL: main: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr t0, vlenb +; CHECK-NEXT: slli t0, t0, 3 +; CHECK-NEXT: mv t1, t0 +; CHECK-NEXT: slli t0, t0, 1 +; CHECK-NEXT: add t0, t0, t1 +; CHECK-NEXT: sub sp, sp, t0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: sd a1, 8(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd a2, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs4r.v v12, (a1) # vscale x 32-byte Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: vs4r.v v16, (a1) # vscale x 32-byte Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t0, 56(a1) +; 
CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t1, 48(a1) +; CHECK-NEXT: vsetvli t2, zero, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t2, 40(a1) +; CHECK-NEXT: # kill: def $v10 killed $v9 killed $vtype +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t3, 32(a1) +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t4, 16(a1) +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t5, 24(a1) +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli t6, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v22, 0 +; CHECK-NEXT: vmv1r.v v14, v9 +; CHECK-NEXT: sd zero, 0(a0) +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vmv1r.v v15, v9 +; CHECK-NEXT: vmv1r.v v18, v9 +; CHECK-NEXT: li t6, 1023 +; CHECK-NEXT: vmv.v.i v26, 0 +; CHECK-NEXT: vmv1r.v v19, v9 +; CHECK-NEXT: slli t6, t6, 52 +; CHECK-NEXT: vmv.v.i v28, 0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs2r.v v22, (a1) # vscale x 16-byte Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: vs4r.v v24, (a1) # vscale x 32-byte Folded Spill +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: ld a2, 0(sp) # 8-byte Folded Reload +; CHECK-NEXT: 
vs2r.v v28, (a1) # vscale x 16-byte Folded Spill +; CHECK-NEXT: ld a1, 8(sp) # 8-byte Folded Reload +; CHECK-NEXT: vmv1r.v v20, v9 +; CHECK-NEXT: sd t6, 0(t5) +; CHECK-NEXT: vmv2r.v v16, v14 +; CHECK-NEXT: vmv2r.v v14, v12 +; CHECK-NEXT: vmv2r.v v12, v10 +; CHECK-NEXT: vmv1r.v v11, v9 +; CHECK-NEXT: vmv1r.v v21, v9 +; CHECK-NEXT: csrr t5, vlenb +; CHECK-NEXT: slli t5, t5, 3 +; CHECK-NEXT: add t5, sp, t5 +; CHECK-NEXT: addi t5, t5, 16 +; CHECK-NEXT: vs2r.v v18, (t5) # vscale x 16-byte Folded Spill +; CHECK-NEXT: csrr t6, vlenb +; CHECK-NEXT: slli t6, t6, 1 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vs2r.v v20, (t5) # vscale x 16-byte Folded Spill +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v19, 0 +; CHECK-NEXT: vmclr.m v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vmv.v.i v6, 0 +; CHECK-NEXT: .LBB0_1: # %for.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmv1r.v v20, v19 +; CHECK-NEXT: vmv1r.v v3, v19 +; CHECK-NEXT: vmv1r.v v5, v19 +; CHECK-NEXT: vmv1r.v v2, v19 +; CHECK-NEXT: vmv1r.v v31, v19 +; CHECK-NEXT: vmv1r.v v30, v19 +; CHECK-NEXT: vmv1r.v v4, v19 +; CHECK-NEXT: vmv2r.v v22, v10 +; CHECK-NEXT: vmv4r.v v24, v12 +; CHECK-NEXT: vmv2r.v v28, v16 +; CHECK-NEXT: vmv2r.v v8, v6 +; CHECK-NEXT: vmv1r.v v18, v19 +; CHECK-NEXT: vmv1r.v v21, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vle32.v v20, (t4) +; CHECK-NEXT: vle32.v v3, (t1) +; CHECK-NEXT: vle32.v v30, (a7) +; CHECK-NEXT: vle64.v v8, (a4) +; CHECK-NEXT: vle32.v v5, (t2) +; CHECK-NEXT: vle32.v v2, (t3) +; CHECK-NEXT: vle32.v v31, (a6) +; CHECK-NEXT: vmv1r.v v24, v30 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vmflt.vv v21, v8, v6, v0.t +; CHECK-NEXT: vmv1r.v v8, v19 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu +; CHECK-NEXT: vle32.v v18, (a2) +; CHECK-NEXT: vle32.v v8, (a3) +; CHECK-NEXT: vle32.v v4, (a5) +; CHECK-NEXT: vmv1r.v v22, v20 +; CHECK-NEXT: 
csrr t5, vlenb +; CHECK-NEXT: slli t5, t5, 3 +; CHECK-NEXT: add t5, sp, t5 +; CHECK-NEXT: addi t5, t5, 16 +; CHECK-NEXT: vl1r.v v1, (t5) # vscale x 8-byte Folded Reload +; CHECK-NEXT: csrr t6, vlenb +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vl2r.v v2, (t5) # vscale x 16-byte Folded Reload +; CHECK-NEXT: slli t6, t6, 1 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vl1r.v v4, (t5) # vscale x 8-byte Folded Reload +; CHECK-NEXT: vsseg4e32.v v1, (zero) +; CHECK-NEXT: vsseg8e32.v v22, (a1) +; CHECK-NEXT: vmv1r.v v0, v21 +; CHECK-NEXT: vssub.vv v8, v19, v18, v0.t +; CHECK-NEXT: csrr t5, vlenb +; CHECK-NEXT: slli t5, t5, 2 +; CHECK-NEXT: mv t6, t5 +; CHECK-NEXT: slli t5, t5, 1 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: add t5, sp, t5 +; CHECK-NEXT: addi t5, t5, 16 +; CHECK-NEXT: vl4r.v v20, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: vsetvli zero, t0, e64, m2, ta, ma +; CHECK-NEXT: vsseg2e64.v v20, (zero) +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: addi t5, sp, 16 +; CHECK-NEXT: vl4r.v v20, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: csrr t6, vlenb +; CHECK-NEXT: slli t6, t6, 2 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vl4r.v v24, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: vsetivli zero, 0, e64, m2, ta, ma +; CHECK-NEXT: vsseg4e64.v v20, (zero), v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg8e32.v v8, (a0) +; CHECK-NEXT: csrr t5, vlenb +; CHECK-NEXT: slli t5, t5, 4 +; CHECK-NEXT: add t5, sp, t5 +; CHECK-NEXT: addi t5, t5, 16 +; CHECK-NEXT: vl4r.v v20, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: csrr t6, vlenb +; CHECK-NEXT: slli t6, t6, 2 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vl4r.v v24, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vsseg4e64.v v20, (zero) +; CHECK-NEXT: j .LBB0_1 +entry: + store double 0.000000e+00, ptr %var_117, align 8 + store double 1.000000e+00, ptr %arrayinit.element3061, align 8 + br label %for.body + 
+for.body: ; preds = %for.body, %entry + %2 = call @llvm.riscv.vle.nxv2f32.p0.i64( zeroinitializer, ptr %arrayinit.element3059, i64 0) + %3 = call @llvm.riscv.vle.nxv2f32.p0.i64( zeroinitializer, ptr %arrayinit.element3067, i64 0) + %4 = call @llvm.riscv.vle.nxv2f32.p0.i64( zeroinitializer, ptr %arrayinit.element3065, i64 0) + %5 = call @llvm.riscv.vle.nxv2f32.p0.i64( zeroinitializer, ptr %arrayinit.element3063, i64 0) + %6 = call @llvm.riscv.vle.nxv2f32.p0.i64( zeroinitializer, ptr %arrayinit.element3055, i64 0) + %7 = call @llvm.riscv.vle.nxv2f32.p0.i64( zeroinitializer, ptr %arrayinit.element3057, i64 0) + %8 = call @llvm.riscv.vle.nxv2f32.p0.i64( zeroinitializer, ptr %arrayinit.element3053, i64 0) + %9 = call @llvm.riscv.vle.nxv2f64.p0.i64( zeroinitializer, ptr %arrayinit.element3051, i64 0) + %10 = tail call @llvm.riscv.vle.nxv2i32.p0.i64( zeroinitializer, ptr %arrayinit.element3047, i64 0) + %11 = tail call @llvm.riscv.vle.nxv2i32.p0.i64( zeroinitializer, ptr %arrayinit.element3049, i64 0) + call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i64(target("riscv.vector.tuple", , 4) zeroinitializer, ptr null, i64 0, i64 5) + %12 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", , 8) zeroinitializer, %8, i32 0) + %13 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", , 8) %12, %7, i32 2) + %14 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", , 8) %13, %6, i32 0) + %15 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", , 8) %14, %5, i32 0) + %16 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", , 8) %15, %4, i32 0) + %17 = 
tail call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", , 8) %16, %3, i32 0) + %18 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", , 8) %17, %2, i32 0) + call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", , 8) %18, ptr %arrayinit.element3045, i64 0, i64 5) + %19 = tail call @llvm.riscv.vmfgt.mask.nxv2f64.nxv2f64.i64( zeroinitializer, zeroinitializer, %9, zeroinitializer, i64 0) + %20 = tail call @llvm.riscv.vssub.mask.nxv2i32.nxv2i32.i64( %11, zeroinitializer, %10, %19, i64 0, i64 0) + call void @llvm.riscv.vsseg2.triscv.vector.tuple_nxv16i8_2t.p0.i64(target("riscv.vector.tuple", , 2) %0, ptr null, i64 %var_94_i.07698, i64 6) + call void @llvm.riscv.vsseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv2i1.i64(target("riscv.vector.tuple", , 4) zeroinitializer, ptr null, zeroinitializer, i64 0, i64 6) + %21 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, %20, i32 0) + call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", , 8) %21, ptr %var_117, i64 0, i64 5) + call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv16i8_4t.p0.i64(target("riscv.vector.tuple", , 4) %1, ptr null, i64 0, i64 6) + br label %for.body +} diff --git a/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir b/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir index dd9960d17af43..9c2fa9d0009a7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir +++ b/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir @@ -32,10 +32,10 @@ body: | ; CHECK-NEXT: $x11 = ADDI $x2, 16 ; CHECK-NEXT: VS4R_V $v0m4, $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store () into %stack.0, align 8) ; CHECK-NEXT: $x12 = PseudoReadVLENB - ; CHECK-NEXT: $x13 = SLLI $x12, 2 - ; CHECK-NEXT: $x11 = ADD 
killed $x11, killed $x13 + ; CHECK-NEXT: $x12 = SLLI killed $x12, 2 + ; CHECK-NEXT: $x11 = ADD killed $x11, $x12 ; CHECK-NEXT: VS2R_V $v4m2, $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store () into %stack.0, align 8) - ; CHECK-NEXT: $x12 = SLLI killed $x12, 1 + ; CHECK-NEXT: $x12 = SRLI killed $x12, 1 ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x12 ; CHECK-NEXT: VS1R_V $v6, killed $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store () into %stack.0) ; CHECK-NEXT: $x11 = ADDI $x2, 16 @@ -93,10 +93,10 @@ body: | ; CHECK-NEXT: $x11 = ADDI $x2, 16 ; CHECK-NEXT: $v10m2 = VL2RE8_V $x11 :: (load () from %stack.0, align 8) ; CHECK-NEXT: $x12 = PseudoReadVLENB - ; CHECK-NEXT: $x13 = SLLI $x12, 1 - ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x13 + ; CHECK-NEXT: $x12 = SLLI killed $x12, 1 + ; CHECK-NEXT: $x11 = ADD killed $x11, $x12 ; CHECK-NEXT: $v12m4 = VL4RE8_V $x11 :: (load () from %stack.0, align 8) - ; CHECK-NEXT: $x12 = SLLI killed $x12, 2 + ; CHECK-NEXT: $x12 = SLLI killed $x12, 1 ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x12 ; CHECK-NEXT: $v16 = VL1RE8_V killed $x11 :: (load () from %stack.0) ; CHECK-NEXT: VS1R_V killed $v10, killed renamable $x10 From 6ce315a770811ea237ae7eabc6e35b97443b53a8 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 31 Oct 2025 10:06:55 -0700 Subject: [PATCH 369/539] [RISCV] Fix misuse of EVT::changeVectorElementType() in legalizeScatterGatherIndexType. (#165829) This function doesn't work well when the type is a SimpleVT, but the changed type isn't. We need an LLVMContext to make an non-SimpleVT, but there's nowhere to get it from. Fix this by using EVT::getVectorVT instead. In the added test, v7i8 is a SimpleVT, but v7i64 is not. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 +- .../RISCV/rvv/fixed-vectors-masked-gather.ll | 253 ++++++++++++++++++ 2 files changed, 256 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 56881f71934c4..c6a8b8481c94f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -19794,7 +19794,9 @@ legalizeScatterGatherIndexType(SDLoc DL, SDValue &Index, // LLVM's legalization take care of the splitting. // FIXME: LLVM can't split VP_GATHER or VP_SCATTER yet. Index = DAG.getNode(ISD::SIGN_EXTEND, DL, - IndexVT.changeVectorElementType(XLenVT), Index); + EVT::getVectorVT(*DAG.getContext(), XLenVT, + IndexVT.getVectorElementCount()), + Index); } IndexType = ISD::UNSIGNED_SCALED; return true; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 4c35b2506d3e4..7e6f2c76e5881 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -15265,6 +15265,259 @@ define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) { ret <4 x i32> %x } +define <7 x i8> @mgather_baseidx_v7i8(ptr %base, <7 x i8> %idxs, <7 x i1> %m, <7 x i8> %passthru) { +; RV32-LABEL: mgather_baseidx_v7i8: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 127 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.s.x v10, a1 +; RV32-NEXT: vmand.mm v0, v0, v10 +; RV32-NEXT: vsext.vf4 v10, v8 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV32-NEXT: vluxei32.v v9, (a0), v10, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64V-LABEL: mgather_baseidx_v7i8: +; RV64V: # %bb.0: +; RV64V-NEXT: li a1, 127 +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64V-NEXT: vmv.s.x v10, a1 +; RV64V-NEXT: vmand.mm v0, v0, v10 +; RV64V-NEXT: vsext.vf8 v12, v8 +; RV64V-NEXT: vsetvli zero, 
zero, e8, mf2, ta, mu +; RV64V-NEXT: vluxei64.v v9, (a0), v12, v0.t +; RV64V-NEXT: vmv1r.v v8, v9 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v7i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: .cfi_remember_state +; RV64ZVE32F-NEXT: li a1, 64 +; RV64ZVE32F-NEXT: addi a2, sp, 8 +; RV64ZVE32F-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; RV64ZVE32F-NEXT: vsm.v v0, (a2) +; RV64ZVE32F-NEXT: ld a1, 8(sp) +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.v.x v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB132_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi 
v11, v9, 2 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lbu a3, 0(a3) +; RV64ZVE32F-NEXT: vmv.v.x v10, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB132_4: # %else2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 3 +; RV64ZVE32F-NEXT: vmv.x.s a4, v11 +; RV64ZVE32F-NEXT: vmv.v.x v11, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 4 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: 
vslide1down.vx v9, v11, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB132_6: # %else5 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vmv.x.s a4, v11 +; RV64ZVE32F-NEXT: vmv.v.x v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 5 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lbu a3, 0(a3) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v11, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB132_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB132_13 +; RV64ZVE32F-NEXT: # %bb.9: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB132_14 +; RV64ZVE32F-NEXT: .LBB132_10: # %else14 +; RV64ZVE32F-NEXT: andi a1, a1, 64 +; RV64ZVE32F-NEXT: beqz a1, .LBB132_12 +; RV64ZVE32F-NEXT: .LBB132_11: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; 
RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: add a0, a0, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: lbu a0, 0(a0) +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: .LBB132_12: # %else17 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 0 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB132_13: # %cond.load10 +; RV64ZVE32F-NEXT: .cfi_restore_state +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 +; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v10, 
v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_10 +; RV64ZVE32F-NEXT: .LBB132_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vmv.x.s a4, v11 +; RV64ZVE32F-NEXT: vmv.v.x v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lbu a3, 0(a3) +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v11, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: andi a1, a1, 64 +; RV64ZVE32F-NEXT: bnez a1, .LBB132_11 +; RV64ZVE32F-NEXT: j .LBB132_12 + %ptrs = getelementptr inbounds i8, ptr %base, <7 x i8> %idxs + %v = call <7 x i8> @llvm.masked.gather.v7i8.v7p0(<7 x ptr> %ptrs, i32 1, <7 x i1> %m, <7 x i8> %passthru) + ret <7 x i8> %v +} + ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; RV32V-ZVFH: {{.*}} ; RV32V-ZVFHMIN: {{.*}} From df61153709cbee648ab74e352e77bd7d39e41116 Mon Sep 17 00:00:00 2001 From: Alireza Torabian Date: Fri, 31 Oct 2025 13:19:20 -0400 Subject: [PATCH 370/539] [DA] Check for overflow in strong SIV test (#164704) Rely on the product of `UpperBound` and `AbsCoeff` only if SCEV can prove that there is no overflow. Also the same about the result of the subtraction of `DstConst` from `SrcConst` to calculate `Delta`. --- llvm/lib/Analysis/DependenceAnalysis.cpp | 19 +++- .../SimpleSIVNoValidityCheck.ll | 2 +- .../Analysis/DependenceAnalysis/StrongSIV.ll | 86 ++++++++++++++++--- 3 files changed, 90 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 11d829492a10e..e45d1f79b3165 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -1587,6 +1587,15 @@ static const SCEV *minusSCEVNoSignedOverflow(const SCEV *A, const SCEV *B, return nullptr; } +/// Returns \p A * \p B if it guaranteed not to signed wrap. Otherwise returns +/// nullptr. \p A and \p B must have the same integer type. +static const SCEV *mulSCEVNoSignedOverflow(const SCEV *A, const SCEV *B, + ScalarEvolution &SE) { + if (SE.willNotOverflow(Instruction::Mul, /*Signed=*/true, A, B)) + return SE.getMulExpr(A, B); + return nullptr; +} + /// Returns the absolute value of \p A. In the context of dependence analysis, /// we need an absolute value in a mathematical sense. If \p A is the signed /// minimum value, we cannot represent it unless extending the original type. 
@@ -1686,7 +1695,11 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, assert(0 < Level && Level <= CommonLevels && "level out of range"); Level--; - const SCEV *Delta = SE->getMinusSCEV(SrcConst, DstConst); + const SCEV *Delta = minusSCEVNoSignedOverflow(SrcConst, DstConst, *SE); + if (!Delta) { + Result.Consistent = false; + return false; + } LLVM_DEBUG(dbgs() << "\t Delta = " << *Delta); LLVM_DEBUG(dbgs() << ", " << *Delta->getType() << "\n"); @@ -1702,7 +1715,9 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, const SCEV *AbsCoeff = absSCEVNoSignedOverflow(Coeff, *SE); if (!AbsDelta || !AbsCoeff) return false; - const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff); + const SCEV *Product = mulSCEVNoSignedOverflow(UpperBound, AbsCoeff, *SE); + if (!Product) + return false; return isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product); }(); if (IsDeltaLarge) { diff --git a/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll b/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll index 4346507ba8f90..181a4494b036e 100644 --- a/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll +++ b/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll @@ -210,7 +210,7 @@ define void @t3(i64 %n, i64 %m, i64 %lb, ptr %a) { ; CHECK-NEXT: Src: %2 = load i32, ptr %arrayidx6, align 4 --> Dst: %2 = load i32, ptr %arrayidx6, align 4 ; CHECK-NEXT: da analyze - none! ; CHECK-NEXT: Src: %2 = load i32, ptr %arrayidx6, align 4 --> Dst: store i32 %2, ptr %arrayidx8, align 4 -; CHECK-NEXT: da analyze - consistent anti [1 -2]! +; CHECK-NEXT: da analyze - anti [1 *]! ; CHECK-NEXT: Src: store i32 %2, ptr %arrayidx8, align 4 --> Dst: store i32 %2, ptr %arrayidx8, align 4 ; CHECK-NEXT: da analyze - none! 
; diff --git a/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll b/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll index 44bd9b7727910..160196284f415 100644 --- a/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll +++ b/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ -; RUN: | FileCheck %s +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ALL +; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa -da-enable-dependence-test=strong-siv 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-STRONG-SIV target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.6.0" @@ -423,19 +425,33 @@ for.end: ; preds = %for.body ;; *B++ = A[i + 2*n]; define void @strong9(ptr %A, ptr %B, i64 %n) nounwind uwtable ssp { -; CHECK-LABEL: 'strong9' -; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 -; CHECK-NEXT: da analyze - none! -; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 -; CHECK-NEXT: da analyze - none! -; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 -; CHECK-NEXT: da analyze - confused! -; CHECK-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 -; CHECK-NEXT: da analyze - none! -; CHECK-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 -; CHECK-NEXT: da analyze - confused! -; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.02, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 -; CHECK-NEXT: da analyze - none! 
+; CHECK-ALL-LABEL: 'strong9' +; CHECK-ALL-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-ALL-NEXT: da analyze - confused! +; CHECK-ALL-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-ALL-NEXT: da analyze - confused! +; CHECK-ALL-NEXT: Src: store i32 %0, ptr %B.addr.02, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-ALL-NEXT: da analyze - none! +; +; CHECK-STRONG-SIV-LABEL: 'strong9' +; CHECK-STRONG-SIV-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - none! +; CHECK-STRONG-SIV-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - flow [*|<]! +; CHECK-STRONG-SIV-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - confused! +; CHECK-STRONG-SIV-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - none! +; CHECK-STRONG-SIV-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - confused! +; CHECK-STRONG-SIV-NEXT: Src: store i32 %0, ptr %B.addr.02, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - none! 
; entry: %cmp1 = icmp eq i64 %n, 0 @@ -512,3 +528,45 @@ for.body: ; preds = %entry, %for.body for.end: ; preds = %for.body ret void } + + +;; for (long unsigned i = 0; i < 9223372036854775806; i++) +;; for (long unsigned j = 0; j < 2147483640; j++) +;; if (i < 3000000000) +;; A[i] = 0; +; +; FIXME: DependenceAnalysis fails to detect the dependency between A[i] and +; itself, while Strong SIV has been able to prove it. +define void @strong11(ptr %A) nounwind uwtable ssp { +; CHECK-ALL-LABEL: 'strong11' +; CHECK-ALL-NEXT: Src: store i32 0, ptr %arrayidx, align 4 --> Dst: store i32 0, ptr %arrayidx, align 4 +; CHECK-ALL-NEXT: da analyze - none! +; +; CHECK-STRONG-SIV-LABEL: 'strong11' +; CHECK-STRONG-SIV-NEXT: Src: store i32 0, ptr %arrayidx, align 4 --> Dst: store i32 0, ptr %arrayidx, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - consistent output [0 S]! +; +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.cond.cleanup3 + %i.017 = phi i64 [ 0, %entry ], [ %inc8, %for.cond.cleanup3 ] + %cmp5 = icmp samesign ult i64 %i.017, 3000000000 + %arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %i.017 + br i1 %cmp5, label %for.body4.us, label %for.cond.cleanup3 + +for.body4.us: ; preds = %for.cond1.preheader, %for.body4.us + %j.016.us = phi i64 [ %inc.us, %for.body4.us ], [ 0, %for.cond1.preheader ] + store i32 0, ptr %arrayidx, align 4 + %inc.us = add nuw nsw i64 %j.016.us, 1 + %exitcond.not = icmp eq i64 %inc.us, 2147483640 + br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4.us + +for.cond.cleanup: ; preds = %for.cond.cleanup3 + ret void + +for.cond.cleanup3: ; preds = %for.body4.us, %for.cond1.preheader + %inc8 = add nuw nsw i64 %i.017, 1 + %exitcond19.not = icmp eq i64 %inc8, 9223372036854775806 + br i1 %exitcond19.not, label %for.cond.cleanup, label %for.cond1.preheader +} From 306b54518cd44f6f5dc0cfaf1b4094714b718d58 Mon Sep 17 00:00:00 2001 From: Andre Kuhlenschmidt Date: Fri, 31 Oct 2025 10:26:27 -0700 Subject: 
[PATCH 371/539] [flang][semantics] add semantic check that STAT and ERRMSG are not (de)allocated by same statement (#164529) Almost all compilers statically error on the following case even though it isn't a numbered constraint. Now we do to instead segfaulting at runtime. ```fortran integer,pointer:: i allocate(i,stat=i) end ``` --- flang/lib/Semantics/check-allocate.cpp | 33 ++++++- flang/lib/Semantics/check-allocate.h | 1 + flang/lib/Semantics/check-deallocate.cpp | 111 +++++++++++++---------- flang/test/Semantics/allocate14.f90 | 56 ++++++++++++ 4 files changed, 154 insertions(+), 47 deletions(-) create mode 100644 flang/test/Semantics/allocate14.f90 diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp index e019bbdfa27f6..a411e20557456 100644 --- a/flang/lib/Semantics/check-allocate.cpp +++ b/flang/lib/Semantics/check-allocate.cpp @@ -26,6 +26,10 @@ struct AllocateCheckerInfo { std::optional sourceExprType; std::optional sourceExprLoc; std::optional typeSpecLoc; + std::optional statSource; + std::optional msgSource; + const SomeExpr *statVar{nullptr}; + const SomeExpr *msgVar{nullptr}; int sourceExprRank{0}; // only valid if gotMold || gotSource bool gotStat{false}; bool gotMsg{false}; @@ -141,12 +145,15 @@ static std::optional CheckAllocateOptions( [&](const parser::StatOrErrmsg &statOrErr) { common::visit( common::visitors{ - [&](const parser::StatVariable &) { + [&](const parser::StatVariable &var) { if (info.gotStat) { // C943 context.Say( "STAT may not be duplicated in a ALLOCATE statement"_err_en_US); } info.gotStat = true; + info.statVar = GetExpr(context, var); + info.statSource = + parser::Unwrap(var)->GetSource(); }, [&](const parser::MsgVariable &var) { WarnOnDeferredLengthCharacterScalar(context, @@ -159,6 +166,9 @@ static std::optional CheckAllocateOptions( "ERRMSG may not be duplicated in a ALLOCATE statement"_err_en_US); } info.gotMsg = true; + info.msgVar = GetExpr(context, var); + info.msgSource = + 
parser::Unwrap(var)->GetSource(); }, }, statOrErr.u); @@ -460,6 +470,16 @@ static bool HaveCompatibleLengths( } } +bool AreSameAllocation(const SomeExpr *root, const SomeExpr *path) { + if (root && path) { + // For now we just use equality of expressions. If we implement a more + // sophisticated alias analysis we should use it here. + return *root == *path; + } else { + return false; + } +} + bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) { if (!ultimate_) { CHECK(context.AnyFatalError()); @@ -690,6 +710,17 @@ bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) { "Object in ALLOCATE must have DEVICE attribute when STREAM option is specified"_err_en_US); } } + + if (const SomeExpr *allocObj{GetExpr(context, allocateObject_)}) { + if (AreSameAllocation(allocObj, allocateInfo_.statVar)) { + context.Say(allocateInfo_.statSource.value_or(name_.source), + "STAT variable in ALLOCATE must not be the variable being allocated"_err_en_US); + } + if (AreSameAllocation(allocObj, allocateInfo_.msgVar)) { + context.Say(allocateInfo_.msgSource.value_or(name_.source), + "ERRMSG variable in ALLOCATE must not be the variable being allocated"_err_en_US); + } + } return RunCoarrayRelatedChecks(context); } diff --git a/flang/lib/Semantics/check-allocate.h b/flang/lib/Semantics/check-allocate.h index e3f7f07bca5b7..54f7380bc3fe8 100644 --- a/flang/lib/Semantics/check-allocate.h +++ b/flang/lib/Semantics/check-allocate.h @@ -24,5 +24,6 @@ class AllocateChecker : public virtual BaseChecker { private: SemanticsContext &context_; }; +bool AreSameAllocation(const SomeExpr *root, const SomeExpr *path); } // namespace Fortran::semantics #endif // FORTRAN_SEMANTICS_CHECK_ALLOCATE_H_ diff --git a/flang/lib/Semantics/check-deallocate.cpp b/flang/lib/Semantics/check-deallocate.cpp index c1ebc5f4c0ec2..e6ce1b30a59f5 100644 --- a/flang/lib/Semantics/check-deallocate.cpp +++ b/flang/lib/Semantics/check-deallocate.cpp @@ -7,51 +7,87 @@ 
//===----------------------------------------------------------------------===// #include "check-deallocate.h" +#include "check-allocate.h" #include "definable.h" #include "flang/Evaluate/type.h" #include "flang/Parser/message.h" #include "flang/Parser/parse-tree.h" #include "flang/Semantics/expression.h" #include "flang/Semantics/tools.h" +#include namespace Fortran::semantics { void DeallocateChecker::Leave(const parser::DeallocateStmt &deallocateStmt) { + bool gotStat{false}, gotMsg{false}; + const SomeExpr *statVar{nullptr}, *msgVar{nullptr}; + std::optional statSource; + std::optional msgSource; + for (const parser::StatOrErrmsg &deallocOpt : + std::get>(deallocateStmt.t)) { + common::visit( + common::visitors{ + [&](const parser::StatVariable &var) { + if (gotStat) { + context_.Say( + "STAT may not be duplicated in a DEALLOCATE statement"_err_en_US); + } + gotStat = true; + statVar = GetExpr(context_, var); + statSource = parser::Unwrap(var)->GetSource(); + }, + [&](const parser::MsgVariable &var) { + WarnOnDeferredLengthCharacterScalar(context_, + GetExpr(context_, var), + parser::UnwrapRef(var).GetSource(), + "ERRMSG="); + if (gotMsg) { + context_.Say( + "ERRMSG may not be duplicated in a DEALLOCATE statement"_err_en_US); + } + gotMsg = true; + msgVar = GetExpr(context_, var); + msgSource = parser::Unwrap(var)->GetSource(); + }, + }, + deallocOpt.u); + } for (const parser::AllocateObject &allocateObject : std::get>(deallocateStmt.t)) { + parser::CharBlock source; common::visit( common::visitors{ [&](const parser::Name &name) { const Symbol *symbol{ name.symbol ? 
&name.symbol->GetUltimate() : nullptr}; - ; + source = name.source; if (context_.HasError(symbol)) { // already reported an error } else if (!IsVariableName(*symbol)) { - context_.Say(name.source, + context_.Say(source, "Name in DEALLOCATE statement must be a variable name"_err_en_US); } else if (!IsAllocatableOrObjectPointer(symbol)) { // C936 - context_.Say(name.source, + context_.Say(source, "Name in DEALLOCATE statement must have the ALLOCATABLE or POINTER attribute"_err_en_US); - } else if (auto whyNot{WhyNotDefinable(name.source, - context_.FindScope(name.source), - {DefinabilityFlag::PointerDefinition, - DefinabilityFlag::AcceptAllocatable, - DefinabilityFlag::PotentialDeallocation}, - *symbol)}) { + } else if (auto whyNot{ + WhyNotDefinable(source, context_.FindScope(source), + {DefinabilityFlag::PointerDefinition, + DefinabilityFlag::AcceptAllocatable, + DefinabilityFlag::PotentialDeallocation}, + *symbol)}) { // Catch problems with non-definability of the // pointer/allocatable context_ - .Say(name.source, + .Say(source, "Name in DEALLOCATE statement is not definable"_err_en_US) .Attach(std::move( whyNot->set_severity(parser::Severity::Because))); - } else if (auto whyNot{WhyNotDefinable(name.source, - context_.FindScope(name.source), - DefinabilityFlags{}, *symbol)}) { + } else if (auto whyNot{ + WhyNotDefinable(source, context_.FindScope(source), + DefinabilityFlags{}, *symbol)}) { // Catch problems with non-definability of the dynamic object context_ - .Say(name.source, + .Say(source, "Object in DEALLOCATE statement is not deallocatable"_err_en_US) .Attach(std::move( whyNot->set_severity(parser::Severity::Because))); @@ -62,13 +98,12 @@ void DeallocateChecker::Leave(const parser::DeallocateStmt &deallocateStmt) { [&](const parser::StructureComponent &structureComponent) { // Only perform structureComponent checks if it was successfully // analyzed by expression analysis. 
- auto source{structureComponent.component.source}; + source = structureComponent.component.source; if (const auto *expr{GetExpr(context_, allocateObject)}) { - if (const Symbol * - symbol{structureComponent.component.symbol - ? &structureComponent.component.symbol - ->GetUltimate() - : nullptr}; + if (const Symbol *symbol{structureComponent.component.symbol + ? &structureComponent.component.symbol + ->GetUltimate() + : nullptr}; !IsAllocatableOrObjectPointer(symbol)) { // F'2023 C936 context_.Say(source, "Component in DEALLOCATE statement must have the ALLOCATABLE or POINTER attribute"_err_en_US); @@ -99,32 +134,16 @@ void DeallocateChecker::Leave(const parser::DeallocateStmt &deallocateStmt) { }, }, allocateObject.u); - } - bool gotStat{false}, gotMsg{false}; - for (const parser::StatOrErrmsg &deallocOpt : - std::get>(deallocateStmt.t)) { - common::visit( - common::visitors{ - [&](const parser::StatVariable &) { - if (gotStat) { - context_.Say( - "STAT may not be duplicated in a DEALLOCATE statement"_err_en_US); - } - gotStat = true; - }, - [&](const parser::MsgVariable &var) { - WarnOnDeferredLengthCharacterScalar(context_, - GetExpr(context_, var), - parser::UnwrapRef(var).GetSource(), - "ERRMSG="); - if (gotMsg) { - context_.Say( - "ERRMSG may not be duplicated in a DEALLOCATE statement"_err_en_US); - } - gotMsg = true; - }, - }, - deallocOpt.u); + if (const SomeExpr *allocObj{GetExpr(context_, allocateObject)}) { + if (AreSameAllocation(allocObj, statVar)) { + context_.Say(statSource.value_or(source), + "STAT variable in DEALLOCATE must not be the variable being deallocated"_err_en_US); + } + if (AreSameAllocation(allocObj, msgVar)) { + context_.Say(msgSource.value_or(source), + "ERRMSG variable in DEALLOCATE must not be the variable being deallocated"_err_en_US); + } + } } } diff --git a/flang/test/Semantics/allocate14.f90 b/flang/test/Semantics/allocate14.f90 new file mode 100644 index 0000000000000..a97cf5ad88b08 --- /dev/null +++ 
b/flang/test/Semantics/allocate14.f90 @@ -0,0 +1,56 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +! Check for semantic errors in ALLOCATE statements + +program allocate14 + + integer, allocatable :: i1, i2 + character(200), allocatable :: msg1, msg2 + type t + integer, allocatable :: i + character(10), allocatable :: msg + end type t + type(t) :: tt(2) + type(t), allocatable :: ts(:) + + allocate(i1) + allocate(msg1) + + allocate(i2, stat=i1, errmsg=msg1) + allocate(msg2, stat=i1, errmsg=msg1) + deallocate(i2, stat=i1, errmsg=msg1) + deallocate(msg2, stat=i1, errmsg=msg1) + + !ERROR: STAT variable in ALLOCATE must not be the variable being allocated + allocate(i2, stat=i2, errmsg=msg2) + !ERROR: ERRMSG variable in ALLOCATE must not be the variable being allocated + allocate(msg2, stat=i2, errmsg=msg2) + !ERROR: STAT variable in DEALLOCATE must not be the variable being deallocated + deallocate(i2, stat=i2, errmsg=msg2) + !ERROR: ERRMSG variable in DEALLOCATE must not be the variable being deallocated + deallocate(msg2, stat=i2, errmsg=msg2) + + allocate(tt(1)%i) + allocate(tt(1)%msg) + + allocate(tt(2)%i, stat=tt(1)%i, errmsg=tt(1)%msg) + allocate(tt(2)%msg, stat=tt(1)%i, errmsg=tt(1)%msg) + deallocate(tt(2)%i, stat=tt(1)%i, errmsg=tt(1)%msg) + deallocate(tt(2)%msg, stat=tt(1)%i, errmsg=tt(1)%msg) + + !ERROR: STAT variable in ALLOCATE must not be the variable being allocated + allocate(tt(2)%i, stat=tt(2)%i, errmsg=tt(2)%msg) + !ERROR: ERRMSG variable in ALLOCATE must not be the variable being allocated + allocate(tt(2)%msg, stat=tt(2)%i, errmsg=tt(2)%msg) + !ERROR: STAT variable in DEALLOCATE must not be the variable being deallocated + deallocate(tt(2)%i, stat=tt(2)%i, errmsg=tt(2)%msg) + !ERROR: ERRMSG variable in DEALLOCATE must not be the variable being deallocated + deallocate(tt(2)%msg, stat=tt(2)%i, errmsg=tt(2)%msg) + + !TODO: STAT variable in ALLOCATE must not be the variable being allocated + !TODO: ERRMSG variable in ALLOCATE must not be the 
variable being allocated + allocate(ts(10), stat=ts(1)%i, errmsg=ts(1)%msg) + !TODO: STAT variable in DEALLOCATE must not be the variable being deallocated + !TODO: ERRMSG variable in DEALLOCATE must not be the variable being deallocated + deallocate(ts, stat=ts(1)%i, errmsg=ts(1)%msg) +end program + From 8914a8a8141bb5b283fbbe571ed9e65230f67eb0 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Fri, 31 Oct 2025 10:26:49 -0700 Subject: [PATCH 372/539] [flang] Better folding warning due to hidden conversion (#165430) When folding intrinsic function calls for types like REAL(2) that don't have host math library support, we convert them to a type that has greater range and precision, call a host math library routine that does exist, and convert the result back to the original result type. The folding of this second conversion can elicit floating-point warnings (usually overflow) that are somewhat unclear to the user. Add support for adding contextual information to these warnings. --- flang/include/flang/Evaluate/common.h | 8 +++++++- flang/lib/Evaluate/common.cpp | 18 +++++++++++------- flang/lib/Evaluate/fold-implementation.h | 14 +++++++------- flang/lib/Evaluate/host.cpp | 4 ++-- flang/lib/Evaluate/intrinsics-library.cpp | 4 +++- flang/lib/Semantics/expression.cpp | 2 +- flang/test/Evaluate/folding33.f90 | 4 ++++ 7 files changed, 35 insertions(+), 19 deletions(-) create mode 100644 flang/test/Evaluate/folding33.f90 diff --git a/flang/include/flang/Evaluate/common.h b/flang/include/flang/Evaluate/common.h index 0263f15d4215e..3d220afa71718 100644 --- a/flang/include/flang/Evaluate/common.h +++ b/flang/include/flang/Evaluate/common.h @@ -303,10 +303,16 @@ class FoldingContext { return common::ScopedSet(analyzingPDTComponentKindSelector_, true); } + common::Restorer SetRealFlagWarningContext(std::string str) { + return common::ScopedSet(realFlagWarningContext_, str); + } + parser::CharBlock SaveTempName(std::string &&name) { return 
{*tempNames_.emplace(std::move(name)).first}; } + void RealFlagWarnings(const RealFlags &, const char *op); + private: parser::ContextualMessages messages_; const common::IntrinsicTypeDefaultKinds &defaults_; @@ -318,8 +324,8 @@ class FoldingContext { std::map impliedDos_; const common::LanguageFeatureControl &languageFeatures_; std::set &tempNames_; + std::string realFlagWarningContext_; }; -void RealFlagWarnings(FoldingContext &, const RealFlags &, const char *op); } // namespace Fortran::evaluate #endif // FORTRAN_EVALUATE_COMMON_H_ diff --git a/flang/lib/Evaluate/common.cpp b/flang/lib/Evaluate/common.cpp index 46c75a5c2ee44..ed6a0ef93b0db 100644 --- a/flang/lib/Evaluate/common.cpp +++ b/flang/lib/Evaluate/common.cpp @@ -13,24 +13,28 @@ using namespace Fortran::parser::literals; namespace Fortran::evaluate { -void RealFlagWarnings( - FoldingContext &context, const RealFlags &flags, const char *operation) { +void FoldingContext::RealFlagWarnings( + const RealFlags &flags, const char *operation) { static constexpr auto warning{common::UsageWarning::FoldingException}; if (flags.test(RealFlag::Overflow)) { - context.Warn(warning, "overflow on %s"_warn_en_US, operation); + Warn(warning, "overflow on %s%s"_warn_en_US, operation, + realFlagWarningContext_); } if (flags.test(RealFlag::DivideByZero)) { if (std::strcmp(operation, "division") == 0) { - context.Warn(warning, "division by zero"_warn_en_US); + Warn(warning, "division by zero%s"_warn_en_US, realFlagWarningContext_); } else { - context.Warn(warning, "division by zero on %s"_warn_en_US, operation); + Warn(warning, "division by zero on %s%s"_warn_en_US, operation, + realFlagWarningContext_); } } if (flags.test(RealFlag::InvalidArgument)) { - context.Warn(warning, "invalid argument on %s"_warn_en_US, operation); + Warn(warning, "invalid argument on %s%s"_warn_en_US, operation, + realFlagWarningContext_); } if (flags.test(RealFlag::Underflow)) { - context.Warn(warning, "underflow on %s"_warn_en_US, operation); + 
Warn(warning, "underflow on %s%s"_warn_en_US, operation, + realFlagWarningContext_); } } diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h index 3fdf3a6f38848..52ea627d0bbe4 100644 --- a/flang/lib/Evaluate/fold-implementation.h +++ b/flang/lib/Evaluate/fold-implementation.h @@ -1862,7 +1862,7 @@ Expr FoldOperation( std::snprintf(buffer, sizeof buffer, "INTEGER(%d) to REAL(%d) conversion", Operand::kind, TO::kind); - RealFlagWarnings(ctx, converted.flags, buffer); + ctx.RealFlagWarnings(converted.flags, buffer); } return ScalarConstantToExpr(std::move(converted.value)); } else if constexpr (FromCat == TypeCategory::Real) { @@ -1871,7 +1871,7 @@ Expr FoldOperation( if (!converted.flags.empty()) { std::snprintf(buffer, sizeof buffer, "REAL(%d) to REAL(%d) conversion", Operand::kind, TO::kind); - RealFlagWarnings(ctx, converted.flags, buffer); + ctx.RealFlagWarnings(converted.flags, buffer); } if (ctx.targetCharacteristics().areSubnormalsFlushedToZero()) { converted.value = converted.value.FlushSubnormalToZero(); @@ -2012,7 +2012,7 @@ Expr FoldOperation(FoldingContext &context, Add &&x) { } else { auto sum{folded->first.Add( folded->second, context.targetCharacteristics().roundingMode())}; - RealFlagWarnings(context, sum.flags, "addition"); + context.RealFlagWarnings(sum.flags, "addition"); if (context.targetCharacteristics().areSubnormalsFlushedToZero()) { sum.value = sum.value.FlushSubnormalToZero(); } @@ -2041,7 +2041,7 @@ Expr FoldOperation(FoldingContext &context, Subtract &&x) { } else { auto difference{folded->first.Subtract( folded->second, context.targetCharacteristics().roundingMode())}; - RealFlagWarnings(context, difference.flags, "subtraction"); + context.RealFlagWarnings(difference.flags, "subtraction"); if (context.targetCharacteristics().areSubnormalsFlushedToZero()) { difference.value = difference.value.FlushSubnormalToZero(); } @@ -2070,7 +2070,7 @@ Expr FoldOperation(FoldingContext &context, Multiply 
&&x) { } else { auto product{folded->first.Multiply( folded->second, context.targetCharacteristics().roundingMode())}; - RealFlagWarnings(context, product.flags, "multiplication"); + context.RealFlagWarnings(product.flags, "multiplication"); if (context.targetCharacteristics().areSubnormalsFlushedToZero()) { product.value = product.value.FlushSubnormalToZero(); } @@ -2141,7 +2141,7 @@ Expr FoldOperation(FoldingContext &context, Divide &&x) { } } if (!isCanonicalNaNOrInf) { - RealFlagWarnings(context, quotient.flags, "division"); + context.RealFlagWarnings(quotient.flags, "division"); } if (context.targetCharacteristics().areSubnormalsFlushedToZero()) { quotient.value = quotient.value.FlushSubnormalToZero(); @@ -2201,7 +2201,7 @@ Expr FoldOperation(FoldingContext &context, RealToIntPower &&x) { [&](auto &y) -> Expr { if (auto folded{OperandsAreConstants(x.left(), y)}) { auto power{evaluate::IntPower(folded->first, folded->second)}; - RealFlagWarnings(context, power.flags, "power with INTEGER exponent"); + context.RealFlagWarnings(power.flags, "power with INTEGER exponent"); if (context.targetCharacteristics().areSubnormalsFlushedToZero()) { power.value = power.value.FlushSubnormalToZero(); } diff --git a/flang/lib/Evaluate/host.cpp b/flang/lib/Evaluate/host.cpp index 25409ac3418b8..bf0249647162a 100644 --- a/flang/lib/Evaluate/host.cpp +++ b/flang/lib/Evaluate/host.cpp @@ -140,8 +140,8 @@ void HostFloatingPointEnvironment::CheckAndRestoreFloatingPointEnvironment( } if (!flags_.empty()) { - RealFlagWarnings( - context, flags_, "evaluation of intrinsic function or operation"); + context.RealFlagWarnings( + flags_, "evaluation of intrinsic function or operation"); } errno = 0; if (fesetenv(&originalFenv_) != 0) { diff --git a/flang/lib/Evaluate/intrinsics-library.cpp b/flang/lib/Evaluate/intrinsics-library.cpp index 9820aa3d2ea3d..d8af5246fabdd 100644 --- a/flang/lib/Evaluate/intrinsics-library.cpp +++ b/flang/lib/Evaluate/intrinsics-library.cpp @@ -1043,7 +1043,7 @@ 
std::optional GetHostRuntimeWrapper(const std::string &name, if (const auto *hostFunction{ SearchHostRuntime(name, biggerResultType, biggerArgTypes)}) { auto hostFolderWithChecks{AddArgumentVerifierIfAny(name, *hostFunction)}; - return [hostFunction, resultType, hostFolderWithChecks]( + return [hostFunction, resultType, hostFolderWithChecks, name]( FoldingContext &context, std::vector> &&args) { auto nArgs{args.size()}; for (size_t i{0}; i < nArgs; ++i) { @@ -1051,6 +1051,8 @@ std::optional GetHostRuntimeWrapper(const std::string &name, ConvertToType(hostFunction->argumentTypes[i], std::move(args[i])) .value()); } + auto restorer{context.SetRealFlagWarningContext( + " after folding a call to '"s + name + "'"s)}; return Fold(context, ConvertToType( resultType, hostFolderWithChecks(context, std::move(args))) diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 32aa6b1e0aa1d..c8167fd34f666 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -834,7 +834,7 @@ Constant ReadRealLiteral( auto valWithFlags{ Scalar::Read(p, context.targetCharacteristics().roundingMode())}; CHECK(p == source.end()); - RealFlagWarnings(context, valWithFlags.flags, "conversion of REAL literal"); + context.RealFlagWarnings(valWithFlags.flags, "conversion of REAL literal"); auto value{valWithFlags.value}; if (context.targetCharacteristics().areSubnormalsFlushedToZero()) { value = value.FlushSubnormalToZero(); diff --git a/flang/test/Evaluate/folding33.f90 b/flang/test/Evaluate/folding33.f90 new file mode 100644 index 0000000000000..fb5a23cf1f209 --- /dev/null +++ b/flang/test/Evaluate/folding33.f90 @@ -0,0 +1,4 @@ +!RUN: %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s +!CHECK: warning: overflow on REAL(4) to REAL(2) conversion after folding a call to 'exp' [-Wfolding-exception] +print *, exp((11.265625_2,1._2)) +end From 57accd6d7a2562260f03ff273a6355e3fa1211c5 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Fri, 31 
Oct 2025 10:27:30 -0700 Subject: [PATCH 373/539] [flang] Don't allow function calls to PROCEDURE() (#165786) PROCEDURE() declares a procedure with no interface or result type. (When used to declare a derived type component, it must also be a NOPASS POINTER.) Document that we allow such procedures to be called as subroutines with implicit interfaces, despite the ISO standard -- this is a universal extension to the language. However, no longer allow such procedure entities to be referenced as implicitly-typed functions -- this usage is neither portable nor well-defined, as the compilers that do allow it do not respect the implicit typing rules established at the point of declaration. --- flang/docs/Extensions.md | 8 +++++++- flang/lib/Semantics/resolve-names.cpp | 19 ++++++++++++------- flang/test/Semantics/resolve09.f90 | 8 ++++---- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 6d872094811e3..c9cc02703fbc8 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -182,6 +182,13 @@ end Note that internally the main program symbol name is all uppercase, unlike the names of all other symbols, which are usually all lowercase. This may make a difference in testing/debugging. +* A `PROCEDURE()` with no interface name or type may be called as an + subroutine with an implicit interface, F'2023 15.4.3.6 paragraph 4 and + C1525 notwithstanding. + This is a universally portable feature, and it also applies to + `PROCEDURE(), POINTER, NOPASS` derived type components. + Such procedures may *not* be referenced as implicitly typed functions + without first being associated with a function pointer. ## Extensions, deletions, and legacy features supported by default @@ -954,4 +961,3 @@ print *, [(j,j=1,10)] "&GRP A(1:)=1. 2. 3./". This extension is necessarily disabled when the type of the array has an accessible defined formatted READ subroutine. 
- diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index f88af5fac0bbd..220f1c96b9823 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -9435,13 +9435,18 @@ bool ResolveNamesVisitor::SetProcFlag( SayWithDecl(name, symbol, "Implicit declaration of function '%s' has a different result type than in previous declaration"_err_en_US); return false; - } else if (symbol.has()) { - symbol.set(flag); // in case it hasn't been set yet - if (flag == Symbol::Flag::Function) { - ApplyImplicitRules(symbol); - } - if (symbol.attrs().test(Attr::INTRINSIC)) { - AcquireIntrinsicProcedureFlags(symbol); + } else if (const auto *proc{symbol.detailsIf()}) { + if (IsPointer(symbol) && !proc->type() && !proc->procInterface()) { + // PROCEDURE(), POINTER -- errors will be emitted later about a lack + // of known characteristics if used as a function + } else { + symbol.set(flag); // in case it hasn't been set yet + if (flag == Symbol::Flag::Function) { + ApplyImplicitRules(symbol); + } + if (symbol.attrs().test(Attr::INTRINSIC)) { + AcquireIntrinsicProcedureFlags(symbol); + } } } else if (symbol.GetType() && flag == Symbol::Flag::Subroutine) { SayWithDecl( diff --git a/flang/test/Semantics/resolve09.f90 b/flang/test/Semantics/resolve09.f90 index 2fe21aebf66bd..3384b05bf8f27 100644 --- a/flang/test/Semantics/resolve09.f90 +++ b/flang/test/Semantics/resolve09.f90 @@ -140,11 +140,11 @@ subroutine s9 procedure(), nopass, pointer :: p1, p2 end type type(t) x + !ERROR: Function result characteristics are not known print *, x%p1() - call x%p2 - !ERROR: Cannot call function 'p1' like a subroutine - call x%p1 - !ERROR: Cannot call subroutine 'p2' like a function + call x%p2 ! ok + call x%p1 ! 
ok + !ERROR: Function result characteristics are not known print *, x%p2() end subroutine From ea6ed8c41f6fc1d7549ca3f74b44c97a61e6da3d Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Fri, 31 Oct 2025 10:28:04 -0700 Subject: [PATCH 374/539] [flang] Treat conditional comments as comments (#165881) An OpenMP, OpenACC, or CUDA conditional line should be treated as a comment when that's what its payload contains, not as a conditional source line that will confuse the parser when it is indeed just a comment. --- flang/lib/Parser/prescan.cpp | 24 +++++++++++++++++++++-- flang/test/Preprocessing/bug136845.F | 1 - flang/test/Preprocessing/cond-comment.f | 5 +++++ flang/test/Preprocessing/cond-comment.f90 | 5 +++++ 4 files changed, 32 insertions(+), 3 deletions(-) create mode 100644 flang/test/Preprocessing/cond-comment.f create mode 100644 flang/test/Preprocessing/cond-comment.f90 diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index fd69404f313d3..efce8fc3d2e35 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -1642,6 +1642,17 @@ Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const { // This is a Continuation line, not an initial directive line. return std::nullopt; } + ++column, ++p; + } + if (isOpenMPConditional) { + for (; column <= fixedFormColumnLimit_; ++column, ++p) { + if (IsSpaceOrTab(p)) { + } else if (*p == '!') { + return std::nullopt; // !$ ! 
is a comment, not a directive + } else { + break; + } + } } if (const char *ss{IsCompilerDirectiveSentinel( sentinel, static_cast(sp - sentinel))}) { @@ -1657,8 +1668,17 @@ Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const { p && *p++ == '!') { if (auto maybePair{IsCompilerDirectiveSentinel(p)}) { auto offset{static_cast(p - start - 1)}; - return {LineClassification{LineClassification::Kind::CompilerDirective, - offset, maybePair->first}}; + const char *sentinel{maybePair->first}; + if ((sentinel[0] == '$' && sentinel[1] == '\0') || sentinel[1] == '@') { + if (const char *comment{IsFreeFormComment(maybePair->second)}) { + if (*comment == '!') { + // Conditional line comment - treat as comment + return std::nullopt; + } + } + } + return {LineClassification{ + LineClassification::Kind::CompilerDirective, offset, sentinel}}; } } return std::nullopt; diff --git a/flang/test/Preprocessing/bug136845.F b/flang/test/Preprocessing/bug136845.F index ce52c2953bb57..311ee0a2d874c 100644 --- a/flang/test/Preprocessing/bug136845.F +++ b/flang/test/Preprocessing/bug136845.F @@ -18,7 +18,6 @@ *$1 continue end -!PREPRO:!$ & !PREPRO: continue !PREPRO: k=0 !PREPRO: k=0 diff --git a/flang/test/Preprocessing/cond-comment.f b/flang/test/Preprocessing/cond-comment.f new file mode 100644 index 0000000000000..a484fcbfa8fa7 --- /dev/null +++ b/flang/test/Preprocessing/cond-comment.f @@ -0,0 +1,5 @@ +!RUN: %flang_fc1 -fopenmp -fdebug-unparse %s 2>&1 | FileCheck %s +!CHECK: END +!CHECK-NOT: error: + end +c$ ! diff --git a/flang/test/Preprocessing/cond-comment.f90 b/flang/test/Preprocessing/cond-comment.f90 new file mode 100644 index 0000000000000..457614ae9372e --- /dev/null +++ b/flang/test/Preprocessing/cond-comment.f90 @@ -0,0 +1,5 @@ +!RUN: %flang_fc1 -fopenmp -fdebug-unparse %s 2>&1 | FileCheck %s +!CHECK: END +!CHECK-NOT: error: +end +!$ ! 
From 95b250f6a6a319b15b189de54990e73050d7e291 Mon Sep 17 00:00:00 2001 From: Mariusz Borsa Date: Fri, 31 Oct 2025 10:45:36 -0700 Subject: [PATCH 375/539] Revert "[Sanitizers][Test] XFAIL suppressions/fread_fwrite (#154189)" (#165751) The macOS platform where test failures occurred was updated to a newer version - these tests now pass, so undoing XFAIL rdar://163149340 This reverts commit 4dc32df3ca0a937ffb6052a40170fcc318330fd9. Co-authored-by: Mariusz Borsa --- .../test/asan/TestCases/Darwin/suppressions-sandbox.cpp | 3 --- compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp | 3 --- 2 files changed, 6 deletions(-) diff --git a/compiler-rt/test/asan/TestCases/Darwin/suppressions-sandbox.cpp b/compiler-rt/test/asan/TestCases/Darwin/suppressions-sandbox.cpp index f12e2b2ada50d..651d0c5d05b07 100644 --- a/compiler-rt/test/asan/TestCases/Darwin/suppressions-sandbox.cpp +++ b/compiler-rt/test/asan/TestCases/Darwin/suppressions-sandbox.cpp @@ -15,9 +15,6 @@ // sandbox-exec isn't available on iOS // UNSUPPORTED: ios -// Symbolizer fails to find test functions on current macOS bot version -// XFAIL: system-darwin && target=arm{{.*}} - #include #if defined(SHARED_LIB) diff --git a/compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp b/compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp index c7b9280ea7d8e..c0629260418a3 100644 --- a/compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp +++ b/compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp @@ -2,9 +2,6 @@ // RUN: not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-FWRITE // RUN: not %run %t 1 2>&1 | FileCheck %s --check-prefix=CHECK-FREAD -// Symbolizer fails to find test functions on current macOS bot version -// XFAIL: system-darwin && target=arm{{.*}} - #include #include From a796f36b24631e02b635ced82d04180633b8ad03 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Fri, 31 Oct 2025 10:58:22 -0700 Subject: [PATCH 376/539] [AMDGPU] Reset VGPR MSBs at the end of fallthrough basic block (#164901) By
convention a basic block shall start with MSBs zero. We also need to know a previous mode in all cases as SWDEV-562450 asks to record the old mode in the high bits of the mode. --- .../Target/AMDGPU/AMDGPULowerVGPREncoding.cpp | 72 +++++++++---------- .../CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir | 9 ++- 2 files changed, 39 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp index 1e6589eb42c15..9b932273b2216 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp @@ -82,12 +82,12 @@ class AMDGPULowerVGPREncoding { const SIInstrInfo *TII; const SIRegisterInfo *TRI; + // Current basic block. + MachineBasicBlock *MBB; + /// Most recent s_set_* instruction. MachineInstr *MostRecentModeSet; - /// Whether the current mode is known. - bool CurrentModeKnown; - /// Current mode bits. ModeTy CurrentMode; @@ -108,10 +108,13 @@ class AMDGPULowerVGPREncoding { MachineInstr *Clause; /// Insert mode change before \p I. \returns true if mode was changed. - bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I); + bool setMode(ModeTy NewMode, ModeTy Mask, + MachineBasicBlock::instr_iterator I); /// Reset mode to default. - void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); } + void resetMode(MachineBasicBlock::instr_iterator I) { + setMode(ModeTy(), ModeTy::fullMask(), I); + } /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt. std::optional getMSBs(const MachineOperand &MO) const; @@ -130,38 +133,35 @@ class AMDGPULowerVGPREncoding { /// Check if an instruction \p I is within a clause and returns a suitable /// iterator to insert mode change. It may also modify the S_CLAUSE /// instruction to extend it or drop the clause if it cannot be adjusted. 
- MachineInstr *handleClause(MachineInstr *I); + MachineBasicBlock::instr_iterator + handleClause(MachineBasicBlock::instr_iterator I); }; bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask, - MachineInstr *I) { + MachineBasicBlock::instr_iterator I) { assert((NewMode.raw_bits() & ~Mask.raw_bits()).none()); - if (CurrentModeKnown) { - auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits(); + auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits(); - if ((Delta & Mask.raw_bits()).none()) { - CurrentMask |= Mask; - return false; - } + if ((Delta & Mask.raw_bits()).none()) { + CurrentMask |= Mask; + return false; + } - if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) { - CurrentMode |= NewMode; - CurrentMask |= Mask; + if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) { + CurrentMode |= NewMode; + CurrentMask |= Mask; - MostRecentModeSet->getOperand(0).setImm(CurrentMode); - return true; - } + MostRecentModeSet->getOperand(0).setImm(CurrentMode); + return true; } I = handleClause(I); MostRecentModeSet = - BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) - .addImm(NewMode); + BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)).addImm(NewMode); CurrentMode = NewMode; CurrentMask = Mask; - CurrentModeKnown = true; return true; } @@ -233,21 +233,22 @@ bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) { if (Ops.first) { ModeTy NewMode, Mask; computeMode(NewMode, Mask, MI, Ops.first, Ops.second); - return setMode(NewMode, Mask, &MI); + return setMode(NewMode, Mask, MI.getIterator()); } assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo()); return false; } -MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) { +MachineBasicBlock::instr_iterator +AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) { if (!ClauseRemaining) return I; // A clause cannot start with a special instruction, place it right before // the clause. 
if (ClauseRemaining == ClauseLen) { - I = Clause->getPrevNode(); + I = Clause->getPrevNode()->getIterator(); assert(I->isBundle()); return I; } @@ -284,9 +285,9 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { ClauseLen = ClauseRemaining = 0; CurrentMode.reset(); CurrentMask.reset(); - CurrentModeKnown = true; for (auto &MBB : MF) { MostRecentModeSet = nullptr; + this->MBB = &MBB; for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) { if (MI.isMetaInstruction()) @@ -294,17 +295,16 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { if (MI.isTerminator() || MI.isCall()) { if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { + MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) CurrentMode.reset(); - CurrentModeKnown = true; - } else - resetMode(&MI); + else + resetMode(MI.getIterator()); continue; } if (MI.isInlineAsm()) { if (TII->hasVGPRUses(MI)) - resetMode(&MI); + resetMode(MI.getIterator()); continue; } @@ -323,14 +323,8 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { --ClauseRemaining; } - // If we're falling through to a block that has at least one other - // predecessor, we no longer know the mode. - MachineBasicBlock *Next = MBB.getNextNode(); - if (Next && Next->pred_size() >= 2 && - llvm::is_contained(Next->predecessors(), &MBB)) { - if (CurrentMode.raw_bits().any()) - CurrentModeKnown = false; - } + // Reset the mode if we are falling through. 
+ resetMode(MBB.instr_end()); } return Changed; diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir index f508df2292e90..41a7b82913bb0 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir @@ -478,16 +478,18 @@ body: | ; ASM: .LBB{{.*_1}}: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 + ; GCN-NEXT: s_set_vgpr_msb 0 $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec - ; No mode switch on fall through + ; Reset on fallthrough block end bb.2: ; ASM-NEXT: %bb.2: - ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: s_set_vgpr_msb 64 + ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 ; GCN-NEXT: s_set_vgpr_msb 0 ; GCN-NEXT: s_branch - S_NOP 0 + $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec S_BRANCH %bb.3 ; Reset mode on terminator @@ -574,6 +576,7 @@ body: | ; ASM: %bb.0: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0 + ; GCN-NEXT: s_set_vgpr_msb 0 $vgpr256 = V_MOV_B32_e32 undef $vgpr0, implicit $exec bb.1: From 36144796a94c60bbaf635746d8f008971e869584 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 31 Oct 2025 11:03:39 -0700 Subject: [PATCH 377/539] =?UTF-8?q?Revert=20"Reland=20"[lldb-dap]=20Improv?= =?UTF-8?q?ing=20consistency=20of=20tests=20by=20removing=E2=80=A6=20(#165?= =?UTF-8?q?892)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … concurrency." (#165688)"" This reverts commit 17dbd8690e36f8e514fb47f4418f78420d0fc019. This was causing timeouts on the premerge runners. Reverting for now until the timeouts trigger within lit and/or we have a better testing strategy for this. 
--- .../test/tools/lldb-dap/dap_server.py | 206 +++++++++++------- .../test/tools/lldb-dap/lldbdap_testcase.py | 4 +- .../TestDAP_breakpointEvents.py | 30 +-- .../tools/lldb-dap/launch/TestDAP_launch.py | 2 +- .../module-event/TestDAP_module_event.py | 88 ++++---- .../tools/lldb-dap/module/TestDAP_module.py | 8 +- .../restart/TestDAP_restart_console.py | 24 +- .../lldb-dap/send-event/TestDAP_sendEvent.py | 2 +- 8 files changed, 203 insertions(+), 161 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index 8f3652172dfdf..d892c01f0bc71 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -10,8 +10,8 @@ import subprocess import signal import sys +import threading import warnings -import selectors import time from typing import ( Any, @@ -139,6 +139,35 @@ def dump_memory(base_addr, data, num_per_line, outfile): outfile.write("\n") +def read_packet( + f: IO[bytes], trace_file: Optional[IO[str]] = None +) -> Optional[ProtocolMessage]: + """Decode a JSON packet that starts with the content length and is + followed by the JSON bytes from a file 'f'. Returns None on EOF. + """ + line = f.readline().decode("utf-8") + if len(line) == 0: + return None # EOF. 
+ + # Watch for line that starts with the prefix + prefix = "Content-Length: " + if line.startswith(prefix): + # Decode length of JSON bytes + length = int(line[len(prefix) :]) + # Skip empty line + separator = f.readline().decode() + if separator != "": + Exception("malformed DAP content header, unexpected line: " + separator) + # Read JSON bytes + json_str = f.read(length).decode() + if trace_file: + trace_file.write("from adapter:\n%s\n" % (json_str)) + # Decode the JSON bytes into a python dictionary + return json.loads(json_str) + + raise Exception("unexpected malformed message from lldb-dap: " + line) + + def packet_type_is(packet, packet_type): return "type" in packet and packet["type"] == packet_type @@ -170,8 +199,16 @@ def __init__( self.log_file = log_file self.send = send self.recv = recv - self.selector = selectors.DefaultSelector() - self.selector.register(recv, selectors.EVENT_READ) + + # Packets that have been received and processed but have not yet been + # requested by a test case. + self._pending_packets: List[Optional[ProtocolMessage]] = [] + # Received packets that have not yet been processed. + self._recv_packets: List[Optional[ProtocolMessage]] = [] + # Used as a mutex for _recv_packets and for notify when _recv_packets + # changes. 
+ self._recv_condition = threading.Condition() + self._recv_thread = threading.Thread(target=self._read_packet_thread) # session state self.init_commands = init_commands @@ -197,6 +234,9 @@ def __init__( # keyed by breakpoint id self.resolved_breakpoints: dict[str, Breakpoint] = {} + # trigger enqueue thread + self._recv_thread.start() + @classmethod def encode_content(cls, s: str) -> bytes: return ("Content-Length: %u\r\n\r\n%s" % (len(s), s)).encode("utf-8") @@ -212,46 +252,17 @@ def validate_response(cls, command, response): f"seq mismatch in response {command['seq']} != {response['request_seq']}" ) - def _read_packet( - self, - timeout: float = DEFAULT_TIMEOUT, - ) -> Optional[ProtocolMessage]: - """Decode a JSON packet that starts with the content length and is - followed by the JSON bytes from self.recv. Returns None on EOF. - """ - - ready = self.selector.select(timeout) - if not ready: - warnings.warn( - "timeout occurred waiting for a packet, check if the test has a" - " negative assertion and see if it can be inverted.", - stacklevel=4, - ) - return None # timeout - - line = self.recv.readline().decode("utf-8") - if len(line) == 0: - return None # EOF. - - # Watch for line that starts with the prefix - prefix = "Content-Length: " - if line.startswith(prefix): - # Decode length of JSON bytes - length = int(line[len(prefix) :]) - # Skip empty line - separator = self.recv.readline().decode() - if separator != "": - Exception("malformed DAP content header, unexpected line: " + separator) - # Read JSON bytes - json_str = self.recv.read(length).decode() - if self.trace_file: - self.trace_file.write( - "%s from adapter:\n%s\n" % (time.time(), json_str) - ) - # Decode the JSON bytes into a python dictionary - return json.loads(json_str) - - raise Exception("unexpected malformed message from lldb-dap: " + line) + def _read_packet_thread(self): + try: + while True: + packet = read_packet(self.recv, trace_file=self.trace_file) + # `packet` will be `None` on EOF. 
We want to pass it down to + # handle_recv_packet anyway so the main thread can handle unexpected + # termination of lldb-dap and stop waiting for new packets. + if not self._handle_recv_packet(packet): + break + finally: + dump_dap_log(self.log_file) def get_modules( self, start_module: Optional[int] = None, module_count: Optional[int] = None @@ -299,6 +310,34 @@ def collect_output( output += self.get_output(category, clear=clear) return output + def _enqueue_recv_packet(self, packet: Optional[ProtocolMessage]): + with self.recv_condition: + self.recv_packets.append(packet) + self.recv_condition.notify() + + def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool: + """Handles an incoming packet. + + Called by the read thread that is waiting for all incoming packets + to store the incoming packet in "self._recv_packets" in a thread safe + way. This function will then signal the "self._recv_condition" to + indicate a new packet is available. + + Args: + packet: A new packet to store. + + Returns: + True if the caller should keep calling this function for more + packets. + """ + with self._recv_condition: + self._recv_packets.append(packet) + self._recv_condition.notify() + # packet is None on EOF + return packet is not None and not ( + packet["type"] == "response" and packet["command"] == "disconnect" + ) + def _recv_packet( self, *, @@ -322,34 +361,46 @@ def _recv_packet( The first matching packet for the given predicate, if specified, otherwise None. 
""" - deadline = time.time() + timeout - - while time.time() < deadline: - packet = self._read_packet(timeout=deadline - time.time()) - if packet is None: - return None - self._process_recv_packet(packet) - if not predicate or predicate(packet): - return packet - - def _process_recv_packet(self, packet) -> None: + assert ( + threading.current_thread != self._recv_thread + ), "Must not be called from the _recv_thread" + + def process_until_match(): + self._process_recv_packets() + for i, packet in enumerate(self._pending_packets): + if packet is None: + # We need to return a truthy value to break out of the + # wait_for, use `EOFError` as an indicator of EOF. + return EOFError() + if predicate and predicate(packet): + self._pending_packets.pop(i) + return packet + + with self._recv_condition: + packet = self._recv_condition.wait_for(process_until_match, timeout) + return None if isinstance(packet, EOFError) else packet + + def _process_recv_packets(self) -> None: """Process received packets, updating the session state.""" - if packet and ("seq" not in packet or packet["seq"] == 0): - warnings.warn( - f"received a malformed packet, expected 'seq != 0' for {packet!r}" - ) - # Handle events that may modify any stateful properties of - # the DAP session. - if packet and packet["type"] == "event": - self._handle_event(packet) - elif packet and packet["type"] == "request": - # Handle reverse requests and keep processing. - self._handle_reverse_request(packet) + with self._recv_condition: + for packet in self._recv_packets: + if packet and ("seq" not in packet or packet["seq"] == 0): + warnings.warn( + f"received a malformed packet, expected 'seq != 0' for {packet!r}" + ) + # Handle events that may modify any stateful properties of + # the DAP session. + if packet and packet["type"] == "event": + self._handle_event(packet) + elif packet and packet["type"] == "request": + # Handle reverse requests and keep processing. 
+ self._handle_reverse_request(packet) + # Move the packet to the pending queue. + self._pending_packets.append(packet) + self._recv_packets.clear() def _handle_event(self, packet: Event) -> None: """Handle any events that modify debug session state we track.""" - self.events.append(packet) - event = packet["event"] body: Optional[Dict] = packet.get("body", None) @@ -402,8 +453,6 @@ def _handle_event(self, packet: Event) -> None: self.invalidated_event = packet elif event == "memory": self.memory_event = packet - elif event == "module": - self.module_events.append(packet) def _handle_reverse_request(self, request: Request) -> None: if request in self.reverse_requests: @@ -472,14 +521,18 @@ def send_packet(self, packet: ProtocolMessage) -> int: Returns the seq number of the request. """ - packet["seq"] = self.sequence - self.sequence += 1 + # Set the seq for requests. + if packet["type"] == "request": + packet["seq"] = self.sequence + self.sequence += 1 + else: + packet["seq"] = 0 # Encode our command dictionary as a JSON string json_str = json.dumps(packet, separators=(",", ":")) if self.trace_file: - self.trace_file.write("%s to adapter:\n%s\n" % (time.time(), json_str)) + self.trace_file.write("to adapter:\n%s\n" % (json_str)) length = len(json_str) if length > 0: @@ -860,8 +913,6 @@ def request_restart(self, restartArguments=None): if restartArguments: command_dict["arguments"] = restartArguments - # Clear state, the process is about to restart... - self._process_continued(True) response = self._send_recv(command_dict) # Caller must still call wait_for_stopped. 
return response @@ -1428,10 +1479,8 @@ def request_testGetTargetBreakpoints(self): def terminate(self): self.send.close() - self.recv.close() - self.selector.close() - if self.log_file: - dump_dap_log(self.log_file) + if self._recv_thread.is_alive(): + self._recv_thread.join() def request_setInstructionBreakpoints(self, memory_reference=[]): breakpoints = [] @@ -1528,7 +1577,6 @@ def launch( stdout=subprocess.PIPE, stderr=sys.stderr, env=adapter_env, - bufsize=0, ) if connection is None: diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index 405e91fc2dc36..29935bb8046ff 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -15,8 +15,6 @@ # DAP tests as a whole have been flakey on the Windows on Arm bot. See: # https://github.com/llvm/llvm-project/issues/137660 @skipIf(oslist=["windows"], archs=["aarch64"]) -# The Arm Linux bot needs stable resources before it can run these tests reliably. -@skipIf(oslist=["linux"], archs=["arm$"]) class DAPTestCaseBase(TestBase): # set timeout based on whether ASAN was enabled or not. Increase # timeout by a factor of 10 if ASAN is enabled. 
@@ -418,7 +416,7 @@ def continue_to_next_stop(self): return self.dap_server.wait_for_stopped() def continue_to_breakpoint(self, breakpoint_id: str): - self.continue_to_breakpoints([breakpoint_id]) + self.continue_to_breakpoints((breakpoint_id)) def continue_to_breakpoints(self, breakpoint_ids): self.do_continue() diff --git a/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py b/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py index 7b78541fb4f8e..beab4d6c1f5a6 100644 --- a/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py +++ b/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py @@ -81,20 +81,24 @@ def test_breakpoint_events(self): breakpoint["verified"], "expect foo breakpoint to not be verified" ) + # Flush the breakpoint events. + self.dap_server.wait_for_breakpoint_events() + # Continue to the breakpoint - self.continue_to_breakpoint(foo_bp_id) - self.continue_to_next_stop() # foo_bp2 - self.continue_to_breakpoint(main_bp_id) - self.continue_to_exit() + self.continue_to_breakpoints(dap_breakpoint_ids) - bp_events = [e for e in self.dap_server.events if e["event"] == "breakpoint"] + verified_breakpoint_ids = [] + unverified_breakpoint_ids = [] + for breakpoint_event in self.dap_server.wait_for_breakpoint_events(): + breakpoint = breakpoint_event["body"]["breakpoint"] + id = breakpoint["id"] + if breakpoint["verified"]: + verified_breakpoint_ids.append(id) + else: + unverified_breakpoint_ids.append(id) - main_bp_events = [ - e for e in bp_events if e["body"]["breakpoint"]["id"] == main_bp_id - ] - foo_bp_events = [ - e for e in bp_events if e["body"]["breakpoint"]["id"] == foo_bp_id - ] + self.assertIn(main_bp_id, unverified_breakpoint_ids) + self.assertIn(foo_bp_id, unverified_breakpoint_ids) - self.assertTrue(main_bp_events) - self.assertTrue(foo_bp_events) + self.assertIn(main_bp_id, verified_breakpoint_ids) + self.assertIn(foo_bp_id, verified_breakpoint_ids) diff 
--git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py index 09b13223e0a78..ca881f1d817c5 100644 --- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py +++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py @@ -156,7 +156,6 @@ def test_debuggerRoot(self): self.build_and_launch( program, debuggerRoot=program_parent_dir, initCommands=commands ) - self.continue_to_exit() output = self.get_console() self.assertTrue(output and len(output) > 0, "expect console output") lines = output.splitlines() @@ -172,6 +171,7 @@ def test_debuggerRoot(self): % (program_parent_dir, line[len(prefix) :]), ) self.assertTrue(found, "verified lldb-dap working directory") + self.continue_to_exit() def test_sourcePath(self): """ diff --git a/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py b/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py index 9d1d17b704f76..1f4afabbd161e 100644 --- a/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py +++ b/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py @@ -1,58 +1,58 @@ -""" -Test 'module' events for dynamically loaded libraries. 
-""" - +import dap_server from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil import lldbdap_testcase +import re class TestDAP_module_event(lldbdap_testcase.DAPTestCaseBase): - def lookup_module_id(self, name): - """Returns the identifier for the first module event starting with the given name.""" - for event in self.dap_server.module_events: - if self.get_dict_value(event, ["body", "module", "name"]).startswith(name): - return self.get_dict_value(event, ["body", "module", "id"]) - self.fail(f"No module events matching name={name}") - - def module_events(self, id): - """Finds all module events by identifier.""" - return [ - event - for event in self.dap_server.module_events - if self.get_dict_value(event, ["body", "module", "id"]) == id - ] - - def module_reasons(self, events): - """Returns the list of 'reason' values from the given events.""" - return [event["body"]["reason"] for event in events] - @skipIfWindows def test_module_event(self): - """ - Test that module events are fired on target load and when the list of - dynamic libraries updates while running. - """ program = self.getBuildArtifact("a.out") self.build_and_launch(program) - # We can analyze the order of events after the process exits. - self.continue_to_exit() - a_out_id = self.lookup_module_id("a.out") - a_out_events = self.module_events(id=a_out_id) + source = "main.cpp" + breakpoint1_line = line_number(source, "// breakpoint 1") + breakpoint2_line = line_number(source, "// breakpoint 2") + breakpoint3_line = line_number(source, "// breakpoint 3") - self.assertIn( - "new", - self.module_reasons(a_out_events), - "Expected a.out to load during the debug session.", + breakpoint_ids = self.set_source_breakpoints( + source, [breakpoint1_line, breakpoint2_line, breakpoint3_line] ) + self.continue_to_breakpoints(breakpoint_ids) + + # We're now stopped at breakpoint 1 before the dlopen. Flush all the module events. 
+ event = self.dap_server.wait_for_event(["module"]) + while event is not None: + event = self.dap_server.wait_for_event(["module"]) + + # Continue to the second breakpoint, before the dlclose. + self.continue_to_breakpoints(breakpoint_ids) + + # Make sure we got a module event for libother. + event = self.dap_server.wait_for_event(["module"]) + self.assertIsNotNone(event, "didn't get a module event") + module_name = event["body"]["module"]["name"] + module_id = event["body"]["module"]["id"] + self.assertEqual(event["body"]["reason"], "new") + self.assertIn("libother", module_name) + + # Continue to the third breakpoint, after the dlclose. + self.continue_to_breakpoints(breakpoint_ids) + + # Make sure we got a module event for libother. + event = self.dap_server.wait_for_event(["module"]) + self.assertIsNotNone(event, "didn't get a module event") + reason = event["body"]["reason"] + self.assertEqual(reason, "removed") + self.assertEqual(event["body"]["module"]["id"], module_id) + + # The removed module event should omit everything but the module id and name + # as they are required fields. + module_data = event["body"]["module"] + required_keys = ["id", "name"] + self.assertListEqual(list(module_data.keys()), required_keys) + self.assertEqual(module_data["name"], "", "expects empty name.") - libother_id = self.lookup_module_id( - "libother." # libother.so or libother.dylib based on OS. 
- ) - libother_events = self.module_events(id=libother_id) - self.assertEqual( - self.module_reasons(libother_events), - ["new", "removed"], - "Expected libother to be loaded then unloaded during the debug session.", - ) + self.continue_to_exit() diff --git a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py index 2d00c512721c6..0ed53dac5d869 100644 --- a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py +++ b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py @@ -64,18 +64,19 @@ def check_symbols_loaded_with_size(): self.assertEqual(program, program_module["path"]) self.assertIn("addressRange", program_module) - self.continue_to_exit() - # Collect all the module names we saw as events. module_new_names = [] module_changed_names = [] - for module_event in self.dap_server.module_events: + module_event = self.dap_server.wait_for_event(["module"]) + while module_event is not None: reason = module_event["body"]["reason"] if reason == "new": module_new_names.append(module_event["body"]["module"]["name"]) elif reason == "changed": module_changed_names.append(module_event["body"]["module"]["name"]) + module_event = self.dap_server.wait_for_event(["module"]) + # Make sure we got an event for every active module. self.assertNotEqual(len(module_new_names), 0) for module in active_modules: @@ -85,6 +86,7 @@ def check_symbols_loaded_with_size(): # symbols got added. 
self.assertNotEqual(len(module_changed_names), 0) self.assertIn(program_module["name"], module_changed_names) + self.continue_to_exit() @skipIfWindows def test_modules(self): diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py index fa62ec243f5c5..e1ad1425a993d 100644 --- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py +++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py @@ -30,11 +30,7 @@ def verify_stopped_on_entry(self, stopped_events: List[Dict[str, Any]]): if reason == "entry": seen_stopped_event += 1 - self.assertEqual( - seen_stopped_event, - 1, - f"expect only one stopped entry event in {stopped_events}", - ) + self.assertEqual(seen_stopped_event, 1, "expect only one stopped entry event.") @skipIfAsan @skipIfWindows @@ -96,13 +92,11 @@ def test_stopOnEntry(self): self.build_and_launch(program, console="integratedTerminal", stopOnEntry=True) [bp_main] = self.set_function_breakpoints(["main"]) - self.dap_server.request_configurationDone() - stopped_threads = list(self.dap_server.thread_stop_reasons.values()) + self.dap_server.request_continue() # sends configuration done + stopped_events = self.dap_server.wait_for_stopped() # We should be stopped at the entry point. - self.assertEqual( - len(stopped_threads), 1, "Expected the main thread to be stopped on entry." - ) - self.assertEqual(stopped_threads[0]["reason"], "entry") + self.assertGreaterEqual(len(stopped_events), 0, "expect stopped events") + self.verify_stopped_on_entry(stopped_events) # Then, if we continue, we should hit the breakpoint at main. self.dap_server.request_continue() @@ -111,12 +105,8 @@ def test_stopOnEntry(self): # Restart and check that we still get a stopped event before reaching # main. self.dap_server.request_restart() - stopped_threads = list(self.dap_server.thread_stop_reasons.values()) - # We should be stopped at the entry point. 
- self.assertEqual( - len(stopped_threads), 1, "Expected the main thread to be stopped on entry." - ) - self.assertEqual(stopped_threads[0]["reason"], "entry") + stopped_events = self.dap_server.wait_for_stopped() + self.verify_stopped_on_entry(stopped_events) # continue to main self.dap_server.request_continue() diff --git a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py index 0184020589176..a01845669666f 100644 --- a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py +++ b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py @@ -32,7 +32,7 @@ def test_send_event(self): ], ) self.set_source_breakpoints(source, [breakpoint_line]) - self.do_continue() + self.continue_to_next_stop() custom_event = self.dap_server.wait_for_event( filter=["my-custom-event-no-body"] From 1c15e3db4a10dc5cf135b830621bace3766b9195 Mon Sep 17 00:00:00 2001 From: Sarah Spall Date: Fri, 31 Oct 2025 11:15:19 -0700 Subject: [PATCH 378/539] [HLSL] Simplify test (#165743) Simplify test that fcgl flag is expanded to the right flags. --- clang/test/Driver/dxc_fcgl.hlsl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/clang/test/Driver/dxc_fcgl.hlsl b/clang/test/Driver/dxc_fcgl.hlsl index fe65124c197bc..4db7ada9622c5 100644 --- a/clang/test/Driver/dxc_fcgl.hlsl +++ b/clang/test/Driver/dxc_fcgl.hlsl @@ -1,9 +1,5 @@ -// RUN: not %clang_dxc -fcgl -T lib_6_7 foo.hlsl -### %s 2>&1 | FileCheck %s -// RUN: %clang_dxc -fcgl -T lib_6_7 %s -Xclang -verify +// RUN: %clang_dxc -fcgl -T lib_6_7 %s -### %s 2>&1 | FileCheck %s // Make sure fcgl option flag which translated into "-emit-llvm" "-disable-llvm-passes". // CHECK: "-emit-llvm" // CHECK-SAME: "-disable-llvm-passes" - -// Make sure fcgl option not generate any diagnostics. 
-// expected-no-diagnostics From 058c37e37391585c45690d70a093890d371517d5 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Fri, 31 Oct 2025 19:23:49 +0100 Subject: [PATCH 379/539] [Polly][CodePreparation] Extract common code of LPM and NPM (#140419) Use a common function for the non-boilerplate code shared between LPM and NPM as done by most other passes already. ScalarEvolution is not actually used. Patch extracted out of #125442 requested by https://github.com/llvm/llvm-project/pull/125442#discussion_r2034416019 --- polly/lib/Transform/CodePreparation.cpp | 45 +++++++++++++++---------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/polly/lib/Transform/CodePreparation.cpp b/polly/lib/Transform/CodePreparation.cpp index 7c8579eb93218..d045fb6b62c90 100644 --- a/polly/lib/Transform/CodePreparation.cpp +++ b/polly/lib/Transform/CodePreparation.cpp @@ -27,6 +27,26 @@ using namespace llvm; using namespace polly; +static bool runCodePreprationImpl(Function &F, DominatorTree *DT, LoopInfo *LI, + RegionInfo *RI) { + // Find first non-alloca instruction. Every basic block has a non-alloca + // instruction, as every well formed basic block has a terminator. + auto &EntryBlock = F.getEntryBlock(); + BasicBlock::iterator I = EntryBlock.begin(); + while (isa(I)) + ++I; + + // Abort if not necessary to split + if (I->isTerminator() && isa(I) && + cast(I)->isUnconditional()) + return false; + + // splitBlock updates DT, LI and RI. + splitEntryBlockForAlloca(&EntryBlock, DT, LI, RI); + + return true; +} + namespace { /// Prepare the IR for the scop detection. 
@@ -35,9 +55,6 @@ class CodePreparation final : public FunctionPass { CodePreparation(const CodePreparation &) = delete; const CodePreparation &operator=(const CodePreparation &) = delete; - LoopInfo *LI; - ScalarEvolution *SE; - void clear(); public: @@ -58,19 +75,11 @@ class CodePreparation final : public FunctionPass { PreservedAnalyses CodePreparationPass::run(Function &F, FunctionAnalysisManager &FAM) { - - // Find first non-alloca instruction. Every basic block has a non-alloca - // instruction, as every well formed basic block has a terminator. - auto &EntryBlock = F.getEntryBlock(); - BasicBlock::iterator I = EntryBlock.begin(); - while (isa(I)) - ++I; - auto &DT = FAM.getResult(F); auto &LI = FAM.getResult(F); - - // splitBlock updates DT, LI and RI. - splitEntryBlockForAlloca(&EntryBlock, &DT, &LI, nullptr); + bool Changed = runCodePreprationImpl(F, &DT, &LI, nullptr); + if (!Changed) + return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserve(); @@ -84,7 +93,6 @@ CodePreparation::~CodePreparation() { clear(); } void CodePreparation::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); - AU.addRequired(); AU.addPreserved(); AU.addPreserved(); @@ -96,10 +104,11 @@ bool CodePreparation::runOnFunction(Function &F) { if (skipFunction(F)) return false; - LI = &getAnalysis().getLoopInfo(); - SE = &getAnalysis().getSE(); + DominatorTree *DT = &getAnalysis().getDomTree(); + LoopInfo *LI = &getAnalysis().getLoopInfo(); + RegionInfo *RI = &getAnalysis().getRegionInfo(); - splitEntryBlockForAlloca(&F.getEntryBlock(), this); + runCodePreprationImpl(F, DT, LI, RI); return true; } From 9dba4bfa67a0fc685f875ab14cecbb6e63a3e340 Mon Sep 17 00:00:00 2001 From: Yifei Xu Date: Fri, 31 Oct 2025 13:23:58 -0500 Subject: [PATCH 380/539] Port ec657d8 to Bazel --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel 
b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 101dfb7cf68ae..1385e1a802d5b 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3729,6 +3729,7 @@ cc_library( ":XeGPUAttrInterfaceIncGen", ":XeGPUEnumsIncGen", ":XeGPUIncGen", + ":XeGPUUtils", ":XeGPUuArch", ":XeVMDialect", "//llvm:Support", @@ -3781,6 +3782,7 @@ cc_library( ":XeGPUDialect", ":XeGPUPassIncGen", ":XeGPUUtils", + ":XeGPUuArch", "//llvm:Support", ], ) From 24c80a94cbcba13c9e76a0bce01be6ac5872e772 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 31 Oct 2025 11:27:52 -0700 Subject: [PATCH 381/539] [SCEV] Fix switch formatting in collectFromBlock (NFC). Fix formatting for switch, to avoid unrelated changes/formatting errors in https://github.com/llvm/llvm-project/pull/163021. --- llvm/lib/Analysis/ScalarEvolution.cpp | 50 +++++++++++++-------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 7597f3ad685a0..c9baeda24b47b 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -15670,31 +15670,31 @@ void ScalarEvolution::LoopGuards::collectFromBlock( // predicate. 
const SCEV *One = SE.getOne(RHS->getType()); switch (Predicate) { - case CmpInst::ICMP_ULT: - if (RHS->getType()->isPointerTy()) - return; - RHS = SE.getUMaxExpr(RHS, One); - [[fallthrough]]; - case CmpInst::ICMP_SLT: { - RHS = SE.getMinusSCEV(RHS, One); - RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); - break; - } - case CmpInst::ICMP_UGT: - case CmpInst::ICMP_SGT: - RHS = SE.getAddExpr(RHS, One); - RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); - break; - case CmpInst::ICMP_ULE: - case CmpInst::ICMP_SLE: - RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); - break; - case CmpInst::ICMP_UGE: - case CmpInst::ICMP_SGE: - RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); - break; - default: - break; + case CmpInst::ICMP_ULT: + if (RHS->getType()->isPointerTy()) + return; + RHS = SE.getUMaxExpr(RHS, One); + [[fallthrough]]; + case CmpInst::ICMP_SLT: { + RHS = SE.getMinusSCEV(RHS, One); + RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); + break; + } + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_SGT: + RHS = SE.getAddExpr(RHS, One); + RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); + break; + case CmpInst::ICMP_ULE: + case CmpInst::ICMP_SLE: + RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); + break; + case CmpInst::ICMP_UGE: + case CmpInst::ICMP_SGE: + RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); + break; + default: + break; } SmallVector Worklist(1, LHS); From 20f164bc09264d01265a03f62e114bd156413c0f Mon Sep 17 00:00:00 2001 From: choikwa <5455710+choikwa@users.noreply.github.com> Date: Fri, 31 Oct 2025 14:43:35 -0400 Subject: [PATCH 382/539] [AMDGPU] NFC, add testcase showing promote-alloca of array of vectors to a large vector (#165824) later patch will target series of extractelement/insertelement pairs. 
--- .../AMDGPU/promote-alloca-array-to-vector.ll | 325 ++++++++++++++++++ 1 file changed, 325 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll new file mode 100644 index 0000000000000..05a0e39d4a715 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll @@ -0,0 +1,325 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s + +define amdgpu_kernel void @large_array_vectors_small_users(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 { +; OPT-LABEL: define amdgpu_kernel void @large_array_vectors_small_users( +; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[ALLOCA:%.*]] = freeze <128 x i8> poison +; OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[IN]], i64 0 +; OPT-NEXT: [[TMP1:%.*]] = insertelement <128 x i8> [[ALLOCA]], i8 [[TMP0]], i32 0 +; OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP5:%.*]] = insertelement <128 x i8> [[TMP3]], i8 [[TMP4]], i32 2 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP7:%.*]] = insertelement <128 x i8> [[TMP5]], i8 [[TMP6]], i32 3 +; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP9:%.*]] = insertelement <128 x i8> [[TMP7]], i8 [[TMP8]], i32 4 +; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP11:%.*]] = insertelement <128 x i8> [[TMP9]], i8 [[TMP10]], i32 5 +; OPT-NEXT: [[TMP12:%.*]] = 
extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP13:%.*]] = insertelement <128 x i8> [[TMP11]], i8 [[TMP12]], i32 6 +; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP15:%.*]] = insertelement <128 x i8> [[TMP13]], i8 [[TMP14]], i32 7 +; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP17:%.*]] = insertelement <128 x i8> [[TMP15]], i8 [[TMP16]], i32 8 +; OPT-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP19:%.*]] = insertelement <128 x i8> [[TMP17]], i8 [[TMP18]], i32 9 +; OPT-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP21:%.*]] = insertelement <128 x i8> [[TMP19]], i8 [[TMP20]], i32 10 +; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP23:%.*]] = insertelement <128 x i8> [[TMP21]], i8 [[TMP22]], i32 11 +; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP25:%.*]] = insertelement <128 x i8> [[TMP23]], i8 [[TMP24]], i32 12 +; OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP27:%.*]] = insertelement <128 x i8> [[TMP25]], i8 [[TMP26]], i32 13 +; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14 +; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15 +; OPT-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[IN]], i64 0 +; OPT-NEXT: [[TMP33:%.*]] = insertelement <128 x i8> [[TMP31]], i8 [[TMP32]], i32 0 +; OPT-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], i8 [[TMP34]], i32 1 +; OPT-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 2 +; OPT-NEXT: [[TMP38:%.*]] 
= extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 3 +; OPT-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 4 +; OPT-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 5 +; OPT-NEXT: [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 6 +; OPT-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 7 +; OPT-NEXT: [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 8 +; OPT-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 9 +; OPT-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 10 +; OPT-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 11 +; OPT-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 12 +; OPT-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 13 +; OPT-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 14 +; OPT-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 15 +; OPT-NEXT: 
[[TMP64:%.*]] = extractelement <16 x i8> [[IN]], i64 0 +; OPT-NEXT: [[TMP65:%.*]] = insertelement <128 x i8> [[TMP63]], i8 [[TMP64]], i32 0 +; OPT-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 1 +; OPT-NEXT: [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 2 +; OPT-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 3 +; OPT-NEXT: [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 4 +; OPT-NEXT: [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 5 +; OPT-NEXT: [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 6 +; OPT-NEXT: [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 7 +; OPT-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 8 +; OPT-NEXT: [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 9 +; OPT-NEXT: [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 10 +; OPT-NEXT: [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 11 +; OPT-NEXT: [[TMP88:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 12 +; OPT-NEXT: 
[[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 13 +; OPT-NEXT: [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 14 +; OPT-NEXT: [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 15 +; OPT-NEXT: [[TMP96:%.*]] = extractelement <16 x i8> [[IN]], i64 0 +; OPT-NEXT: [[TMP97:%.*]] = insertelement <128 x i8> [[TMP95]], i8 [[TMP96]], i32 0 +; OPT-NEXT: [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 1 +; OPT-NEXT: [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 2 +; OPT-NEXT: [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 3 +; OPT-NEXT: [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 4 +; OPT-NEXT: [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 5 +; OPT-NEXT: [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 6 +; OPT-NEXT: [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 7 +; OPT-NEXT: [[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 8 +; OPT-NEXT: [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 
[[TMP114]], i32 9 +; OPT-NEXT: [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 10 +; OPT-NEXT: [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 11 +; OPT-NEXT: [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 12 +; OPT-NEXT: [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 13 +; OPT-NEXT: [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 14 +; OPT-NEXT: [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 15 +; OPT-NEXT: [[TMP128:%.*]] = extractelement <16 x i8> [[IN]], i64 0 +; OPT-NEXT: [[TMP129:%.*]] = insertelement <128 x i8> [[TMP127]], i8 [[TMP128]], i32 0 +; OPT-NEXT: [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 1 +; OPT-NEXT: [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 2 +; OPT-NEXT: [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 3 +; OPT-NEXT: [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 4 +; OPT-NEXT: [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 5 +; OPT-NEXT: [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: 
[[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 6 +; OPT-NEXT: [[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 7 +; OPT-NEXT: [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 8 +; OPT-NEXT: [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 9 +; OPT-NEXT: [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 10 +; OPT-NEXT: [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 11 +; OPT-NEXT: [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 12 +; OPT-NEXT: [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 13 +; OPT-NEXT: [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP157:%.*]] = insertelement <128 x i8> [[TMP155]], i8 [[TMP156]], i32 14 +; OPT-NEXT: [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 15 +; OPT-NEXT: [[TMP160:%.*]] = extractelement <16 x i8> [[IN]], i64 0 +; OPT-NEXT: [[TMP161:%.*]] = insertelement <128 x i8> [[TMP159]], i8 [[TMP160]], i32 0 +; OPT-NEXT: [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 1 +; OPT-NEXT: [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 2 +; OPT-NEXT: [[TMP166:%.*]] 
= extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 3 +; OPT-NEXT: [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 x i8> [[TMP167]], i8 [[TMP168]], i32 4 +; OPT-NEXT: [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 5 +; OPT-NEXT: [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 6 +; OPT-NEXT: [[TMP174:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 7 +; OPT-NEXT: [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 8 +; OPT-NEXT: [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 9 +; OPT-NEXT: [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 10 +; OPT-NEXT: [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 11 +; OPT-NEXT: [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 12 +; OPT-NEXT: [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 13 +; OPT-NEXT: [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 14 +; OPT-NEXT: [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> 
[[TMP189]], i8 [[TMP190]], i32 15 +; OPT-NEXT: [[TMP192:%.*]] = extractelement <16 x i8> [[IN]], i64 0 +; OPT-NEXT: [[TMP193:%.*]] = insertelement <128 x i8> [[TMP191]], i8 [[TMP192]], i32 0 +; OPT-NEXT: [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 1 +; OPT-NEXT: [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 2 +; OPT-NEXT: [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 3 +; OPT-NEXT: [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 4 +; OPT-NEXT: [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 5 +; OPT-NEXT: [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 6 +; OPT-NEXT: [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 7 +; OPT-NEXT: [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 8 +; OPT-NEXT: [[TMP210:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 9 +; OPT-NEXT: [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 10 +; OPT-NEXT: [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 11 +; OPT-NEXT: [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; 
OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 12 +; OPT-NEXT: [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 13 +; OPT-NEXT: [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 14 +; OPT-NEXT: [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 15 +; OPT-NEXT: [[TMP224:%.*]] = extractelement <16 x i8> [[IN]], i64 0 +; OPT-NEXT: [[TMP225:%.*]] = insertelement <128 x i8> [[TMP223]], i8 [[TMP224]], i32 0 +; OPT-NEXT: [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 1 +; OPT-NEXT: [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 2 +; OPT-NEXT: [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 3 +; OPT-NEXT: [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 4 +; OPT-NEXT: [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 5 +; OPT-NEXT: [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 6 +; OPT-NEXT: [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 7 +; OPT-NEXT: [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 8 +; OPT-NEXT: 
[[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 9 +; OPT-NEXT: [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 10 +; OPT-NEXT: [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 11 +; OPT-NEXT: [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 12 +; OPT-NEXT: [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 13 +; OPT-NEXT: [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 14 +; OPT-NEXT: [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 15 +; OPT-NEXT: [[TMP256:%.*]] = extractelement <128 x i8> [[TMP255]], i32 80 +; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> poison, i8 [[TMP256]], i64 0 +; OPT-NEXT: [[TMP258:%.*]] = extractelement <128 x i8> [[TMP255]], i32 81 +; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP258]], i64 1 +; OPT-NEXT: [[TMP260:%.*]] = extractelement <128 x i8> [[TMP255]], i32 82 +; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP260]], i64 2 +; OPT-NEXT: [[TMP262:%.*]] = extractelement <128 x i8> [[TMP255]], i32 83 +; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP262]], i64 3 +; OPT-NEXT: [[TMP264:%.*]] = extractelement <128 x i8> [[TMP255]], i32 84 +; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP264]], i64 4 +; OPT-NEXT: [[TMP266:%.*]] = extractelement <128 x i8> [[TMP255]], i32 85 +; OPT-NEXT: 
[[TMP267:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP266]], i64 5 +; OPT-NEXT: [[TMP268:%.*]] = extractelement <128 x i8> [[TMP255]], i32 86 +; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP268]], i64 6 +; OPT-NEXT: [[TMP270:%.*]] = extractelement <128 x i8> [[TMP255]], i32 87 +; OPT-NEXT: [[TMP271:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP270]], i64 7 +; OPT-NEXT: [[TMP272:%.*]] = extractelement <128 x i8> [[TMP255]], i32 88 +; OPT-NEXT: [[TMP273:%.*]] = insertelement <16 x i8> [[TMP271]], i8 [[TMP272]], i64 8 +; OPT-NEXT: [[TMP274:%.*]] = extractelement <128 x i8> [[TMP255]], i32 89 +; OPT-NEXT: [[TMP275:%.*]] = insertelement <16 x i8> [[TMP273]], i8 [[TMP274]], i64 9 +; OPT-NEXT: [[TMP276:%.*]] = extractelement <128 x i8> [[TMP255]], i32 90 +; OPT-NEXT: [[TMP277:%.*]] = insertelement <16 x i8> [[TMP275]], i8 [[TMP276]], i64 10 +; OPT-NEXT: [[TMP278:%.*]] = extractelement <128 x i8> [[TMP255]], i32 91 +; OPT-NEXT: [[TMP279:%.*]] = insertelement <16 x i8> [[TMP277]], i8 [[TMP278]], i64 11 +; OPT-NEXT: [[TMP280:%.*]] = extractelement <128 x i8> [[TMP255]], i32 92 +; OPT-NEXT: [[TMP281:%.*]] = insertelement <16 x i8> [[TMP279]], i8 [[TMP280]], i64 12 +; OPT-NEXT: [[TMP282:%.*]] = extractelement <128 x i8> [[TMP255]], i32 93 +; OPT-NEXT: [[TMP283:%.*]] = insertelement <16 x i8> [[TMP281]], i8 [[TMP282]], i64 13 +; OPT-NEXT: [[TMP284:%.*]] = extractelement <128 x i8> [[TMP255]], i32 94 +; OPT-NEXT: [[TMP285:%.*]] = insertelement <16 x i8> [[TMP283]], i8 [[TMP284]], i64 14 +; OPT-NEXT: [[TMP286:%.*]] = extractelement <128 x i8> [[TMP255]], i32 95 +; OPT-NEXT: [[TMP287:%.*]] = insertelement <16 x i8> [[TMP285]], i8 [[TMP286]], i64 15 +; OPT-NEXT: [[SUM:%.*]] = add <16 x i8> [[TMP287]], [[ADD]] +; OPT-NEXT: store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16 +; OPT-NEXT: ret void +; +entry: + %alloca = alloca [8 x <16 x i8>], align 16, addrspace(5) + %gep0 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, 
i64 0 + store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + %gep1 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 1 + store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + %gep2 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 2 + store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + %gep3 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 3 + store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + %gep4 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 4 + store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + %gep5 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 5 + store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + %gep6 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 6 + store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + %gep7 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 7 + store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + %load = load <16 x i8>, ptr addrspace(5) %gep5, align 16 + %sum = add <16 x i8> %load, %add + store <16 x i8> %sum, ptr addrspace(3) %out, align 16 + ret void +} + +attributes #0 = {"amdgpu-waves-per-eu"="2,2"} From 04756889e2615d10b475f4cb799bc8f1388b5c4a Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Fri, 31 Oct 2025 11:57:08 -0700 Subject: [PATCH 383/539] [libc] Templatize strtointeger implementation. (#165884) * Removes the copy-pasta implementation of wcstointeger, and migrate the wcsto* family of functions to use a template version of strtointeger. * Fixes the out-of-bound read in the original implementation(s) when the entire input string consists of whitespaces (then the sign check can access OOB memory) The code is currently slightly peppered with "if constexpr" statements to distinguish between char and wchar_t. 
We can probably simplify it in subsequent changes by: * using overrides, so that internal::isalnum() is overriden for both char and wchar_t (since C++ luckily allows us to reuse names). * this wouldn't help for direct comparison with literals - for this as a somewhat ugly workaround like is_char_literal(c, '0', L'0') --- libc/src/__support/CMakeLists.txt | 12 -- libc/src/__support/str_to_integer.h | 98 +++++++---- libc/src/__support/wcs_to_integer.h | 155 ------------------ libc/src/wchar/CMakeLists.txt | 8 +- libc/src/wchar/wcstol.cpp | 4 +- libc/src/wchar/wcstoll.cpp | 4 +- libc/src/wchar/wcstoul.cpp | 4 +- libc/src/wchar/wcstoull.cpp | 4 +- libc/test/src/__support/CMakeLists.txt | 2 +- .../src/__support/str_to_integer_test.cpp | 6 +- .../src/__support/wcs_to_integer_test.cpp | 102 ++++++------ .../llvm-project-overlay/libc/BUILD.bazel | 1 + 12 files changed, 139 insertions(+), 261 deletions(-) delete mode 100644 libc/src/__support/wcs_to_integer.h diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt index 0ef09a9b8c9d0..b7af751ec3f27 100644 --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -179,19 +179,7 @@ add_header_library( DEPENDS .ctype_utils .str_to_num_result - libc.hdr.errno_macros - libc.src.__support.CPP.limits - libc.src.__support.CPP.type_traits - libc.src.__support.common -) - -add_header_library( - wcs_to_integer - HDRS - wcs_to_integer.h - DEPENDS .wctype_utils - .str_to_num_result libc.hdr.errno_macros libc.src.__support.CPP.limits libc.src.__support.CPP.type_traits diff --git a/libc/src/__support/str_to_integer.h b/libc/src/__support/str_to_integer.h index d332c929f2c31..ba3f49fa2f47b 100644 --- a/libc/src/__support/str_to_integer.h +++ b/libc/src/__support/str_to_integer.h @@ -25,36 +25,63 @@ #include "src/__support/macros/config.h" #include "src/__support/str_to_num_result.h" #include "src/__support/uint128.h" +#include "src/__support/wctype_utils.h" namespace LIBC_NAMESPACE_DECL { 
namespace internal { // Returns the idx to the first character in src that is not a whitespace -// character (as determined by isspace()) +// character (as determined by isspace() / iswspace()) +template LIBC_INLINE size_t -first_non_whitespace(const char *__restrict src, +first_non_whitespace(const CharType *__restrict src, size_t src_len = cpp::numeric_limits::max()) { size_t src_cur = 0; - while (src_cur < src_len && internal::isspace(src[src_cur])) { + while (src_cur < src_len) { + if constexpr (cpp::is_same_v) { + if (!internal::isspace(src[src_cur])) + break; + } else { + if (!internal::iswspace(src[src_cur])) + break; + } ++src_cur; } return src_cur; } +// Returns +1, -1, or 0 if 'src' starts with (respectively) +// plus sign, minus sign, or neither. +template +LIBC_INLINE static int get_sign(const CharType *__restrict src) { + if constexpr (cpp::is_same_v) { + return (src[0] == '+') ? 1 : (src[0] == '-' ? -1 : 0); + } else { + return (src[0] == L'+') ? 1 : (src[0] == L'-' ? -1 : 0); + } +} + // checks if the next 3 characters of the string pointer are the start of a // hexadecimal number. Does not advance the string pointer. -LIBC_INLINE bool -is_hex_start(const char *__restrict src, - size_t src_len = cpp::numeric_limits::max()) { +template +LIBC_INLINE static bool is_hex_start(const CharType *__restrict src, + size_t src_len) { if (src_len < 3) return false; - return *src == '0' && tolower(*(src + 1)) == 'x' && isalnum(*(src + 2)) && - b36_char_to_int(*(src + 2)) < 16; + if constexpr (cpp::is_same_v) { + return src[0] == '0' && tolower(src[1]) == 'x' && isalnum(src[2]) && + b36_char_to_int(src[2]) < 16; + } else { + return src[0] == L'0' && towlower(src[1]) == L'x' && iswalnum(src[2]) && + b36_wchar_to_int(src[2]) < 16; + } } // Takes the address of the string pointer and parses the base from the start of // it. 
-LIBC_INLINE int infer_base(const char *__restrict src, size_t src_len) { +template +LIBC_INLINE static int infer_base(const CharType *__restrict src, + size_t src_len) { // A hexadecimal number is defined as "the prefix 0x or 0X followed by a // sequence of the decimal digits and the letters a (or A) through f (or F) // with values 10 through 15 respectively." (C standard 6.4.4.1) @@ -63,8 +90,15 @@ LIBC_INLINE int infer_base(const char *__restrict src, size_t src_len) { // An octal number is defined as "the prefix 0 optionally followed by a // sequence of the digits 0 through 7 only" (C standard 6.4.4.1) and so any // number that starts with 0, including just 0, is an octal number. - if (src_len > 0 && src[0] == '0') - return 8; + if (src_len > 0) { + if constexpr (cpp::is_same_v) { + if (src[0] == '0') + return 8; + } else { + if (src[0] == L'0') + return 8; + } + } // A decimal number is defined as beginning "with a nonzero digit and // consist[ing] of a sequence of decimal digits." (C standard 6.4.4.1) return 10; @@ -77,32 +111,27 @@ LIBC_INLINE int infer_base(const char *__restrict src, size_t src_len) { // ----------------------------------------------------------------------------- // Takes a pointer to a string and the base to convert to. This function is used // as the backend for all of the string to int functions. 
-template +template LIBC_INLINE StrToNumResult -strtointeger(const char *__restrict src, int base, +strtointeger(const CharType *__restrict src, int base, const size_t src_len = cpp::numeric_limits::max()) { using ResultType = make_integral_or_big_int_unsigned_t; - ResultType result = 0; - - bool is_number = false; - size_t src_cur = 0; - int error_val = 0; - if (src_len == 0) return {0, 0, 0}; if (base < 0 || base == 1 || base > 36) return {0, 0, EINVAL}; - src_cur = first_non_whitespace(src, src_len); - - char result_sign = '+'; - if (src[src_cur] == '+' || src[src_cur] == '-') { - result_sign = src[src_cur]; - ++src_cur; + size_t src_cur = first_non_whitespace(src, src_len); + if (src_cur == src_len) { + return {0, 0, 0}; } + int sign = get_sign(src + src_cur); + bool is_positive = (sign >= 0); + src_cur += (sign != 0); + if (base == 0) base = infer_base(src + src_cur, src_len - src_cur); @@ -110,8 +139,6 @@ strtointeger(const char *__restrict src, int base, src_cur = src_cur + 2; constexpr bool IS_UNSIGNED = cpp::is_unsigned_v; - const bool is_positive = (result_sign == '+'); - ResultType constexpr NEGATIVE_MAX = !IS_UNSIGNED ? 
static_cast(cpp::numeric_limits::max()) + 1 : cpp::numeric_limits::max(); @@ -120,8 +147,21 @@ strtointeger(const char *__restrict src, int base, ResultType const abs_max_div_by_base = abs_max / static_cast(base); - while (src_cur < src_len && isalnum(src[src_cur])) { - int cur_digit = b36_char_to_int(src[src_cur]); + bool is_number = false; + int error_val = 0; + ResultType result = 0; + while (src_cur < src_len) { + int cur_digit; + if constexpr (cpp::is_same_v) { + if (!isalnum(src[src_cur])) + break; + cur_digit = b36_char_to_int(src[src_cur]); + } else { + if (!iswalnum(src[src_cur])) + break; + cur_digit = b36_wchar_to_int(src[src_cur]); + } + if (cur_digit >= base) break; diff --git a/libc/src/__support/wcs_to_integer.h b/libc/src/__support/wcs_to_integer.h deleted file mode 100644 index 4254bd860f77a..0000000000000 --- a/libc/src/__support/wcs_to_integer.h +++ /dev/null @@ -1,155 +0,0 @@ -//===-- Widechar string to integer conversion utils -------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H -#define LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H - -#include "hdr/errno_macros.h" // For ERANGE -#include "src/__support/CPP/limits.h" -#include "src/__support/CPP/type_traits.h" -#include "src/__support/CPP/type_traits/make_unsigned.h" -#include "src/__support/big_int.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/__support/str_to_num_result.h" -#include "src/__support/uint128.h" -#include "src/__support/wctype_utils.h" - -namespace LIBC_NAMESPACE_DECL { -namespace internal { - -// Returns the idx of the first character in src that is not a whitespace -// character (as determined by iswspace()) -LIBC_INLINE size_t -first_non_whitespace(const wchar_t *__restrict src, - size_t src_len = cpp::numeric_limits::max()) { - size_t src_cur = 0; - while (src_cur < src_len && internal::iswspace(src[src_cur])) { - ++src_cur; - } - return src_cur; -} - -// checks if the next 3 characters of the string pointer are the start of a -// hexadecimal number. Does not advance the string pointer. -LIBC_INLINE bool -is_hex_start(const wchar_t *__restrict src, - size_t src_len = cpp::numeric_limits::max()) { - if (src_len < 3) - return false; - return *src == L'0' && towlower(*(src + 1)) == L'x' && iswalnum(*(src + 2)) && - b36_wchar_to_int(*(src + 2)) < 16; -} - -// Takes the address of the string pointer and parses the base from the start of -// it. -LIBC_INLINE int infer_base(const wchar_t *__restrict src, size_t src_len) { - // A hexadecimal number is defined as "the prefix 0x or 0X followed by a - // sequence of the decimal digits and the letters a (or A) through f (or F) - // with values 10 through 15 respectively." 
(C standard 6.4.4.1) - if (is_hex_start(src, src_len)) - return 16; - // An octal number is defined as "the prefix 0 optionally followed by a - // sequence of the digits 0 through 7 only" (C standard 6.4.4.1) and so any - // number that starts with 0, including just 0, is an octal number. - if (src_len > 0 && src[0] == L'0') - return 8; - // A decimal number is defined as beginning "with a nonzero digit and - // consist[ing] of a sequence of decimal digits." (C standard 6.4.4.1) - return 10; -} - -template -LIBC_INLINE StrToNumResult -wcstointeger(const wchar_t *__restrict src, int base, - const size_t src_len = cpp::numeric_limits::max()) { - using ResultType = make_integral_or_big_int_unsigned_t; - - ResultType result = 0; - - bool is_number = false; - size_t src_cur = 0; - int error_val = 0; - - if (src_len == 0) - return {0, 0, 0}; - - if (base < 0 || base == 1 || base > 36) - return {0, 0, EINVAL}; - - src_cur = first_non_whitespace(src, src_len); - - wchar_t result_sign = L'+'; - if (src[src_cur] == L'+' || src[src_cur] == L'-') { - result_sign = src[src_cur]; - ++src_cur; - } - - if (base == 0) - base = infer_base(src + src_cur, src_len - src_cur); - - if (base == 16 && is_hex_start(src + src_cur, src_len - src_cur)) - src_cur = src_cur + 2; - - constexpr bool IS_UNSIGNED = cpp::is_unsigned_v; - const bool is_positive = (result_sign == L'+'); - - ResultType constexpr NEGATIVE_MAX = - !IS_UNSIGNED ? static_cast(cpp::numeric_limits::max()) + 1 - : cpp::numeric_limits::max(); - ResultType const abs_max = - (is_positive ? 
cpp::numeric_limits::max() : NEGATIVE_MAX); - ResultType const abs_max_div_by_base = - abs_max / static_cast(base); - - while (src_cur < src_len && iswalnum(src[src_cur])) { - int cur_digit = b36_wchar_to_int(src[src_cur]); - if (cur_digit >= base) - break; - - is_number = true; - ++src_cur; - - // If the number has already hit the maximum value for the current type then - // the result cannot change, but we still need to advance src to the end of - // the number. - if (result == abs_max) { - error_val = ERANGE; - continue; - } - - if (result > abs_max_div_by_base) { - result = abs_max; - error_val = ERANGE; - } else { - result = result * static_cast(base); - } - if (result > abs_max - static_cast(cur_digit)) { - result = abs_max; - error_val = ERANGE; - } else { - result = result + static_cast(cur_digit); - } - } - - ptrdiff_t str_len = is_number ? static_cast(src_cur) : 0; - - if (error_val == ERANGE) { - if (is_positive || IS_UNSIGNED) - return {cpp::numeric_limits::max(), str_len, error_val}; - else // T is signed and there is a negative overflow - return {cpp::numeric_limits::min(), str_len, error_val}; - } - - return {static_cast(is_positive ? 
result : -result), str_len, error_val}; -} - -} // namespace internal -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt index adde382bf0950..ba27cd77f6bac 100644 --- a/libc/src/wchar/CMakeLists.txt +++ b/libc/src/wchar/CMakeLists.txt @@ -63,7 +63,7 @@ add_entrypoint_object( wcstol.h DEPENDS libc.src.errno.errno - libc.src.__support.wcs_to_integer + libc.src.__support.str_to_integer ) add_entrypoint_object( @@ -74,7 +74,7 @@ add_entrypoint_object( wcstoll.h DEPENDS libc.src.errno.errno - libc.src.__support.wcs_to_integer + libc.src.__support.str_to_integer ) add_entrypoint_object( @@ -85,7 +85,7 @@ add_entrypoint_object( wcstoul.h DEPENDS libc.src.errno.errno - libc.src.__support.wcs_to_integer + libc.src.__support.str_to_integer ) add_entrypoint_object( @@ -96,7 +96,7 @@ add_entrypoint_object( wcstoull.h DEPENDS libc.src.errno.errno - libc.src.__support.wcs_to_integer + libc.src.__support.str_to_integer ) add_entrypoint_object( diff --git a/libc/src/wchar/wcstol.cpp b/libc/src/wchar/wcstol.cpp index a05718f706dfd..a56b5f91272cd 100644 --- a/libc/src/wchar/wcstol.cpp +++ b/libc/src/wchar/wcstol.cpp @@ -10,14 +10,14 @@ #include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/__support/wcs_to_integer.h" +#include "src/__support/str_to_integer.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(long, wcstol, (const wchar_t *__restrict str, wchar_t **__restrict str_end, int base)) { - auto result = internal::wcstointeger(str, base); + auto result = internal::strtointeger(str, base); if (result.has_error()) libc_errno = result.error; diff --git a/libc/src/wchar/wcstoll.cpp b/libc/src/wchar/wcstoll.cpp index de1299d681cdb..6229d24172b51 100644 --- a/libc/src/wchar/wcstoll.cpp +++ b/libc/src/wchar/wcstoll.cpp @@ -10,14 +10,14 @@ #include "src/__support/common.h" #include 
"src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/__support/wcs_to_integer.h" +#include "src/__support/str_to_integer.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(long long, wcstoll, (const wchar_t *__restrict str, wchar_t **__restrict str_end, int base)) { - auto result = internal::wcstointeger(str, base); + auto result = internal::strtointeger(str, base); if (result.has_error()) libc_errno = result.error; diff --git a/libc/src/wchar/wcstoul.cpp b/libc/src/wchar/wcstoul.cpp index 79b8c9b5c9fa3..c5639bee1d649 100644 --- a/libc/src/wchar/wcstoul.cpp +++ b/libc/src/wchar/wcstoul.cpp @@ -10,14 +10,14 @@ #include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/__support/wcs_to_integer.h" +#include "src/__support/str_to_integer.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(unsigned long, wcstoul, (const wchar_t *__restrict str, wchar_t **__restrict str_end, int base)) { - auto result = internal::wcstointeger(str, base); + auto result = internal::strtointeger(str, base); if (result.has_error()) libc_errno = result.error; diff --git a/libc/src/wchar/wcstoull.cpp b/libc/src/wchar/wcstoull.cpp index 768e03c4bd189..2ab24e9b2b2a1 100644 --- a/libc/src/wchar/wcstoull.cpp +++ b/libc/src/wchar/wcstoull.cpp @@ -10,14 +10,14 @@ #include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/__support/wcs_to_integer.h" +#include "src/__support/str_to_integer.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(unsigned long long, wcstoull, (const wchar_t *__restrict str, wchar_t **__restrict str_end, int base)) { - auto result = internal::wcstointeger(str, base); + auto result = internal::strtointeger(str, base); if (result.has_error()) libc_errno = result.error; diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index a02514106a307..138866b4cc869 100644 
--- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -151,7 +151,7 @@ add_libc_test( wcs_to_integer_test.cpp DEPENDS libc.src.__support.integer_literals - libc.src.__support.wcs_to_integer + libc.src.__support.str_to_integer ) add_libc_test( diff --git a/libc/test/src/__support/str_to_integer_test.cpp b/libc/test/src/__support/str_to_integer_test.cpp index 1ec882b212b8a..e5ac1d6cbb7b3 100644 --- a/libc/test/src/__support/str_to_integer_test.cpp +++ b/libc/test/src/__support/str_to_integer_test.cpp @@ -49,12 +49,14 @@ TEST(LlvmLibcStrToIntegerTest, LeadingSpaces) { EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 12); - result = LIBC_NAMESPACE::internal::strtointeger(" 12345", 10, 5); + // Use a non-null-terminated buffer to test for possible OOB access. + char buf[5] = {' ', ' ', ' ', ' ', ' '}; + result = LIBC_NAMESPACE::internal::strtointeger(buf, 10, 5); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::strtointeger(" 12345", 10, 0); + result = LIBC_NAMESPACE::internal::strtointeger(buf, 10, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); diff --git a/libc/test/src/__support/wcs_to_integer_test.cpp b/libc/test/src/__support/wcs_to_integer_test.cpp index 4554968be67ce..38af778ca2440 100644 --- a/libc/test/src/__support/wcs_to_integer_test.cpp +++ b/libc/test/src/__support/wcs_to_integer_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/wcs_to_integer.h" +#include "src/__support/str_to_integer.h" #include #include "test/UnitTest/Test.h" @@ -14,224 +14,226 @@ // This file is for testing the src_len argument and other internal interface // features. Primary testing is done through the public interface. 
-TEST(LlvmLibcStrToIntegerTest, SimpleLength) { - auto result = LIBC_NAMESPACE::internal::wcstointeger(L"12345", 10, 10); +TEST(LlvmLibcWcsToIntegerTest, SimpleLength) { + auto result = LIBC_NAMESPACE::internal::strtointeger(L"12345", 10, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(5)); ASSERT_EQ(result.value, 12345); - result = LIBC_NAMESPACE::internal::wcstointeger(L"12345", 10, 2); + result = LIBC_NAMESPACE::internal::strtointeger(L"12345", 10, 2); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(2)); ASSERT_EQ(result.value, 12); - result = LIBC_NAMESPACE::internal::wcstointeger(L"12345", 10, 0); + result = LIBC_NAMESPACE::internal::strtointeger(L"12345", 10, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, LeadingSpaces) { +TEST(LlvmLibcWcsToIntegerTest, LeadingSpaces) { auto result = - LIBC_NAMESPACE::internal::wcstointeger(L" 12345", 10, 15); + LIBC_NAMESPACE::internal::strtointeger(L" 12345", 10, 15); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(10)); ASSERT_EQ(result.value, 12345); - result = LIBC_NAMESPACE::internal::wcstointeger(L" 12345", 10, 10); + result = LIBC_NAMESPACE::internal::strtointeger(L" 12345", 10, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(10)); ASSERT_EQ(result.value, 12345); - result = LIBC_NAMESPACE::internal::wcstointeger(L" 12345", 10, 7); + result = LIBC_NAMESPACE::internal::strtointeger(L" 12345", 10, 7); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 12); - result = LIBC_NAMESPACE::internal::wcstointeger(L" 12345", 10, 5); + // Use a non-null-terminated buffer to test for possible OOB access. 
+ wchar_t buf[5] = {L' ', L' ', L' ', L' ', L' '}; + result = LIBC_NAMESPACE::internal::strtointeger(buf, 10, 5); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger(L" 12345", 10, 0); + result = LIBC_NAMESPACE::internal::strtointeger(buf, 10, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, LeadingSign) { - auto result = LIBC_NAMESPACE::internal::wcstointeger(L"+12345", 10, 10); +TEST(LlvmLibcWcsToIntegerTest, LeadingSign) { + auto result = LIBC_NAMESPACE::internal::strtointeger(L"+12345", 10, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 12345); - result = LIBC_NAMESPACE::internal::wcstointeger(L"-12345", 10, 10); + result = LIBC_NAMESPACE::internal::strtointeger(L"-12345", 10, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, -12345); - result = LIBC_NAMESPACE::internal::wcstointeger(L"+12345", 10, 6); + result = LIBC_NAMESPACE::internal::strtointeger(L"+12345", 10, 6); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 12345); - result = LIBC_NAMESPACE::internal::wcstointeger(L"-12345", 10, 6); + result = LIBC_NAMESPACE::internal::strtointeger(L"-12345", 10, 6); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, -12345); - result = LIBC_NAMESPACE::internal::wcstointeger(L"+12345", 10, 3); + result = LIBC_NAMESPACE::internal::strtointeger(L"+12345", 10, 3); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(3)); ASSERT_EQ(result.value, 12); - result = LIBC_NAMESPACE::internal::wcstointeger(L"-12345", 10, 3); + result = LIBC_NAMESPACE::internal::strtointeger(L"-12345", 10, 3); EXPECT_FALSE(result.has_error()); 
EXPECT_EQ(result.parsed_len, ptrdiff_t(3)); ASSERT_EQ(result.value, -12); - result = LIBC_NAMESPACE::internal::wcstointeger(L"+12345", 10, 1); + result = LIBC_NAMESPACE::internal::strtointeger(L"+12345", 10, 1); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger(L"-12345", 10, 1); + result = LIBC_NAMESPACE::internal::strtointeger(L"-12345", 10, 1); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger(L"+12345", 10, 0); + result = LIBC_NAMESPACE::internal::strtointeger(L"+12345", 10, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger(L"-12345", 10, 0); + result = LIBC_NAMESPACE::internal::strtointeger(L"-12345", 10, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, Base16PrefixAutoSelect) { - auto result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 0, 10); +TEST(LlvmLibcWcsToIntegerTest, Base16PrefixAutoSelect) { + auto result = LIBC_NAMESPACE::internal::strtointeger(L"0x12345", 0, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 0x12345); - result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 0, 7); + result = LIBC_NAMESPACE::internal::strtointeger(L"0x12345", 0, 7); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 0x12345); - result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 0, 5); + result = LIBC_NAMESPACE::internal::strtointeger(L"0x12345", 0, 5); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(5)); ASSERT_EQ(result.value, 0x123); - result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 0, 2); + 
result = LIBC_NAMESPACE::internal::strtointeger(L"0x12345", 0, 2); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 0, 0); + result = LIBC_NAMESPACE::internal::strtointeger(L"0x12345", 0, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, Base16PrefixManualSelect) { - auto result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 16, 10); +TEST(LlvmLibcWcsToIntegerTest, Base16PrefixManualSelect) { + auto result = LIBC_NAMESPACE::internal::strtointeger(L"0x12345", 16, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 0x12345); - result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 16, 7); + result = LIBC_NAMESPACE::internal::strtointeger(L"0x12345", 16, 7); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 0x12345); - result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 16, 5); + result = LIBC_NAMESPACE::internal::strtointeger(L"0x12345", 16, 5); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(5)); ASSERT_EQ(result.value, 0x123); - result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 16, 2); + result = LIBC_NAMESPACE::internal::strtointeger(L"0x12345", 16, 2); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 16, 0); + result = LIBC_NAMESPACE::internal::strtointeger(L"0x12345", 16, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, Base8PrefixAutoSelect) { - auto result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 0, 10); +TEST(LlvmLibcWcsToIntegerTest, Base8PrefixAutoSelect) { + auto 
result = LIBC_NAMESPACE::internal::strtointeger(L"012345", 0, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 012345); - result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 0, 6); + result = LIBC_NAMESPACE::internal::strtointeger(L"012345", 0, 6); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 012345); - result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 0, 4); + result = LIBC_NAMESPACE::internal::strtointeger(L"012345", 0, 4); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(4)); ASSERT_EQ(result.value, 0123); - result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 0, 1); + result = LIBC_NAMESPACE::internal::strtointeger(L"012345", 0, 1); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 0, 0); + result = LIBC_NAMESPACE::internal::strtointeger(L"012345", 0, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, Base8PrefixManualSelect) { - auto result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 8, 10); +TEST(LlvmLibcWcsToIntegerTest, Base8PrefixManualSelect) { + auto result = LIBC_NAMESPACE::internal::strtointeger(L"012345", 8, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 012345); - result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 8, 6); + result = LIBC_NAMESPACE::internal::strtointeger(L"012345", 8, 6); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 012345); - result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 8, 4); + result = LIBC_NAMESPACE::internal::strtointeger(L"012345", 8, 4); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(4)); 
ASSERT_EQ(result.value, 0123); - result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 8, 1); + result = LIBC_NAMESPACE::internal::strtointeger(L"012345", 8, 1); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 8, 0); + result = LIBC_NAMESPACE::internal::strtointeger(L"012345", 8, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, CombinedTests) { +TEST(LlvmLibcWcsToIntegerTest, CombinedTests) { auto result = - LIBC_NAMESPACE::internal::wcstointeger(L" -0x123", 0, 10); + LIBC_NAMESPACE::internal::strtointeger(L" -0x123", 0, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(10)); ASSERT_EQ(result.value, -0x123); - result = LIBC_NAMESPACE::internal::wcstointeger(L" -0x123", 0, 8); + result = LIBC_NAMESPACE::internal::strtointeger(L" -0x123", 0, 8); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(8)); ASSERT_EQ(result.value, -0x1); - result = LIBC_NAMESPACE::internal::wcstointeger(L" -0x123", 0, 7); + result = LIBC_NAMESPACE::internal::strtointeger(L" -0x123", 0, 7); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 0); diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 599bc4b3d8bbf..5a1e0b53b021c 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1000,6 +1000,7 @@ libc_support_library( ":__support_ctype_utils", ":__support_str_to_num_result", ":__support_uint128", + ":__support_wctype_utils", ":hdr_errno_macros", ], ) From 1160848e28bf2d3a34853adb3592496a01ab6f58 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Fri, 31 Oct 2025 12:21:59 -0700 Subject: [PATCH 384/539] [AMDGPU] Record old VGPR MSBs in 
the high bits of s_set_vgpr_msb (#165035) Fixes: SWDEV-562450 --- .../Target/AMDGPU/AMDGPULowerVGPREncoding.cpp | 16 +- llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 2 +- .../MCTargetDesc/AMDGPUMCTargetDesc.cpp | 2 +- .../AMDGPU/vgpr-lowering-gfx1250-t16.mir | 4 +- .../CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir | 137 +++++++++--------- .../CodeGen/AMDGPU/whole-wave-functions.ll | 36 ++--- 6 files changed, 104 insertions(+), 93 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp index 9b932273b2216..d7d0292083e1c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp @@ -58,6 +58,8 @@ class AMDGPULowerVGPREncoding { static constexpr unsigned BitsPerField = 2; static constexpr unsigned NumFields = 4; static constexpr unsigned FieldMask = (1 << BitsPerField) - 1; + static constexpr unsigned ModeWidth = NumFields * BitsPerField; + static constexpr unsigned ModeMask = (1 << ModeWidth) - 1; using ModeType = PackedVector>; @@ -152,13 +154,21 @@ bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask, CurrentMode |= NewMode; CurrentMask |= Mask; - MostRecentModeSet->getOperand(0).setImm(CurrentMode); + MachineOperand &Op = MostRecentModeSet->getOperand(0); + + // Carry old mode bits from the existing instruction. + int64_t OldModeBits = Op.getImm() & (ModeMask << ModeWidth); + + Op.setImm(CurrentMode | OldModeBits); return true; } + // Record previous mode into high 8 bits of the immediate. 
+ int64_t OldModeBits = CurrentMode << ModeWidth; + I = handleClause(I); - MostRecentModeSet = - BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)).addImm(NewMode); + MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) + .addImm(NewMode | OldModeBits); CurrentMode = NewMode; CurrentMask = Mask; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 680e7eb3de6be..844649ebb9ae6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -412,7 +412,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { *OutStreamer); if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) { - unsigned V = MI->getOperand(0).getImm(); + unsigned V = MI->getOperand(0).getImm() & 0xff; OutStreamer->AddComment( " msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) + " src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3)); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 013cfeb364048..28b4da8ab9ebb 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -168,7 +168,7 @@ bool AMDGPUMCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr, void AMDGPUMCInstrAnalysis::updateState(const MCInst &Inst, uint64_t Addr) { if (Inst.getOpcode() == AMDGPU::S_SET_VGPR_MSB_gfx12) - VgprMSBs = Inst.getOperand(0).getImm(); + VgprMSBs = Inst.getOperand(0).getImm() & 0xff; else if (isTerminator(Inst)) VgprMSBs = 0; } diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir index 8a70a8acd28d3..32cc398740d62 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir @@ -36,7 +36,7 @@ body: | ; GCN-NEXT: v_add_f16_e64 v128.l 
/*v384.l*/, v129.l /*v385.l*/, v130.l /*v386.l*/ $vgpr384_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr385_lo16, 0, undef $vgpr386_lo16, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x8a + ; GCN-NEXT: s_set_vgpr_msb 0x458a ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=0 ; GCN-NEXT: v_add_f16_e64 v0.h /*v512.h*/, v1.h /*v513.h*/, v2.h /*v514.h*/ $vgpr512_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr513_hi16, 0, undef $vgpr514_hi16, 0, 0, 0, implicit $exec, implicit $mode @@ -50,7 +50,7 @@ body: | ; GCN-NEXT: v_add_f16_e64 v128.l /*v640.l*/, v129.l /*v641.l*/, v130.l /*v642.l*/ $vgpr640_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr641_lo16, 0, undef $vgpr642_lo16, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xcf + ; GCN-NEXT: s_set_vgpr_msb 0x8acf ; ASM-SAME: ; msbs: dst=3 src0=3 src1=3 src2=0 ; GCN-NEXT: v_add_f16_e64 v0.h /*v768.h*/, v1.h /*v769.h*/, v2.h /*v770.h*/ $vgpr768_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr769_hi16, 0, undef $vgpr770_hi16, 0, 0, 0, implicit $exec, implicit $mode diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir index 41a7b82913bb0..7e1c28f8e7bbb 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir @@ -22,13 +22,13 @@ body: | $vgpr257 = V_MOV_B32_e32 undef $vgpr510, implicit $exec ; Single bit change - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x4101 ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0 ; GCN-NEXT: v_rcp_f32_e64 v255, v2 /*v258*/ $vgpr255 = V_RCP_F32_e64 0, undef $vgpr258, 0, 0, implicit $exec, implicit $mode ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x100 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: v_rcp_f32_e64 v255, v1 $vgpr255 = V_RCP_F32_e64 0, undef $vgpr1, 0, 0, implicit $exec, implicit $mode @@ -40,7 +40,7 @@ body: | ; GCN-NEXT: v_add_nc_u32_e32 v0, v253 /*v509*/, v252 /*v508*/ $vgpr0 = V_ADD_U32_e32 undef 
$vgpr509, undef $vgpr508, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x544 ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_f32_e64 v2 /*v258*/, v0, v251 /*v507*/ @@ -48,7 +48,7 @@ body: | ; VOP3 - ; GCN-NEXT: s_set_vgpr_msb 0x55 + ; GCN-NEXT: s_set_vgpr_msb 0x4455 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1 ; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/ $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode @@ -58,32 +58,32 @@ body: | $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode ; Tuple crossing the 256 boundary - ; GCN-NEXT: s_set_vgpr_msb 17 + ; GCN-NEXT: s_set_vgpr_msb 0x5511 ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=1 ; GCN-NEXT: v_mqsad_u32_u8 v[254:257], v[2:3] /*v[258:259]*/, v0, v[244:247] /*v[500:503]*/ $vgpr254_vgpr255_vgpr256_vgpr257 = V_MQSAD_U32_U8_e64 $vgpr258_vgpr259, $vgpr0, undef $vgpr500_vgpr501_vgpr502_vgpr503, 0, implicit $exec ; DPP/tied operand - ; GCN-NEXT: s_set_vgpr_msb 0x45 + ; GCN-NEXT: s_set_vgpr_msb 0x1145 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_nc_u16_e64_dpp v0 /*v256*/, v1 /*v257*/, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 $vgpr256 = V_ADD_NC_U16_fake16_e64_dpp $vgpr256, 0, $vgpr257, 0, undef $vgpr258, 0, 0, 1, 15, 15, 1, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 17 + ; GCN-NEXT: s_set_vgpr_msb 0x4511 ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=1 ; GCN-NEXT: v_add3_u32_e64_dpp v0, v1 /*v257*/, v0, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 $vgpr0 = V_ADD3_U32_e64_dpp $vgpr0, $vgpr257, $vgpr0, undef $vgpr258, 1, 15, 15, 1, implicit $exec ; DS (addr, data0, and data1 operands) - ; GCN-NEXT: s_set_vgpr_msb 20 + ; GCN-NEXT: s_set_vgpr_msb 0x1114 ; 
ASM-SAME: ; msbs: dst=0 src0=0 src1=1 src2=1 ; GCN-NEXT: ds_store_2addr_b32 v0, v248 /*v504*/, v249 /*v505*/ offset1:1 DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr504, undef $vgpr505, 0, 1, 0, implicit $exec ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x1400 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: ds_store_2addr_b32 v0, v248, v249 offset1:1 DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr248, undef $vgpr249, 0, 1, 0, implicit $exec @@ -93,13 +93,13 @@ body: | ; GCN-NEXT: ds_load_b32 v0, v255 /*v511*/ $vgpr0 = DS_READ_B32_gfx9 $vgpr511, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x144 ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0 ; GCN-NEXT: ds_add_rtn_u32 v255 /*v511*/, v0, v248 /*v504*/ $vgpr511 = DS_ADD_RTN_U32_gfx9 $vgpr0, undef $vgpr504, 0, 0, implicit $exec ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4400 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: ds_add_rtn_u32 v0, v0, v0 $vgpr0 = DS_ADD_RTN_U32_gfx9 $vgpr0, $vgpr0, 0, 0, implicit $exec @@ -111,17 +111,17 @@ body: | ; GCN-NEXT: global_load_b32 v2, v[2:3] /*v[258:259]*/, off $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr258_vgpr259, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 64 + ; GCN-NEXT: s_set_vgpr_msb 0x140 ; ASM-SAME: ; msbs: dst=1 src0=0 src1=0 src2=0 ; GCN-NEXT: global_load_b32 v255 /*v511*/, v0, s[0:1] $vgpr511 = GLOBAL_LOAD_DWORD_SADDR undef $sgpr0_sgpr1, $vgpr0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x4001 ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0 ; GCN-NEXT: scratch_load_u8 v0, v255 /*v511*/, s0 $vgpr0 = SCRATCH_LOAD_UBYTE_SVS $vgpr511, undef $sgpr0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x100 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: global_store_b32 v[0:1], v2, off GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec @@ -135,13 +135,13 @@ body: | ; 
GCN-NEXT: global_store_b96 v[0:1] /*v[256:257]*/, v[244:246] /*v[500:502]*/, off GLOBAL_STORE_DWORDX3 $vgpr256_vgpr257, $vgpr500_vgpr501_vgpr502, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x544 ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0 ; GCN-NEXT: flat_atomic_add_u32 v254 /*v510*/, v[0:1], v255 /*v511*/ th:TH_ATOMIC_RETURN $vgpr510 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr511, 0, 1, implicit $exec, implicit $flat_scr ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4400 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: flat_atomic_add_u32 v0, v[0:1], v255 th:TH_ATOMIC_RETURN $vgpr0 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr255, 0, 1, implicit $exec, implicit $flat_scr @@ -156,12 +156,12 @@ body: | ; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0, s[8:11], s3 offen $vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr0, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; GCN-NEXT: s_set_vgpr_msb 0x4041 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=0 src2=0 ; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0 /*v256*/, s[8:11], s3 offen $vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr256, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4100 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: buffer_store_b32 v0, v1, s[0:3], s3 offen BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec @@ -171,7 +171,7 @@ body: | ; GCN-NEXT: buffer_store_b32 v0 /*v256*/, v1 /*v257*/, s[0:3], s3 offen BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr256, $vgpr257, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4100 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s3 offen 
BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, implicit $exec @@ -183,44 +183,44 @@ body: | ; VGPRs above 512 - ; GCN-NEXT: s_set_vgpr_msb 0xaa + ; GCN-NEXT: s_set_vgpr_msb 0x41aa ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=2 ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xab + ; GCN-NEXT: s_set_vgpr_msb 0xaaab ; ASM-SAME: ; msbs: dst=2 src0=3 src1=2 src2=2 ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v0 /*v768*/, v2 /*v514*/, v3 /*v515*/ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr768, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xae + ; GCN-NEXT: s_set_vgpr_msb 0xabae ; ASM-SAME: ; msbs: dst=2 src0=2 src1=3 src2=2 ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v770*/, v3 /*v515*/ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr770, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xba + ; GCN-NEXT: s_set_vgpr_msb 0xaeba ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=3 ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v771*/ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xea + ; GCN-NEXT: s_set_vgpr_msb 0xbaea ; ASM-SAME: ; msbs: dst=3 src0=2 src1=2 src2=2 ; GCN-NEXT: v_fma_f32 v255 /*v1023*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/ $vgpr1023 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xff + ; GCN-NEXT: s_set_vgpr_msb 0xeaff ; ASM-SAME: ; msbs: dst=3 src0=3 src1=3 src2=3 ; GCN-NEXT: v_fma_f32 v0 /*v768*/, v1 /*v769*/, v2 /*v770*/, v3 /*v771*/ $vgpr768 = V_FMA_F32_e64 0, undef $vgpr769, 0, undef $vgpr770, 0, undef $vgpr771, 0, 0, 
implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x42 + ; GCN-NEXT: s_set_vgpr_msb 0xff42 ; ASM-SAME: ; msbs: dst=1 src0=2 src1=0 src2=0 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0 /*v512*/ $vgpr256 = V_MOV_B32_e32 undef $vgpr512, implicit $exec ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4200 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: v_fma_f32 v0, v1, v2, v3 $vgpr0 = V_FMA_F32_e64 0, undef $vgpr1, 0, undef $vgpr2, 0, undef $vgpr3, 0, 0, implicit $exec, implicit $mode @@ -232,12 +232,12 @@ body: | ; GCN-NEXT: global_store_b96 v[0:1] /*v[512:513]*/, v[0:2] /*v[512:514]*/, off GLOBAL_STORE_DWORDX3 $vgpr512_vgpr513, $vgpr512_vgpr513_vgpr514, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 11 + ; GCN-NEXT: s_set_vgpr_msb 0xa0b ; ASM-SAME: ; msbs: dst=0 src0=3 src1=2 src2=0 ; GCN-NEXT: global_store_b64 v[254:255] /*v[1022:1023]*/, v[254:255] /*v[766:767]*/, off GLOBAL_STORE_DWORDX2 $vgpr1022_vgpr1023, $vgpr766_vgpr767, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x55 + ; GCN-NEXT: s_set_vgpr_msb 0xb55 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1 ; GCN-NEXT: v_wmma_f32_16x16x32_bf16 v[14:21] /*v[270:277]*/, v[26:33] /*v[282:289]*/, v[34:41] /*v[290:297]*/, v[14:21] /*v[270:277]*/ early-clobber $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, undef $vgpr282_vgpr283_vgpr284_vgpr285_vgpr286_vgpr287_vgpr288_vgpr289, 8, undef $vgpr290_vgpr291_vgpr292_vgpr293_vgpr294_vgpr295_vgpr296_vgpr297, 8, killed undef $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277, 0, 0, 0, 0, implicit $exec @@ -247,6 +247,7 @@ body: | ... 
# ASM-LABEL: {{^}}vopd: + # DIS-LABEL: : --- name: vopd @@ -262,35 +263,35 @@ body: | ; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, v1, v2 :: v_dual_mul_f32 v0 /*v256*/, v3, v4 $vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr2, undef $vgpr3, undef $vgpr4, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; GCN-NEXT: s_set_vgpr_msb 0x4041 ; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, s1, v2 :: v_dual_mul_f32 v0 /*v256*/, v44 /*v300*/, v4 $vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $sgpr1, undef $vgpr2, undef $vgpr300, undef $vgpr4, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 4 + ; GCN-NEXT: s_set_vgpr_msb 0x4104 ; GCN-NEXT: v_dual_sub_f32 v255, v1, v44 /*v300*/ :: v_dual_mul_f32 v6, v0, v1 /*v257*/ $vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr300, undef $vgpr0, $vgpr257, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x401 ; GCN-NEXT: v_dual_sub_f32 v255, 0, v1 :: v_dual_mul_f32 v6, v44 /*v300*/, v3 $vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 0, undef $vgpr1, undef $vgpr300, undef $vgpr3, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 64 + ; GCN-NEXT: s_set_vgpr_msb 0x140 ; GCN-NEXT: v_dual_fmamk_f32 v243 /*v499*/, v0, 0xa, v3 :: v_dual_fmac_f32 v0 /*v256*/, v1, v1 $vgpr499, $vgpr256 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr3, undef $vgpr1, undef $vgpr1, $vgpr256, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x4005 ; GCN-NEXT: v_dual_mov_b32 v2, v3 /*v259*/ :: v_dual_add_f32 v3, v1 /*v257*/, v2 /*v258*/ $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1250 undef $vgpr259, undef $vgpr257, undef $vgpr258, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x544 ; GCN-NEXT: v_dual_fmamk_f32 v244 /*v500*/, v0, 0xa, v44 /*v300*/ :: v_dual_fmac_f32 v3 
/*v259*/, v1, v1 /*v257*/ $vgpr500, $vgpr259 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr300, undef $vgpr1, undef $vgpr257, $vgpr259, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 16 + ; GCN-NEXT: s_set_vgpr_msb 0x4410 ; GCN-NEXT: v_dual_fma_f32 v0, v6, v6, v44 /*v300*/ :: v_dual_fma_f32 v1, v4, v5, v45 /*v301*/ $vgpr0, $vgpr1 = V_DUAL_FMA_F32_e64_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, 0, undef $vgpr300, 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $vgpr301, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x1000 ; GCN-NEXT: v_dual_fmac_f32 v2, v6, v6 :: v_dual_fma_f32 v3, v4, v5, v3 $vgpr2, $vgpr3 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, undef $vgpr2, 0, undef $vgpr4, 0, undef $vgpr5, 0, $vgpr3, implicit $mode, implicit $exec @@ -298,7 +299,7 @@ body: | ; GCN-NEXT: v_dual_fma_f32 v244 /*v500*/, v6, v7, v8 :: v_dual_add_f32 v3 /*v259*/, v4, v5 $vgpr500, $vgpr259 = V_DUAL_FMA_F32_e64_X_ADD_F32_e32_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr7, 0, undef $vgpr8, 0, undef $vgpr4, 0, undef $vgpr5, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0xae + ; GCN-NEXT: s_set_vgpr_msb 0x40ae ; GCN-NEXT: v_dual_fmac_f32 v2 /*v514*/, v6 /*v518*/, v8 /*v776*/ :: v_dual_fma_f32 v3 /*v515*/, v4 /*v516*/, v7 /*v775*/, v3 /*v515*/ $vgpr514, $vgpr515 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr518, 0, undef $vgpr776, undef $vgpr514, 0, undef $vgpr516, 0, undef $vgpr775, 0, $vgpr515, implicit $mode, implicit $exec @@ -319,31 +320,31 @@ body: | ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2 /*v258*/, 0x1 $vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x4505 ; GCN-NEXT: v_fmaak_f32 v0, v1 /*v257*/, v2 /*v258*/, 0x1 $vgpr0 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode - ; 
GCN-NEXT: s_set_vgpr_msb 0x41 + ; GCN-NEXT: s_set_vgpr_msb 0x541 ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2, 0x1 $vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr2, 1, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x4144 ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1, v2 /*v258*/, 0x1 $vgpr256 = V_FMAAK_F32 undef $vgpr1, undef $vgpr258, 1, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x45 + ; GCN-NEXT: s_set_vgpr_msb 0x4445 ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2 /*v258*/ $vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x4505 ; GCN-NEXT: v_fmamk_f32 v0, v1 /*v257*/, 0x1, v2 /*v258*/ $vgpr0 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; GCN-NEXT: s_set_vgpr_msb 0x541 ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2 $vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr2, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x4144 ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1, 0x1, v2 /*v258*/ $vgpr256 = V_FMAMK_F32 undef $vgpr1, 1, undef $vgpr258, implicit $exec, implicit $mode @@ -389,15 +390,15 @@ body: | ; GCN-NEXT: v_lshlrev_b32_e64 v0, v0 /*v256*/, v2 $vgpr0 = V_LSHLREV_B32_e64 undef $vgpr256, undef $vgpr2, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 4 + ; GCN-NEXT: s_set_vgpr_msb 0x104 ; GCN-NEXT: v_lshlrev_b32_e64 v0, v1, v0 /*v256*/ $vgpr0 = V_LSHLREV_B32_e64 undef $vgpr1, undef $vgpr256, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x401 ; GCN-NEXT: v_subrev_nc_u32_e32 v0, v0 /*v256*/, v2 $vgpr0 = V_SUBREV_U32_e32 undef $vgpr256, undef $vgpr2, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 4 + ; GCN-NEXT: s_set_vgpr_msb 0x104 ; GCN-NEXT: v_subrev_nc_u32_e32 v0, v1, v0 /*v256*/ $vgpr0 = V_SUBREV_U32_e32 undef $vgpr1, undef $vgpr256, 
implicit $exec @@ -417,7 +418,7 @@ body: | ; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/ $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x5500 ; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v2 $vgpr0 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec @@ -431,7 +432,7 @@ body: | ; GCN-NEXT: v_add_nc_u32_e32 v0 /*v256*/, v1, v2 $vgpr256 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: v_fma_f32 v3, v4, v5, s2 $vgpr3 = V_FMA_F32_e64 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $sgpr2, 0, 0, implicit $exec, implicit $mode @@ -439,17 +440,17 @@ body: | ; GCN-NEXT: v_fma_f32 v3, v4 /*v260*/, v5, 1 $vgpr3 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr5, 0, 1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 4 + ; GCN-NEXT: s_set_vgpr_msb 0x104 ; GCN-NEXT: v_mov_b32_e32 v0, v1 $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec ; GCN-NEXT: v_add_nc_u32_e32 v2, v1, v3 /*v259*/ $vgpr2 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr259, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x401 ; GCN-NEXT: v_mov_b32_e32 v0, v0 /*v256*/ ; GCN-NEXT: v_add_nc_u32_e32 v1, v1 /*v257*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x105 ; GCN-NEXT: v_add_nc_u32_e32 v2, v2 /*v258*/, v2 /*v258*/ $vgpr0 = V_MOV_B32_e32 undef $vgpr256, implicit $exec $vgpr1 = V_ADD_U32_e32 undef $vgpr257, undef $vgpr1, implicit $exec @@ -478,7 +479,7 @@ body: | ; ASM: .LBB{{.*_1}}: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec ; Reset on fallthrough block end @@ -487,7 +488,7 @@ body: | ; ASM-NEXT: %bb.2: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 
/*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_branch $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec S_BRANCH %bb.3 @@ -498,7 +499,7 @@ body: | ; ASM: .LBB{{.*_3}}: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_swap_pc_i64 $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec $exec = S_SWAPPC_B64 undef $sgpr0_sgpr1 @@ -520,7 +521,7 @@ body: | ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_set_pc_i64 $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec @@ -540,7 +541,7 @@ body: | ; ASM-NEXT: %bb.7: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; ASM-NEXT: ; return to shader part epilog $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec SI_RETURN_TO_EPILOG undef $vgpr0, implicit-def $exec @@ -558,7 +559,7 @@ body: | ; ASM-NEXT: %bb.9: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_set_pc_i64 $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec S_SETPC_B64_return undef $sgpr0_sgpr1, implicit-def $exec @@ -576,14 +577,14 @@ body: | ; ASM: %bb.0: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 $vgpr256 = V_MOV_B32_e32 undef $vgpr0, implicit $exec bb.1: ; ASM: .LBB{{[0-9]+}}_1: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_cbranch_scc0 $vgpr257 = V_MOV_B32_e32 undef $vgpr1, implicit $exec S_CBRANCH_SCC0 %bb.1, undef implicit $scc 
@@ -607,7 +608,7 @@ body: | ; ASM: %bb.0: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; ASM: def v0 ; GCN-NOT: s_set_vgpr_msb ; ASM: use v0 @@ -641,7 +642,7 @@ body: | ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 ; GCN-NEXT: s_nop 0 - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x4001 ; GCN-NEXT: v_mov_b32_e32 v1, v0 /*v256*/ BUNDLE implicit-def $vgpr256 { $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec @@ -683,7 +684,7 @@ body: | ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 BUNDLE implicit-def $vgpr256, implicit-def $vgpr257, implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr1 { @@ -712,7 +713,7 @@ body: | ; GCN-NEXT: s_clause 0x3e ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: v_mov_b32_e32 v1, v1 ; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-COUNT-60: v_mov_b32_e32 v1, v1 @@ -826,7 +827,7 @@ body: | ; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v1, v2 $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x500 ; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], 
v[100:115], v[100:115], v[10:17], v1, v2 $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec @@ -838,11 +839,11 @@ body: | ; GCN-NEXT: v_wmma_ld_scale16_paired_b64 v[0:1], v[2:3] V_WMMA_LD_SCALE16_PAIRED_B64 undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x105 ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v[0:1], v[2:3] $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x500 ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v[0:1], v[2:3] $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef 
$vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll index a42c8ac706d27..75817105e74fd 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll @@ -3182,7 +3182,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s33 offset:1592 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s33 offset:1596 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s33 offset:1600 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s33 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s33 offset:1608 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s33 offset:1612 @@ -3443,7 +3443,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s33 offset:2616 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s33 offset:2620 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s33 offset:2624 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s33 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s33 offset:2632 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s33 offset:2636 @@ 
-3706,7 +3706,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v1023*/, s33 offset:3648 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: v_writelane_b32 v40, s0, 3 @@ -4135,7 +4135,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s33 offset:1592 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s33 offset:1596 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s33 offset:1600 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s33 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s33 offset:1608 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s33 offset:1612 @@ -4396,7 +4396,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s33 offset:2616 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s33 offset:2620 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s33 offset:2624 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s33 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s33 offset:2632 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 
v2 /*v770*/, off, s33 offset:2636 @@ -4661,7 +4661,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s4 ; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s0 ; GFX1250-DAGISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[30:31] %ret = call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent ret <2 x half> %ret @@ -6346,7 +6346,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s32 offset:1588 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s32 offset:1592 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s32 offset:1596 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s32 offset:1600 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s32 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s32 offset:1608 @@ -6607,7 +6607,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s32 offset:2612 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s32 offset:2616 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s32 offset:2620 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s32 offset:2624 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s32 offset:2628 ; GFX1250-DAGISEL-NEXT: 
scratch_store_b32 off, v2 /*v770*/, s32 offset:2632 @@ -6872,7 +6872,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1 ; GFX1250-DAGISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX1250-DAGISEL-NEXT: s_mov_b64 s[36:37], gfx_callee@abs64 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: v_swap_b32 v0, v1 ; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 ; GFX1250-DAGISEL-NEXT: s_clause 0x3e @@ -7283,7 +7283,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s32 offset:1588 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s32 offset:1592 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s32 offset:1596 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s32 offset:1600 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s32 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s32 offset:1608 @@ -7544,7 +7544,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s32 offset:2612 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s32 offset:2616 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s32 offset:2620 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s32 offset:2624 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s32 offset:2628 ; GFX1250-DAGISEL-NEXT: 
scratch_load_b32 v2 /*v770*/, off, s32 offset:2632 @@ -7807,7 +7807,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v1023*/, off, s32 offset:3644 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s0 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[36:37] %ret = tail call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent ret <2 x half> %ret @@ -9657,7 +9657,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s33 offset:1600 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s33 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s33 offset:1608 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s33 offset:1612 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s33 offset:1616 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s33 offset:1620 @@ -9918,7 +9918,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s33 offset:2624 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s33 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s33 offset:2632 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s33 offset:2636 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, 
v1 /*v769*/, s33 offset:2640 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s33 offset:2644 @@ -10181,7 +10181,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v1023*/, s33 offset:3656 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: s_clause 0x2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v42, s33 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:164 @@ -10616,7 +10616,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s33 offset:1600 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s33 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s33 offset:1608 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s33 offset:1612 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s33 offset:1616 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s33 offset:1620 @@ -10877,7 +10877,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s33 offset:2624 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s33 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s33 offset:2632 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s33 offset:2636 ; 
GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s33 offset:2640 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s33 offset:2644 @@ -11142,7 +11142,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s4 ; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s0 ; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[30:31] %ret = call float(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, <8 x float> %x) convergent store float %ret, ptr %p From e3be57ec4dfe7e7c8636ca57da9e1508510948cc Mon Sep 17 00:00:00 2001 From: Ehsan Amiri Date: Fri, 31 Oct 2025 15:50:30 -0400 Subject: [PATCH 385/539] Revert "[DA] Check for overflow in strong SIV test" (#165905) Reverts llvm/llvm-project#164704 that broke several built bots. --- llvm/lib/Analysis/DependenceAnalysis.cpp | 19 +--- .../SimpleSIVNoValidityCheck.ll | 2 +- .../Analysis/DependenceAnalysis/StrongSIV.ll | 86 +++---------------- 3 files changed, 17 insertions(+), 90 deletions(-) diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index e45d1f79b3165..11d829492a10e 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -1587,15 +1587,6 @@ static const SCEV *minusSCEVNoSignedOverflow(const SCEV *A, const SCEV *B, return nullptr; } -/// Returns \p A * \p B if it guaranteed not to signed wrap. Otherwise returns -/// nullptr. \p A and \p B must have the same integer type. -static const SCEV *mulSCEVNoSignedOverflow(const SCEV *A, const SCEV *B, - ScalarEvolution &SE) { - if (SE.willNotOverflow(Instruction::Mul, /*Signed=*/true, A, B)) - return SE.getMulExpr(A, B); - return nullptr; -} - /// Returns the absolute value of \p A. 
In the context of dependence analysis, /// we need an absolute value in a mathematical sense. If \p A is the signed /// minimum value, we cannot represent it unless extending the original type. @@ -1695,11 +1686,7 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, assert(0 < Level && Level <= CommonLevels && "level out of range"); Level--; - const SCEV *Delta = minusSCEVNoSignedOverflow(SrcConst, DstConst, *SE); - if (!Delta) { - Result.Consistent = false; - return false; - } + const SCEV *Delta = SE->getMinusSCEV(SrcConst, DstConst); LLVM_DEBUG(dbgs() << "\t Delta = " << *Delta); LLVM_DEBUG(dbgs() << ", " << *Delta->getType() << "\n"); @@ -1715,9 +1702,7 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, const SCEV *AbsCoeff = absSCEVNoSignedOverflow(Coeff, *SE); if (!AbsDelta || !AbsCoeff) return false; - const SCEV *Product = mulSCEVNoSignedOverflow(UpperBound, AbsCoeff, *SE); - if (!Product) - return false; + const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff); return isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product); }(); if (IsDeltaLarge) { diff --git a/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll b/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll index 181a4494b036e..4346507ba8f90 100644 --- a/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll +++ b/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll @@ -210,7 +210,7 @@ define void @t3(i64 %n, i64 %m, i64 %lb, ptr %a) { ; CHECK-NEXT: Src: %2 = load i32, ptr %arrayidx6, align 4 --> Dst: %2 = load i32, ptr %arrayidx6, align 4 ; CHECK-NEXT: da analyze - none! ; CHECK-NEXT: Src: %2 = load i32, ptr %arrayidx6, align 4 --> Dst: store i32 %2, ptr %arrayidx8, align 4 -; CHECK-NEXT: da analyze - anti [1 *]! +; CHECK-NEXT: da analyze - consistent anti [1 -2]! 
; CHECK-NEXT: Src: store i32 %2, ptr %arrayidx8, align 4 --> Dst: store i32 %2, ptr %arrayidx8, align 4 ; CHECK-NEXT: da analyze - none! ; diff --git a/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll b/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll index 160196284f415..44bd9b7727910 100644 --- a/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll +++ b/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ALL -; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa -da-enable-dependence-test=strong-siv 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-STRONG-SIV +; RUN: | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.6.0" @@ -425,33 +423,19 @@ for.end: ; preds = %for.body ;; *B++ = A[i + 2*n]; define void @strong9(ptr %A, ptr %B, i64 %n) nounwind uwtable ssp { -; CHECK-ALL-LABEL: 'strong9' -; CHECK-ALL-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 -; CHECK-ALL-NEXT: da analyze - none! -; CHECK-ALL-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 -; CHECK-ALL-NEXT: da analyze - none! -; CHECK-ALL-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 -; CHECK-ALL-NEXT: da analyze - confused! -; CHECK-ALL-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 -; CHECK-ALL-NEXT: da analyze - none! 
-; CHECK-ALL-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 -; CHECK-ALL-NEXT: da analyze - confused! -; CHECK-ALL-NEXT: Src: store i32 %0, ptr %B.addr.02, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 -; CHECK-ALL-NEXT: da analyze - none! -; -; CHECK-STRONG-SIV-LABEL: 'strong9' -; CHECK-STRONG-SIV-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 -; CHECK-STRONG-SIV-NEXT: da analyze - none! -; CHECK-STRONG-SIV-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 -; CHECK-STRONG-SIV-NEXT: da analyze - flow [*|<]! -; CHECK-STRONG-SIV-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 -; CHECK-STRONG-SIV-NEXT: da analyze - confused! -; CHECK-STRONG-SIV-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 -; CHECK-STRONG-SIV-NEXT: da analyze - none! -; CHECK-STRONG-SIV-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 -; CHECK-STRONG-SIV-NEXT: da analyze - confused! -; CHECK-STRONG-SIV-NEXT: Src: store i32 %0, ptr %B.addr.02, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 -; CHECK-STRONG-SIV-NEXT: da analyze - none! +; CHECK-LABEL: 'strong9' +; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-NEXT: da analyze - none! +; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-NEXT: da analyze - none! +; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-NEXT: da analyze - confused! +; CHECK-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-NEXT: da analyze - none! 
+; CHECK-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-NEXT: da analyze - confused! +; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.02, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-NEXT: da analyze - none! ; entry: %cmp1 = icmp eq i64 %n, 0 @@ -528,45 +512,3 @@ for.body: ; preds = %entry, %for.body for.end: ; preds = %for.body ret void } - - -;; for (long unsigned i = 0; i < 9223372036854775806; i++) -;; for (long unsigned j = 0; j < 2147483640; j++) -;; if (i < 3000000000) -;; A[i] = 0; -; -; FIXME: DependenceAnalysis fails to detect the dependency between A[i] and -; itself, while Strong SIV has been able to prove it. -define void @strong11(ptr %A) nounwind uwtable ssp { -; CHECK-ALL-LABEL: 'strong11' -; CHECK-ALL-NEXT: Src: store i32 0, ptr %arrayidx, align 4 --> Dst: store i32 0, ptr %arrayidx, align 4 -; CHECK-ALL-NEXT: da analyze - none! -; -; CHECK-STRONG-SIV-LABEL: 'strong11' -; CHECK-STRONG-SIV-NEXT: Src: store i32 0, ptr %arrayidx, align 4 --> Dst: store i32 0, ptr %arrayidx, align 4 -; CHECK-STRONG-SIV-NEXT: da analyze - consistent output [0 S]! 
-; -entry: - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %entry, %for.cond.cleanup3 - %i.017 = phi i64 [ 0, %entry ], [ %inc8, %for.cond.cleanup3 ] - %cmp5 = icmp samesign ult i64 %i.017, 3000000000 - %arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %i.017 - br i1 %cmp5, label %for.body4.us, label %for.cond.cleanup3 - -for.body4.us: ; preds = %for.cond1.preheader, %for.body4.us - %j.016.us = phi i64 [ %inc.us, %for.body4.us ], [ 0, %for.cond1.preheader ] - store i32 0, ptr %arrayidx, align 4 - %inc.us = add nuw nsw i64 %j.016.us, 1 - %exitcond.not = icmp eq i64 %inc.us, 2147483640 - br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4.us - -for.cond.cleanup: ; preds = %for.cond.cleanup3 - ret void - -for.cond.cleanup3: ; preds = %for.body4.us, %for.cond1.preheader - %inc8 = add nuw nsw i64 %i.017, 1 - %exitcond19.not = icmp eq i64 %inc8, 9223372036854775806 - br i1 %exitcond19.not, label %for.cond.cleanup, label %for.cond1.preheader -} From cd66a76989a0a534259a613231a02f06c1d7e2fe Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 31 Oct 2025 12:58:36 -0700 Subject: [PATCH 386/539] ARM: Avoid doing strncmp on libcall name (#165203) Check if the default implementation is the aeabi impl directly. If getLibcallName returned null, this would crash. --- llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp index ebfa593fbe9e6..bf7c962f02efc 100644 --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -47,9 +47,7 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( // Only use a specialized AEABI function if the default version of this // Libcall is an AEABI function. - if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0) - return SDValue(); - + // // Translate RTLIB::Libcall to AEABILibcall. 
We only do this in order to be // able to translate memset to memclr and use the value to index the function // name array. @@ -61,12 +59,21 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( } AEABILibcall; switch (LC) { case RTLIB::MEMCPY: + if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memcpy) + return SDValue(); + AEABILibcall = AEABI_MEMCPY; break; case RTLIB::MEMMOVE: + if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memmove) + return SDValue(); + AEABILibcall = AEABI_MEMMOVE; break; case RTLIB::MEMSET: + if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memset) + return SDValue(); + AEABILibcall = AEABI_MEMSET; if (isNullConstant(Src)) AEABILibcall = AEABI_MEMCLR; From d4c02b445d5fdda5fc74bf358c3cc82491eb28b8 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 31 Oct 2025 13:01:23 -0700 Subject: [PATCH 387/539] [PowerPC] Remove a redundant cast (NFC) (#165834) PtrValue is already of type Value *. --- llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp index 3640d2545b5ac..70df59d01d6c7 100644 --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -1316,7 +1316,7 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) { // useless and possible to break some original well-form addressing mode // to make this pre-inc prep for it. 
if (PointerElementType->isIntegerTy(64)) { - const SCEV *LSCEV = SE->getSCEVAtScope(const_cast(PtrValue), L); + const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L); const SCEVAddRecExpr *LARSCEV = dyn_cast(LSCEV); if (!LARSCEV || LARSCEV->getLoop() != L) return false; From c4cb4a36bc98de01ea3744f178e8cd9acf81af48 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Fri, 31 Oct 2025 13:05:18 -0700 Subject: [PATCH 388/539] [SimplifyCFG] Propagate profile in `simplifySwitchOfPowersOfTwo` (#165804) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `simplifySwitchOfPowersOfTwo`​ converts (when applicable, see `00f5a1e30b`​) a switch to a conditional branch. Its false case goes to the `default`​ target of the former switch, and the true case goes to a BB performing a `cttz`​. We can calculate the branch weights from the branch weights of the old switch. Issue #147390 --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 27 +++++++++++++++++++ .../X86/switch-of-powers-of-two.ll | 27 ++++++++++++++----- llvm/utils/profcheck-xfail.txt | 2 -- 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 7f6d779687e94..6addcfab15125 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -80,6 +80,7 @@ #include #include #include +#include #include #include #include @@ -7632,7 +7633,33 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder, auto *DefaultCaseBB = SI->getDefaultDest(); BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU); auto It = OrigBB->getTerminator()->getIterator(); + SmallVector Weights; + auto HasWeights = + !ProfcheckDisableMetadataFixes && extractBranchWeights(*SI, Weights); auto *BI = BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It); + if (HasWeights && any_of(Weights, [](const auto &V) { return V != 0; })) { + // IsPow2 covers a subset of the cases in 
which we'd go to the default + // label. The other is those powers of 2 that don't appear in the case + // statement. We don't know the distribution of the values coming in, so + // the safest is to split 50-50 the original probability to `default`. + uint64_t OrigDenominator = sum_of(map_range( + Weights, [](const auto &V) { return static_cast(V); })); + SmallVector NewWeights(2); + NewWeights[1] = Weights[0] / 2; + NewWeights[0] = OrigDenominator - NewWeights[1]; + setFittedBranchWeights(*BI, NewWeights, /*IsExpected=*/false); + + // For the original switch, we reduce the weight of the default by the + // amount by which the previous branch contributes to getting to default, + // and then make sure the remaining weights have the same relative ratio + // wrt eachother. + uint64_t CasesDenominator = OrigDenominator - Weights[0]; + Weights[0] /= 2; + for (auto &W : drop_begin(Weights)) + W = NewWeights[0] * static_cast(W) / CasesDenominator; + + setBranchWeights(*SI, Weights, /*IsExpected=*/false); + } // BI is handling the default case for SI, and so should share its DebugLoc. BI->setDebugLoc(SI->getDebugLoc()); It->eraseFromParent(); diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll index aa95b3fd235e5..d818335f075e5 100644 --- a/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll +++ b/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll @@ -1,8 +1,13 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt -passes='simplifycfg' -simplifycfg-require-and-preserve-domtree=1 -S < %s | FileCheck %s target triple = "x86_64-unknown-linux-gnu" +;. 
+; CHECK: @switch.table.switch_of_powers_two = private unnamed_addr constant [7 x i32] [i32 3, i32 poison, i32 poison, i32 2, i32 1, i32 0, i32 42], align 4 +; CHECK: @switch.table.switch_of_powers_two_default_reachable = private unnamed_addr constant [7 x i32] [i32 3, i32 5, i32 5, i32 2, i32 1, i32 0, i32 42], align 4 +; CHECK: @switch.table.switch_of_powers_two_default_reachable_multipreds = private unnamed_addr constant [7 x i32] [i32 3, i32 poison, i32 poison, i32 2, i32 1, i32 0, i32 42], align 4 +;. define i32 @switch_of_powers_two(i32 %arg) { ; CHECK-LABEL: define i32 @switch_of_powers_two( ; CHECK-SAME: i32 [[ARG:%.*]]) { @@ -35,17 +40,17 @@ return: ret i32 %phi } -define i32 @switch_of_powers_two_default_reachable(i32 %arg) { +define i32 @switch_of_powers_two_default_reachable(i32 %arg) !prof !0 { ; CHECK-LABEL: define i32 @switch_of_powers_two_default_reachable( -; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-SAME: i32 [[ARG:%.*]]) !prof [[PROF0:![0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[ARG]]) ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 1 -; CHECK-NEXT: br i1 [[TMP1]], label %[[ENTRY_SPLIT:.*]], label %[[RETURN:.*]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[ENTRY_SPLIT:.*]], label %[[RETURN:.*]], !prof [[PROF1:![0-9]+]] ; CHECK: [[ENTRY_SPLIT]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[ARG]], i1 true) ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 7 -; CHECK-NEXT: br i1 [[TMP3]], label %[[SWITCH_LOOKUP:.*]], label %[[RETURN]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[SWITCH_LOOKUP:.*]], label %[[RETURN]], !prof [[PROF2:![0-9]+]] ; CHECK: [[SWITCH_LOOKUP]]: ; CHECK-NEXT: [[TMP4:%.*]] = zext nneg i32 [[TMP2]] to i64 ; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [7 x i32], ptr @switch.table.switch_of_powers_two_default_reachable, i64 0, i64 [[TMP4]] @@ -62,7 +67,7 @@ entry: i32 16, label %bb3 i32 32, label %bb4 i32 64, label %bb5 - ] + ], !prof !1 default_case: br 
label %return bb1: br label %return @@ -128,3 +133,13 @@ return: %phi = phi i32 [ 3, %bb1 ], [ 2, %bb2 ], [ 1, %bb3 ], [ 0, %bb4 ], [ 42, %bb5 ], [ %pn, %default_case ] ret i32 %phi } + +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 10, i32 5, i32 7, i32 11, i32 13, i32 17} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;. +; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 58, i32 5} +; CHECK: [[PROF2]] = !{!"branch_weights", i32 56, i32 5} +;. diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index aef7c0987fda7..83bffc70574a8 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -1317,8 +1317,6 @@ Transforms/SimpleLoopUnswitch/pr60736.ll Transforms/SimpleLoopUnswitch/trivial-unswitch-freeze-individual-conditions.ll Transforms/SimpleLoopUnswitch/trivial-unswitch.ll Transforms/SimpleLoopUnswitch/trivial-unswitch-logical-and-or.ll -Transforms/SimplifyCFG/RISCV/switch-of-powers-of-two.ll -Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll Transforms/StructurizeCFG/hoist-zerocost.ll From a722366b7cf6a5b99a0e22b8024f23e62eb3e6c3 Mon Sep 17 00:00:00 2001 From: Piyush Jaiswal Date: Fri, 31 Oct 2025 13:06:36 -0700 Subject: [PATCH 389/539] [lldb] Refactor LLDB Breakpoint Event Notifications to centralize and eliminate code duplication (#164739) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Summary This PR refactors breakpoint event notification in LLDB to centralize and eliminate code duplication. It creates a unified method in the `Target` class for sending breakpoint change events. 
The new methods check if listeners exist before broadcasting events ### Test Screenshot 2025-10-23 at 12 49 31 PM --------- Co-authored-by: Piyush Jaiswal --- lldb/include/lldb/Target/Target.h | 7 +++++++ lldb/source/Breakpoint/Breakpoint.cpp | 17 +++++------------ lldb/source/Breakpoint/BreakpointList.cpp | 8 +------- lldb/source/Breakpoint/BreakpointLocation.cpp | 6 ++---- lldb/source/Target/Target.cpp | 17 +++++++++++++++++ 5 files changed, 32 insertions(+), 23 deletions(-) diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h index c375df248154f..40f9c9bea1c12 100644 --- a/lldb/include/lldb/Target/Target.h +++ b/lldb/include/lldb/Target/Target.h @@ -1346,6 +1346,13 @@ class Target : public std::enable_shared_from_this, const lldb_private::RegisterFlags &flags, uint32_t byte_size); + /// Sends a breakpoint notification event. + void NotifyBreakpointChanged(Breakpoint &bp, + lldb::BreakpointEventType event_kind); + /// Sends a breakpoint notification event. 
+ void NotifyBreakpointChanged(Breakpoint &bp, + const lldb::EventDataSP &breakpoint_data_sp); + llvm::Expected ReadInstructions(const Address &start_addr, uint32_t count, const char *flavor_string = nullptr); diff --git a/lldb/source/Breakpoint/Breakpoint.cpp b/lldb/source/Breakpoint/Breakpoint.cpp index b23d1143d60c4..201d8d20c4901 100644 --- a/lldb/source/Breakpoint/Breakpoint.cpp +++ b/lldb/source/Breakpoint/Breakpoint.cpp @@ -1098,14 +1098,9 @@ bool Breakpoint::EvaluatePrecondition(StoppointCallbackContext &context) { } void Breakpoint::SendBreakpointChangedEvent( - lldb::BreakpointEventType eventKind) { - if (!IsInternal() && GetTarget().EventTypeHasListeners( - Target::eBroadcastBitBreakpointChanged)) { - std::shared_ptr data = - std::make_shared(eventKind, shared_from_this()); - - GetTarget().BroadcastEvent(Target::eBroadcastBitBreakpointChanged, data); - } + lldb::BreakpointEventType event_kind) { + if (!IsInternal()) + GetTarget().NotifyBreakpointChanged(*this, event_kind); } void Breakpoint::SendBreakpointChangedEvent( @@ -1113,10 +1108,8 @@ void Breakpoint::SendBreakpointChangedEvent( if (!breakpoint_data_sp) return; - if (!IsInternal() && - GetTarget().EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged)) - GetTarget().BroadcastEvent(Target::eBroadcastBitBreakpointChanged, - breakpoint_data_sp); + if (!IsInternal()) + GetTarget().NotifyBreakpointChanged(*this, breakpoint_data_sp); } const char *Breakpoint::BreakpointEventTypeAsCString(BreakpointEventType type) { diff --git a/lldb/source/Breakpoint/BreakpointList.cpp b/lldb/source/Breakpoint/BreakpointList.cpp index 779490ae0316a..e3dd62bfa329d 100644 --- a/lldb/source/Breakpoint/BreakpointList.cpp +++ b/lldb/source/Breakpoint/BreakpointList.cpp @@ -16,13 +16,7 @@ using namespace lldb; using namespace lldb_private; static void NotifyChange(const BreakpointSP &bp, BreakpointEventType event) { - Target &target = bp->GetTarget(); - if 
(target.EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged)) { - auto event_data_sp = - std::make_shared(event, bp); - target.BroadcastEvent(Target::eBroadcastBitBreakpointChanged, - event_data_sp); - } + bp->GetTarget().NotifyBreakpointChanged(*bp, event); } BreakpointList::BreakpointList(bool is_internal) diff --git a/lldb/source/Breakpoint/BreakpointLocation.cpp b/lldb/source/Breakpoint/BreakpointLocation.cpp index 22c98acda8c59..f25209c15e007 100644 --- a/lldb/source/Breakpoint/BreakpointLocation.cpp +++ b/lldb/source/Breakpoint/BreakpointLocation.cpp @@ -749,13 +749,11 @@ void BreakpointLocation::Dump(Stream *s) const { void BreakpointLocation::SendBreakpointLocationChangedEvent( lldb::BreakpointEventType eventKind) { - if (!m_owner.IsInternal() && m_owner.GetTarget().EventTypeHasListeners( - Target::eBroadcastBitBreakpointChanged)) { + if (!m_owner.IsInternal()) { auto data_sp = std::make_shared( eventKind, m_owner.shared_from_this()); data_sp->GetBreakpointLocationCollection().Add(shared_from_this()); - m_owner.GetTarget().BroadcastEvent(Target::eBroadcastBitBreakpointChanged, - data_sp); + m_owner.GetTarget().NotifyBreakpointChanged(m_owner, data_sp); } } diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index d070c3d953d4a..1e43094421f0a 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "lldb/Target/Target.h" +#include "lldb/Breakpoint/Breakpoint.h" #include "lldb/Breakpoint/BreakpointIDList.h" #include "lldb/Breakpoint/BreakpointPrecondition.h" #include "lldb/Breakpoint/BreakpointResolver.h" @@ -5271,3 +5272,19 @@ void Target::ClearSectionLoadList() { GetSectionLoadList().Clear(); } void Target::DumpSectionLoadList(Stream &s) { GetSectionLoadList().Dump(s, this); } + +void Target::NotifyBreakpointChanged(Breakpoint &bp, + lldb::BreakpointEventType eventKind) { + if 
(EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged)) { + std::shared_ptr data_sp = + std::make_shared( + eventKind, bp.shared_from_this()); + BroadcastEvent(Target::eBroadcastBitBreakpointChanged, data_sp); + } +} + +void Target::NotifyBreakpointChanged( + Breakpoint &bp, const lldb::EventDataSP &breakpoint_data_sp) { + if (EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged)) + BroadcastEvent(Target::eBroadcastBitBreakpointChanged, breakpoint_data_sp); +} From 95193e3c271bc89ee2bc7c5dc3405c0f4d8382bc Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 31 Oct 2025 13:10:09 -0700 Subject: [PATCH 390/539] [lldb] Emit a progress event from the source manager (#165802) Reading a source file might take a while, for example because it's located on a virtual file system that's fetching the data on demand. This PR emits a progress event to convey this to the user when reading the file exceeds a certain threshold (500ms). Although it doesn't speed up the operation, it still greatly improves the user experience by helping them understand what's going on. 
rdar://163750392 --- lldb/include/lldb/Core/SourceManager.h | 2 ++ lldb/source/Core/SourceManager.cpp | 26 ++++++++++++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/lldb/include/lldb/Core/SourceManager.h b/lldb/include/lldb/Core/SourceManager.h index 1244291596b73..83dc74768733d 100644 --- a/lldb/include/lldb/Core/SourceManager.h +++ b/lldb/include/lldb/Core/SourceManager.h @@ -109,6 +109,8 @@ class SourceManager { private: void CommonInitializer(lldb::SupportFileSP support_file_sp, lldb::TargetSP target_sp); + void CommonInitializerImpl(lldb::SupportFileSP support_file_sp, + lldb::TargetSP target_sp); }; typedef std::shared_ptr FileSP; diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp index f786866a18137..097173ffe678e 100644 --- a/lldb/source/Core/SourceManager.cpp +++ b/lldb/source/Core/SourceManager.cpp @@ -34,6 +34,7 @@ #include "llvm/ADT/Twine.h" +#include #include #include #include @@ -54,8 +55,7 @@ using namespace lldb_private; static inline bool is_newline_char(char ch) { return ch == '\n' || ch == '\r'; } static void resolve_tilde(FileSpec &file_spec) { - if (!FileSystem::Instance().Exists(file_spec) && - file_spec.GetDirectory() && + if (!FileSystem::Instance().Exists(file_spec) && file_spec.GetDirectory() && file_spec.GetDirectory().GetCString()[0] == '~') { FileSystem::Instance().Resolve(file_spec); } @@ -477,6 +477,28 @@ SourceManager::File::File(SupportFileSP support_file_sp, TargetSP target_sp) void SourceManager::File::CommonInitializer(SupportFileSP support_file_sp, TargetSP target_sp) { + // It might take a while to read a source file, for example because it's + // coming from a virtual file system that's fetching the data on demand. When + // reading the data exceeds a certain threshold, show a progress event to let + // the user know what's going on. 
+ static constexpr auto g_progress_delay = std::chrono::milliseconds(500); + + std::future<void> future = std::async(std::launch::async, [=]() { + CommonInitializerImpl(support_file_sp, target_sp); + }); + + std::optional<Progress> progress; + if (future.wait_for(g_progress_delay) == std::future_status::timeout) { + Debugger *debugger = target_sp ? &target_sp->GetDebugger() : nullptr; + progress.emplace("Loading source file", + support_file_sp->GetSpecOnly().GetFilename().GetString(), + 1, debugger); + } + future.wait(); +} + +void SourceManager::File::CommonInitializerImpl(SupportFileSP support_file_sp, + TargetSP target_sp) { // Set the file and update the modification time. SetSupportFile(support_file_sp); From bd63cfbc1ed273161e73f0e832a1a7d0686ea99c Mon Sep 17 00:00:00 2001 From: vangthao95 Date: Fri, 31 Oct 2025 13:17:44 -0700 Subject: [PATCH 391/539] [AMDGPU][GlobalISel] Clean up selectCOPY_SCC_VCC function (#165797) Follow-up patch to address the comments in https://github.com/llvm/llvm-project/pull/165355. --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 5 ++--- .../CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll | 6 +++--- .../CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir | 6 +++--- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index aed325cf627bc..0c977416f1793 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -224,13 +224,12 @@ bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const { Register VCCReg = I.getOperand(1).getReg(); MachineInstr *Cmp; - if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Set SCC as a side effect with S_CMP or S_OR. + if (STI.hasScalarCompareEq64()) { unsigned CmpOpc = STI.isWave64() ? 
AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0); } else { - // For gfx7 and earlier, S_CMP_LG_U64 doesn't exist, so we use S_OR_B64 - // which sets SCC as a side effect. Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst) .addReg(VCCReg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll index 1a7ccf0835686..588802cbd56c7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr addrspace(1) %out) { ; GFX7-LABEL: fcmp_uniform_select: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir index 67cc0169af619..b6652f605be19 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX7 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GF8 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX7 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GF8 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX11 %s --- name: test_copy_scc_vcc From f2bf3a4d8a8cf741857dff98780ac8f9e2643326 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 31 Oct 2025 13:13:58 -0700 Subject: [PATCH 392/539] [SLP][NFC]Add a test with the incorrect minbitwidth in alternate nodes, NFC --- ...ernate-opcode-strict-bitwidth-than-main.ll | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll new file mode 100644 index 0000000000000..959b2350d9d78 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define float @test(i8 %0) { +; 
CHECK-LABEL: define float @test( +; CHECK-SAME: i8 [[TMP0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> , i8 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: switch i32 [[TMP8]], label %[[EXIT:.*]] [ +; CHECK-NEXT: i32 0, label %[[EXIT]] +; CHECK-NEXT: i32 1, label %[[EXIT]] +; CHECK-NEXT: ] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret float 0.000000e+00 +; +entry: + %1 = sext i8 0 to i32 + %2 = lshr i32 %1, 27 + %3 = sext i8 %0 to i32 + %reass.add.epil = mul i32 %3, 2 + %4 = or i32 %reass.add.epil, %2 + switch i32 %4, label %exit [ + i32 0, label %exit + i32 1, label %exit + ] + +exit: + ret float 0.000000e+00 +} From 33892e561da0f13d797698314181e9e1d480734e Mon Sep 17 00:00:00 2001 From: Razvan Lupusoru Date: Fri, 31 Oct 2025 13:24:51 -0700 Subject: [PATCH 393/539] [acc][flang] Implement OpenACC interface for partial entity accesses (#165911) For OpenACC clause ordering, such as maintaining appropriate parent-child relationship ordering, we need to be able to walk references back to their base entities. This introduces the operation interface in the `acc` dialect named `PartialEntityAccessOpInterface` which can be used for this purpose. 
The interface provides two methods: - `getBaseEntity()`: Returns the base entity being accessed - `isCompleteView()`: Indicates whether the access covers the complete entity to allow this interface to be attached to cases that only conditionally offer a partial view This also adds a utility function `mlir::acc::getBaseEntity()` that uses this interface to retrieve the base entity from a value. This work has some similarities with the ViewLikeOpInterface proposal for FIR: https://github.com/llvm/llvm-project/pull/164020 but it differs in the following ways: - Attached only to operations where we can assume a partial entity access - Includes fir.declare operations due to common block storage associations Tests are included that demonstrate the interface on memref.subview operations, implemented locally in the test since memref operations already have ViewLikeOpInterface for similar purposes. --- .../include/flang/Optimizer/Dialect/FIROps.h | 1 + .../include/flang/Optimizer/Dialect/FIROps.td | 5 +- .../OpenACC/Support/FIROpenACCOpsInterfaces.h | 58 ++++++++++ .../Optimizer/OpenACC/Support/CMakeLists.txt | 1 + .../Support/FIROpenACCOpsInterfaces.cpp | 62 +++++++++++ .../Support/RegisterOpenACCExtensions.cpp | 22 ++++ .../Dialect/OpenACC/OpenACCOpsInterfaces.td | 18 +++ .../mlir/Dialect/OpenACC/OpenACCUtils.h | 5 + .../Dialect/OpenACC/Utils/OpenACCUtils.cpp | 10 ++ .../Dialect/OpenACC/OpenACCUtilsTest.cpp | 104 ++++++++++++++++++ 10 files changed, 285 insertions(+), 1 deletion(-) create mode 100644 flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h create mode 100644 flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.h b/flang/include/flang/Optimizer/Dialect/FIROps.h index 62ef8b4b502f2..4651f2bb8038e 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.h +++ b/flang/include/flang/Optimizer/Dialect/FIROps.h @@ -20,6 +20,7 @@ #include "mlir/Dialect/LLVMIR/LLVMAttrs.h" 
#include "mlir/Interfaces/LoopLikeInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Interfaces/ViewLikeInterface.h" namespace fir { diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index 58a317cf5d691..bae52d63fda45 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -17,6 +17,7 @@ include "mlir/Dialect/Arith/IR/ArithBase.td" include "mlir/Dialect/Arith/IR/ArithOpsInterfaces.td" include "mlir/Dialect/LLVMIR/LLVMAttrDefs.td" +include "mlir/Interfaces/ViewLikeInterface.td" include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td" include "flang/Optimizer/Dialect/FIRDialect.td" include "flang/Optimizer/Dialect/FIRTypes.td" @@ -2828,7 +2829,8 @@ def fir_VolatileCastOp : fir_SimpleOneResultOp<"volatile_cast", [Pure]> { let hasFolder = 1; } -def fir_ConvertOp : fir_SimpleOneResultOp<"convert", [NoMemoryEffect]> { +def fir_ConvertOp + : fir_SimpleOneResultOp<"convert", [NoMemoryEffect, ViewLikeOpInterface]> { let summary = "encapsulates all Fortran entity type conversions"; let description = [{ @@ -2866,6 +2868,7 @@ def fir_ConvertOp : fir_SimpleOneResultOp<"convert", [NoMemoryEffect]> { static bool isPointerCompatible(mlir::Type ty); static bool canBeConverted(mlir::Type inType, mlir::Type outType); static bool areVectorsCompatible(mlir::Type inTy, mlir::Type outTy); + mlir::Value getViewSource() { return getValue(); } }]; let hasCanonicalizer = 1; } diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h new file mode 100644 index 0000000000000..7afe97aac57e8 --- /dev/null +++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h @@ -0,0 +1,58 @@ +//===- FIROpenACCOpsInterfaces.h --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 
with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains external operation interfaces for FIR. +// +//===----------------------------------------------------------------------===// + +#ifndef FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_ +#define FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_ + +#include "mlir/Dialect/OpenACC/OpenACC.h" + +namespace fir { +class DeclareOp; +} // namespace fir + +namespace hlfir { +class DeclareOp; +class DesignateOp; +} // namespace hlfir + +namespace fir::acc { + +template +struct PartialEntityAccessModel + : public mlir::acc::PartialEntityAccessOpInterface::ExternalModel< + PartialEntityAccessModel, Op> { + mlir::Value getBaseEntity(mlir::Operation *op) const; + + // Default implementation - returns false (partial view) + bool isCompleteView(mlir::Operation *op) const { return false; } +}; + +// Full specializations for declare operations +template <> +struct PartialEntityAccessModel + : public mlir::acc::PartialEntityAccessOpInterface::ExternalModel< + PartialEntityAccessModel, fir::DeclareOp> { + mlir::Value getBaseEntity(mlir::Operation *op) const; + bool isCompleteView(mlir::Operation *op) const; +}; + +template <> +struct PartialEntityAccessModel + : public mlir::acc::PartialEntityAccessOpInterface::ExternalModel< + PartialEntityAccessModel, hlfir::DeclareOp> { + mlir::Value getBaseEntity(mlir::Operation *op) const; + bool isCompleteView(mlir::Operation *op) const; +}; + +} // namespace fir::acc + +#endif // FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_ diff --git a/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt index ef67ab1549537..898fb00d41dfe 100644 --- a/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt +++ 
b/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt @@ -2,6 +2,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_flang_library(FIROpenACCSupport FIROpenACCAttributes.cpp + FIROpenACCOpsInterfaces.cpp FIROpenACCTypeInterfaces.cpp RegisterOpenACCExtensions.cpp diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp new file mode 100644 index 0000000000000..c1734be5185f4 --- /dev/null +++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp @@ -0,0 +1,62 @@ +//===-- FIROpenACCOpsInterfaces.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of external operation interfaces for FIR. 
+// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h" + +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" + +namespace fir::acc { + +template <> +mlir::Value PartialEntityAccessModel::getBaseEntity( + mlir::Operation *op) const { + return mlir::cast(op).getMemref(); +} + +template <> +mlir::Value PartialEntityAccessModel::getBaseEntity( + mlir::Operation *op) const { + return mlir::cast(op).getRef(); +} + +template <> +mlir::Value PartialEntityAccessModel::getBaseEntity( + mlir::Operation *op) const { + return mlir::cast(op).getMemref(); +} + +mlir::Value PartialEntityAccessModel::getBaseEntity( + mlir::Operation *op) const { + return mlir::cast(op).getStorage(); +} + +bool PartialEntityAccessModel::isCompleteView( + mlir::Operation *op) const { + // Return false (partial view) only if storage is present + // Return true (complete view) if storage is absent + return !getBaseEntity(op); +} + +mlir::Value PartialEntityAccessModel::getBaseEntity( + mlir::Operation *op) const { + return mlir::cast(op).getStorage(); +} + +bool PartialEntityAccessModel::isCompleteView( + mlir::Operation *op) const { + // Return false (partial view) only if storage is present + // Return true (complete view) if storage is absent + return !getBaseEntity(op); +} + +} // namespace fir::acc diff --git a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp index 717bf344e40aa..d71c40dfac03c 100644 --- a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp @@ -11,8 +11,13 @@ //===----------------------------------------------------------------------===// #include "flang/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.h" + #include "flang/Optimizer/Dialect/FIRDialect.h" +#include 
"flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/HLFIR/HLFIRDialect.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" +#include "flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h" #include "flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h" namespace fir::acc { @@ -37,7 +42,24 @@ void registerOpenACCExtensions(mlir::DialectRegistry ®istry) { fir::LLVMPointerType::attachInterface< OpenACCPointerLikeModel>(*ctx); + + fir::ArrayCoorOp::attachInterface< + PartialEntityAccessModel>(*ctx); + fir::CoordinateOp::attachInterface< + PartialEntityAccessModel>(*ctx); + fir::DeclareOp::attachInterface>( + *ctx); }); + + // Register HLFIR operation interfaces + registry.addExtension( + +[](mlir::MLIRContext *ctx, hlfir::hlfirDialect *dialect) { + hlfir::DesignateOp::attachInterface< + PartialEntityAccessModel>(*ctx); + hlfir::DeclareOp::attachInterface< + PartialEntityAccessModel>(*ctx); + }); + registerAttrsExtensions(registry); } diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td index 6fb9a950489f8..054c13a88a552 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td @@ -26,4 +26,22 @@ def ComputeRegionOpInterface : OpInterface<"ComputeRegionOpInterface"> { ]; } +def PartialEntityAccessOpInterface : OpInterface<"PartialEntityAccessOpInterface"> { + let cppNamespace = "::mlir::acc"; + + let description = [{ + An interface for operations that access a partial entity such as + field or array element access. 
+ }]; + + let methods = [ + InterfaceMethod<"Get the base entity being accessed", "::mlir::Value", + "getBaseEntity", (ins)>, + InterfaceMethod<"Check if this is a complete view of the entity", "bool", + "isCompleteView", (ins), [{ + return false; + }]>, + ]; +} + #endif // OPENACC_OPS_INTERFACES diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h index 563c1e0099fc0..964735755c4a3 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h @@ -47,6 +47,11 @@ std::string getVariableName(mlir::Value v); /// Returns an empty string if not possible to generate a recipe name. std::string getRecipeName(mlir::acc::RecipeKind kind, mlir::Type type); +// Get the base entity from partial entity access. This is used for getting +// the base `struct` from an operation that only accesses a field or the +// base `array` from an operation that only accesses a subarray. +mlir::Value getBaseEntity(mlir::Value val); + } // namespace acc } // namespace mlir diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp index 660c3138af0ec..fbac28e740750 100644 --- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp +++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp @@ -145,3 +145,13 @@ std::string mlir::acc::getRecipeName(mlir::acc::RecipeKind kind, return recipeName; } + +mlir::Value mlir::acc::getBaseEntity(mlir::Value val) { + if (auto partialEntityAccessOp = + dyn_cast(val.getDefiningOp())) { + if (!partialEntityAccessOp.isCompleteView()) + return partialEntityAccessOp.getBaseEntity(); + } + + return val; +} diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp index f1fe53c15a6f5..6f4e30585b2c9 100644 --- a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp +++ b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp @@ -570,3 +570,107 @@ 
TEST_F(OpenACCUtilsTest, getRecipeNamePrivateUnrankedMemref) { getRecipeName(RecipeKind::private_recipe, unrankedMemrefTy); EXPECT_EQ(recipeName, "privatization_memref_Zxi32_"); } + +//===----------------------------------------------------------------------===// +// getBaseEntity Tests +//===----------------------------------------------------------------------===// + +// Local implementation of PartialEntityAccessOpInterface for memref.subview. +// This is implemented locally in the test rather than officially because memref +// operations already have ViewLikeOpInterface, which serves a similar purpose +// for walking through views to the base entity. This test demonstrates how +// getBaseEntity() would work if the interface were attached to memref.subview. +namespace { +struct SubViewOpPartialEntityAccessOpInterface + : public acc::PartialEntityAccessOpInterface::ExternalModel< + SubViewOpPartialEntityAccessOpInterface, memref::SubViewOp> { + Value getBaseEntity(Operation *op) const { + auto subviewOp = cast(op); + return subviewOp.getSource(); + } + + bool isCompleteView(Operation *op) const { + // For testing purposes, we'll consider it a partial view (return false). + // The real implementation would need to look at the offsets. 
+ return false; + } +}; +} // namespace + +TEST_F(OpenACCUtilsTest, getBaseEntityFromSubview) { + // Register the local interface implementation for memref.subview + memref::SubViewOp::attachInterface( + context); + + // Create a base memref + auto memrefTy = MemRefType::get({10, 20}, b.getF32Type()); + OwningOpRef allocOp = + memref::AllocaOp::create(b, loc, memrefTy); + Value baseMemref = allocOp->getResult(); + + // Create a subview of the base memref with non-zero offsets + // This creates a 5x10 view starting at [2, 3] in the original 10x20 memref + SmallVector offsets = {b.getIndexAttr(2), b.getIndexAttr(3)}; + SmallVector sizes = {b.getIndexAttr(5), b.getIndexAttr(10)}; + SmallVector strides = {b.getIndexAttr(1), b.getIndexAttr(1)}; + + OwningOpRef subviewOp = + memref::SubViewOp::create(b, loc, baseMemref, offsets, sizes, strides); + Value subview = subviewOp->getResult(); + + // Test that getBaseEntity returns the base memref, not the subview + Value baseEntity = getBaseEntity(subview); + EXPECT_EQ(baseEntity, baseMemref); +} + +TEST_F(OpenACCUtilsTest, getBaseEntityNoInterface) { + // Create a memref without the interface + auto memrefTy = MemRefType::get({10}, b.getI32Type()); + OwningOpRef allocOp = + memref::AllocaOp::create(b, loc, memrefTy); + Value varPtr = allocOp->getResult(); + + // Test that getBaseEntity returns the value itself when there's no interface + Value baseEntity = getBaseEntity(varPtr); + EXPECT_EQ(baseEntity, varPtr); +} + +TEST_F(OpenACCUtilsTest, getBaseEntityChainedSubviews) { + // Register the local interface implementation for memref.subview + memref::SubViewOp::attachInterface( + context); + + // Create a base memref + auto memrefTy = MemRefType::get({100, 200}, b.getI64Type()); + OwningOpRef allocOp = + memref::AllocaOp::create(b, loc, memrefTy); + Value baseMemref = allocOp->getResult(); + + // Create first subview + SmallVector offsets1 = {b.getIndexAttr(10), b.getIndexAttr(20)}; + SmallVector sizes1 = {b.getIndexAttr(50), 
b.getIndexAttr(80)}; + SmallVector strides1 = {b.getIndexAttr(1), b.getIndexAttr(1)}; + + OwningOpRef subview1Op = + memref::SubViewOp::create(b, loc, baseMemref, offsets1, sizes1, strides1); + Value subview1 = subview1Op->getResult(); + + // Create second subview (subview of subview) + SmallVector offsets2 = {b.getIndexAttr(5), b.getIndexAttr(10)}; + SmallVector sizes2 = {b.getIndexAttr(20), b.getIndexAttr(30)}; + SmallVector strides2 = {b.getIndexAttr(1), b.getIndexAttr(1)}; + + OwningOpRef subview2Op = + memref::SubViewOp::create(b, loc, subview1, offsets2, sizes2, strides2); + Value subview2 = subview2Op->getResult(); + + // Test that getBaseEntity on the nested subview returns the first subview + // (since our implementation returns the immediate source, not the ultimate + // base) + Value baseEntity = getBaseEntity(subview2); + EXPECT_EQ(baseEntity, subview1); + + // Test that calling getBaseEntity again returns the original base + Value ultimateBase = getBaseEntity(baseEntity); + EXPECT_EQ(ultimateBase, baseMemref); +} From 1c0187ce909e835afd0d992a513055a37fd03d19 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Fri, 31 Oct 2025 13:33:34 -0700 Subject: [PATCH 394/539] [lldb-dap] Correctly trigger 'entry' stop reasons. (#165901) Noticed this while looking into test stability that the 'entry' stop reason is not triggering correctly. This should ensure we correctly trigger the 'entry' stop reason when launching a process with `"stopOnEntry": true`. I've also updated the tests to ensure we receive the 'entry' stop reason to catch this regression. 
--- .../test/tools/lldb-dap/lldbdap_testcase.py | 10 ++++++ .../tools/lldb-dap/restart/TestDAP_restart.py | 28 ++--------------- .../restart/TestDAP_restart_console.py | 31 ++----------------- lldb/tools/lldb-dap/EventHelper.cpp | 2 +- lldb/tools/lldb-dap/JSONUtils.cpp | 2 +- 5 files changed, 18 insertions(+), 55 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index 29935bb8046ff..c6c4a3e2a4e1e 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -223,6 +223,16 @@ def verify_stop_exception_info(self, expected_description): return True return False + def verify_stop_on_entry(self) -> None: + """Waits for the process to be stopped and then verifies at least one + thread has the stop reason 'entry'.""" + self.dap_server.wait_for_stopped() + self.assertIn( + "entry", + (t["reason"] for t in self.dap_server.thread_stop_reasons.values()), + "Expected at least one thread to report stop reason 'entry' in {self.dap_server.thread_stop_reasons}", + ) + def verify_commands(self, flavor: str, output: str, commands: list[str]): self.assertTrue(output and len(output) > 0, "expect console output") lines = output.splitlines() diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py index 83faf276852f8..e8e07e1e86fc4 100644 --- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py +++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py @@ -51,20 +51,8 @@ def test_stopOnEntry(self): self.build_and_launch(program, stopOnEntry=True) [bp_main] = self.set_function_breakpoints(["main"]) - self.dap_server.request_configurationDone() - self.dap_server.wait_for_stopped() - # Once the "configuration done" event is sent, we should get a stopped - # event immediately because of 
stopOnEntry. - self.assertTrue( - len(self.dap_server.thread_stop_reasons) > 0, - "expected stopped event during launch", - ) - for _, body in self.dap_server.thread_stop_reasons.items(): - if "reason" in body: - reason = body["reason"] - self.assertNotEqual( - reason, "breakpoint", 'verify stop isn\'t "main" breakpoint' - ) + self.continue_to_next_stop() + self.verify_stop_on_entry() # Then, if we continue, we should hit the breakpoint at main. self.continue_to_breakpoints([bp_main]) @@ -73,17 +61,7 @@ def test_stopOnEntry(self): # main. resp = self.dap_server.request_restart() self.assertTrue(resp["success"]) - stopped_events = self.dap_server.wait_for_stopped() - for stopped_event in stopped_events: - if "body" in stopped_event: - body = stopped_event["body"] - if "reason" in body: - reason = body["reason"] - self.assertNotEqual( - reason, - "breakpoint", - 'verify stop after restart isn\'t "main" breakpoint', - ) + self.verify_stop_on_entry() @skipIfWindows def test_arguments(self): diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py index e1ad1425a993d..7d4949907df0d 100644 --- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py +++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py @@ -11,27 +11,6 @@ @skipIfBuildType(["debug"]) class TestDAP_restart_console(lldbdap_testcase.DAPTestCaseBase): - def verify_stopped_on_entry(self, stopped_events: List[Dict[str, Any]]): - seen_stopped_event = 0 - for stopped_event in stopped_events: - body = stopped_event.get("body") - if body is None: - continue - - reason = body.get("reason") - if reason is None: - continue - - self.assertNotEqual( - reason, - "breakpoint", - 'verify stop after restart isn\'t "main" breakpoint', - ) - if reason == "entry": - seen_stopped_event += 1 - - self.assertEqual(seen_stopped_event, 1, "expect only one stopped entry event.") - @skipIfAsan @skipIfWindows 
@skipIf(oslist=["linux"], archs=["arm$"]) # Always times out on buildbot @@ -92,11 +71,8 @@ def test_stopOnEntry(self): self.build_and_launch(program, console="integratedTerminal", stopOnEntry=True) [bp_main] = self.set_function_breakpoints(["main"]) - self.dap_server.request_continue() # sends configuration done - stopped_events = self.dap_server.wait_for_stopped() - # We should be stopped at the entry point. - self.assertGreaterEqual(len(stopped_events), 0, "expect stopped events") - self.verify_stopped_on_entry(stopped_events) + self.dap_server.request_configurationDone() + self.verify_stop_on_entry() # Then, if we continue, we should hit the breakpoint at main. self.dap_server.request_continue() @@ -105,8 +81,7 @@ def test_stopOnEntry(self): # Restart and check that we still get a stopped event before reaching # main. self.dap_server.request_restart() - stopped_events = self.dap_server.wait_for_stopped() - self.verify_stopped_on_entry(stopped_events) + self.verify_stop_on_entry() # continue to main self.dap_server.request_continue() diff --git a/lldb/tools/lldb-dap/EventHelper.cpp b/lldb/tools/lldb-dap/EventHelper.cpp index c5d5f2bb59b42..12d9e21c52ab3 100644 --- a/lldb/tools/lldb-dap/EventHelper.cpp +++ b/lldb/tools/lldb-dap/EventHelper.cpp @@ -176,7 +176,7 @@ llvm::Error SendThreadStoppedEvent(DAP &dap, bool on_entry) { llvm::DenseSet old_thread_ids; old_thread_ids.swap(dap.thread_ids); - uint32_t stop_id = process.GetStopID(); + uint32_t stop_id = on_entry ? 
0 : process.GetStopID(); const uint32_t num_threads = process.GetNumThreads(); // First make a pass through the threads to see if the focused thread diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 2780a5b7748e8..1a3a6701b194d 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -711,7 +711,7 @@ llvm::json::Value CreateThreadStopped(DAP &dap, lldb::SBThread &thread, break; } if (stop_id == 0) - body.try_emplace("reason", "entry"); + body["reason"] = "entry"; const lldb::tid_t tid = thread.GetThreadID(); body.try_emplace("threadId", (int64_t)tid); // If no description has been set, then set it to the default thread stopped From 974af85b15e76fb649a35fc1b0a1c0521440e882 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Fri, 31 Oct 2025 13:38:18 -0700 Subject: [PATCH 395/539] [DirectX] Annotate interfaces for DLL export (#165914) This is largely based off of #143615, but for the DirectX target which is still in experimental. 
--- llvm/include/llvm/InitializePasses.h | 1 - llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp | 3 ++- llvm/lib/Target/DirectX/DirectXTargetMachine.cpp | 3 ++- llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp | 3 ++- llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp | 3 ++- 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 581b4ad161daa..c8196d8a7ef48 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -90,7 +90,6 @@ LLVM_ABI void initializeDSELegacyPassPass(PassRegistry &); LLVM_ABI void initializeDXILMetadataAnalysisWrapperPassPass(PassRegistry &); LLVM_ABI void initializeDXILMetadataAnalysisWrapperPrinterPass(PassRegistry &); LLVM_ABI void initializeDXILResourceBindingWrapperPassPass(PassRegistry &); -LLVM_ABI void initializeDXILResourceImplicitBindingLegacyPass(PassRegistry &); LLVM_ABI void initializeDXILResourceTypeWrapperPassPass(PassRegistry &); LLVM_ABI void initializeDXILResourceWrapperPassPass(PassRegistry &); LLVM_ABI void initializeDeadMachineInstructionElimPass(PassRegistry &); diff --git a/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp b/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp index 15def3637c5a7..b6bbb201f5c5d 100644 --- a/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp +++ b/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp @@ -52,6 +52,7 @@ void DXILAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { emitGlobalConstant(GV->getDataLayout(), GV->getInitializer()); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeDirectXAsmPrinter() { RegisterAsmPrinter X(getTheDirectXTarget()); } diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp index bcf84403b2c0d..84b1a313df2ea 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp +++ 
b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp @@ -53,7 +53,8 @@ using namespace llvm; -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeDirectXTarget() { RegisterTargetMachine X(getTheDirectXTarget()); auto *PR = PassRegistry::getPassRegistry(); initializeDXILIntrinsicExpansionLegacyPass(*PR); diff --git a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp index 9a14c01f62ae7..62ad014f3739f 100644 --- a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp +++ b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp @@ -132,7 +132,8 @@ static MCRegisterInfo *createDirectXMCRegisterInfo(const Triple &Triple) { static MCInstrInfo *createDirectXMCInstrInfo() { return new MCInstrInfo(); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeDirectXTargetMC() { Target &T = getTheDirectXTarget(); RegisterMCAsmInfo X(T); TargetRegistry::RegisterMCInstrInfo(T, createDirectXMCInstrInfo); diff --git a/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp index ae01626e5229d..934bd1b0e8adb 100644 --- a/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp +++ b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp @@ -24,7 +24,8 @@ Target &getTheDirectXTarget() { using namespace llvm; -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeDirectXTargetInfo() { RegisterTarget X( getTheDirectXTarget(), "dxil", "DirectX Intermediate Language", "DXIL"); } From 0e048b060a881eb03a58a92858019a61aff0305e Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 31 Oct 2025 20:36:48 +0000 Subject: [PATCH 396/539] [MLGO] Update MLRegAlloc Test This was broken by 
5322fb6268208a8fc031fb13573dac9729d05db6. Update the test to be a little more resilient to flaky failures and to pass after those changes. We should probably delete this now that we have MIR2Vec, but punting that for now. --- llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll index bd8d882cda39b..9dd402d13b8e0 100644 --- a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll +++ b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll @@ -26,7 +26,7 @@ ; Also, the first eviction problem is significantly less than 300 instructions. Check ; that there is a zero value. ; Note: we're regex-ing some of the opcodes to avoid test flakyness. -; CHECK: instructions: 20,{{([0-9]{4})}},1{{([0-9]{3})}},2{{([0-9]{3})}},{{.*}},0, +; CHECK: instructions: 20,{{([0-9]{4})}},{{([0-9]{4})}},{{([0-9]{4})}},{{.*}},0, ; Only the candidate virtreg and the 10th LR are included in this problem. Make ; sure the other LRs have values of zero. There are 2700 0s followed by some 1s. ; There's a limit to how many repetitions can be matched. From 2fea151a7c8e8e3bf8adc8b77e8fdd9ff040ad3d Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 31 Oct 2025 21:02:55 +0000 Subject: [PATCH 397/539] Revert "[SLP][NFC]Add a test with the incorrect minbitwidth in alternate nodes, NFC" This reverts commit 0dca7ee4480f11cd0230d316ccc5d2c7234a4b31. This broke check-llvm, including on premerge. 
https://lab.llvm.org/buildbot/#/builders/137/builds/28194 https://lab.llvm.org/staging/#/builders/21/builds/7649 --- ...ernate-opcode-strict-bitwidth-than-main.ll | 36 ------------------- 1 file changed, 36 deletions(-) delete mode 100644 llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll deleted file mode 100644 index 959b2350d9d78..0000000000000 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll +++ /dev/null @@ -1,36 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 -; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s - -define float @test(i8 %0) { -; CHECK-LABEL: define float @test( -; CHECK-SAME: i8 [[TMP0:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> , i8 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[TMP6]], [[TMP7]] -; CHECK-NEXT: switch i32 [[TMP8]], label %[[EXIT:.*]] [ -; CHECK-NEXT: i32 0, label %[[EXIT]] -; CHECK-NEXT: i32 1, label %[[EXIT]] -; CHECK-NEXT: ] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret float 0.000000e+00 -; -entry: - %1 = sext i8 0 to i32 - %2 = lshr i32 %1, 27 - %3 = sext i8 %0 to i32 - %reass.add.epil = mul i32 %3, 2 - %4 = or i32 %reass.add.epil, %2 - switch i32 %4, label %exit [ - i32 0, label %exit - i32 1, label 
%exit - ] - -exit: - ret float 0.000000e+00 -} From f3ea4030a714ec5a7ab9dfd33780a8bb837c69a5 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 31 Oct 2025 14:50:17 -0700 Subject: [PATCH 398/539] AMDGPU: Add baseline test for #161651 (#165921) --- .../umin-sub-to-usubo-select-combine.ll | 236 ++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll diff --git a/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll new file mode 100644 index 0000000000000..22e4a24435f12 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll @@ -0,0 +1,236 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s + +define i16 @v_underflow_compare_fold_i16(i16 %a, i16 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u16_e32 v1, v0, v1 +; GFX9-NEXT: v_min_u16_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u16 v0.h, v0.l, v1.l +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u16 v0.l, v0.h, v0.l +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i16 %a, %b + %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +define i32 @v_underflow_compare_fold_i32(i32 %a, i32 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1 +; GFX9-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-LABEL: v_underflow_compare_fold_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i32 %a, %b + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +define i32 @v_underflow_compare_fold_i32_commute(i32 %a, i32 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i32_commute: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1 +; GFX9-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i32_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i32 %a, %b + %cond = call i32 @llvm.umin.i32(i32 %a, i32 %sub) + ret i32 %cond +} + +define i32 @v_underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i32_multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1 +; GFX9-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX9-NEXT: global_store_dword v[2:3], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i32_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX11-NEXT: global_store_b32 v[2:3], v1, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i32 %a, %b + store i32 %sub, ptr addrspace(1) %ptr + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +define i64 
@v_underflow_compare_fold_i64(i64 %a, i64 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %a, %b + %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +define i64 @v_underflow_compare_fold_i64_commute(i64 %a, i64 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i64_commute: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i64_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 +; GFX11-NEXT: s_setpc_b64 
s[30:31] + %sub = sub i64 %a, %b + %cond = call i64 @llvm.umin.i64(i64 %a, i64 %sub) + ret i64 %cond +} + +define i64 @v_underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i64_multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i64_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: global_store_b64 v[4:5], v[2:3], off +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %a, %b + store i64 %sub, ptr addrspace(1) %ptr + %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +define amdgpu_ps i16 @s_underflow_compare_fold_i16(i16 inreg %a, i16 inreg %b) #0 { +; GFX9-LABEL: s_underflow_compare_fold_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_sub_i32 s1, s0, s1 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: s_min_u32 s0, s1, s0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_underflow_compare_fold_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_i32 s1, s0, s1 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_min_u32 s0, s1, s0 +; 
GFX11-NEXT: ; return to shader part epilog + %sub = sub i16 %a, %b + %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +define amdgpu_ps i32 @s_underflow_compare_fold_i32(i32 inreg %a, i32 inreg %b) #0 { +; GFX9-LABEL: s_underflow_compare_fold_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_sub_i32 s1, s0, s1 +; GFX9-NEXT: s_min_u32 s0, s1, s0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_underflow_compare_fold_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_i32 s1, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_min_u32 s0, s1, s0 +; GFX11-NEXT: ; return to shader part epilog + %sub = sub i32 %a, %b + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +define amdgpu_ps i64 @s_underflow_compare_fold_i64(i64 inreg %a, i64 inreg %b) #0 { +; GFX9-LABEL: s_underflow_compare_fold_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_sub_u32 s2, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_subb_u32 s3, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cselect_b32 s1, s3, s1 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_underflow_compare_fold_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_u32 s2, s0, s2 +; GFX11-NEXT: s_subb_u32 s3, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[0:1] +; GFX11-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11-NEXT: s_cselect_b32 s0, s2, s0 +; GFX11-NEXT: s_cselect_b32 s1, s3, s1 +; GFX11-NEXT: ; return to shader part epilog + %sub = sub i64 %a, %b + %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +attributes #0 = { nounwind } From 78755087b757221f0ec82bb433c79a45e9fcb461 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 31 Oct 2025 13:13:58 -0700 Subject: [PATCH 399/539] [SLP][NFC]Add a test with the incorrect minbitwidth in alternate 
nodes, NFC --- ...ernate-opcode-strict-bitwidth-than-main.ll | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll new file mode 100644 index 0000000000000..cc2e16e2b099b --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define float @test(i8 %0) { +; CHECK-LABEL: define float @test( +; CHECK-SAME: i8 [[TMP0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> , i8 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i16> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i16> [[TMP2]], +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = zext i16 [[TMP10]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: switch i32 [[TMP8]], label %[[EXIT:.*]] [ +; CHECK-NEXT: i32 0, label %[[EXIT]] +; CHECK-NEXT: i32 1, label %[[EXIT]] +; CHECK-NEXT: ] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret float 0.000000e+00 +; +entry: + %1 = sext i8 0 to i32 + %2 = lshr i32 %1, 27 + %3 = sext i8 %0 to i32 + %reass.add.epil = mul i32 %3, 2 + %4 = or i32 %reass.add.epil, %2 + switch i32 %4, label %exit [ + i32 0, label %exit + i32 1, 
label %exit + ] + +exit: + ret float 0.000000e+00 +} From a7f1ed063987dbc27377c1b3c7bbf86cf7a74f19 Mon Sep 17 00:00:00 2001 From: Tom Yang Date: Fri, 31 Oct 2025 15:08:39 -0700 Subject: [PATCH 400/539] update ManualDWARFIndex::Index to use std::once (#165896) Small change to use (what I think is) a better practice -- we were using the `m_indexed` bool member to make sure we called `Index()` once, but we should just use `std::once`! This change shouldn't affect functionality. This change may also make concurrent access to `Index()` thread-safe, though the ManualDWARFIndex API isn't completely thread-safe due to `Decode()`. I'm not sure if ManualDWARFIndex was ever intended to be thread-safe. Test Plan: `ninja check-lldb` Tested basic debugging workflow of a couple of large projects I had built. Basically: ``` (lldb) target create (lldb) b main (lldb) r (lldb) step ... ``` I A/B tested the performance of launching several modules with parallel module loading and didn't observe any performance regressions. 
--------- Co-authored-by: Tom Yang --- lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp | 7 +++---- lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h | 8 +++++++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp index d90108f687f84..36dee1470e0a2 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp @@ -22,7 +22,6 @@ #include "lldb/Utility/Stream.h" #include "lldb/Utility/Timer.h" #include "lldb/lldb-private-enumerations.h" -#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/ThreadPool.h" #include #include @@ -33,10 +32,10 @@ using namespace lldb_private::plugin::dwarf; using namespace llvm::dwarf; void ManualDWARFIndex::Index() { - if (m_indexed) - return; - m_indexed = true; + std::call_once(m_indexed_flag, [this]() { IndexImpl(); }); +} +void ManualDWARFIndex::IndexImpl() { ElapsedTime elapsed(m_index_time); LLDB_SCOPED_TIMERF("%p", static_cast(m_dwarf)); if (LoadFromCache()) { diff --git a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h index 0b5b2f3e84309..41e0e620a4896 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h @@ -66,8 +66,14 @@ class ManualDWARFIndex : public DWARFIndex { void Dump(Stream &s) override; private: + /// Reads the DWARF debug info to build the index once. + /// + /// Should be called before attempting to retrieve symbols. void Index(); + /// Call `ManualDWARFIndex::Index()` instead. + void IndexImpl(); + /// Decode a serialized version of this object from data. 
/// /// \param data @@ -170,7 +176,7 @@ class ManualDWARFIndex : public DWARFIndex { llvm::DenseSet m_type_sigs_to_avoid; IndexSet m_set; - bool m_indexed = false; + std::once_flag m_indexed_flag; }; } // namespace dwarf } // namespace lldb_private::plugin From 27a07065ff919c71606f46cc5adaab3d23cc1ddb Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 31 Oct 2025 15:11:13 -0700 Subject: [PATCH 401/539] [SelectionDAG] Use GetPromotedInteger when promoting integer operands of PATCHPOINT/STACKMAP. (#165926) This is consistent with other promotion, but causes negative constants to be sign extended instead of zero extended in some cases. I guess getNode and type legalizer are inconsistent about what ANY_EXTEND of a constant does. --- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 8 ++------ llvm/test/CodeGen/AArch64/stackmap.ll | 4 ++-- llvm/test/CodeGen/SystemZ/stackmap.ll | 4 ++-- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index b1776eaae6e86..44e5a187c4281 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -2871,18 +2871,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SET_ROUNDING(SDNode *N) { SDValue DAGTypeLegalizer::PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo) { assert(OpNo > 1); // Because the first two arguments are guaranteed legal. 
SmallVector NewOps(N->ops()); - SDValue Operand = N->getOperand(OpNo); - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType()); - NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand); + NewOps[OpNo] = GetPromotedInteger(NewOps[OpNo]); return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo) { assert(OpNo >= 7); SmallVector NewOps(N->ops()); - SDValue Operand = N->getOperand(OpNo); - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType()); - NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand); + NewOps[OpNo] = GetPromotedInteger(NewOps[OpNo]); return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } diff --git a/llvm/test/CodeGen/AArch64/stackmap.ll b/llvm/test/CodeGen/AArch64/stackmap.ll index 995d2545c6359..26221d0c26eb2 100644 --- a/llvm/test/CodeGen/AArch64/stackmap.ll +++ b/llvm/test/CodeGen/AArch64/stackmap.ll @@ -81,14 +81,14 @@ ; CHECK-NEXT: .hword 8 ; CHECK-NEXT: .hword 0 ; CHECK-NEXT: .hword 0 -; CHECK-NEXT: .word 65535 +; CHECK-NEXT: .word -1 ; SmallConstant ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .hword 8 ; CHECK-NEXT: .hword 0 ; CHECK-NEXT: .hword 0 -; CHECK-NEXT: .word 65535 +; CHECK-NEXT: .word -1 ; SmallConstant ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 0 diff --git a/llvm/test/CodeGen/SystemZ/stackmap.ll b/llvm/test/CodeGen/SystemZ/stackmap.ll index 05b8de756c032..f414ea33a6e80 100644 --- a/llvm/test/CodeGen/SystemZ/stackmap.ll +++ b/llvm/test/CodeGen/SystemZ/stackmap.ll @@ -84,14 +84,14 @@ ; CHECK-NEXT: .short 8 ; CHECK-NEXT: .short 0 ; CHECK-NEXT: .short 0 -; CHECK-NEXT: .long 65535 +; CHECK-NEXT: .long -1 ; SmallConstant ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .short 8 ; CHECK-NEXT: .short 0 ; CHECK-NEXT: .short 0 -; CHECK-NEXT: .long 65535 +; CHECK-NEXT: .long -1 ; SmallConstant ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 0 From 
4ad190fb46b57293a5992e1e5dba46e983ca4020 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 31 Oct 2025 15:07:26 -0700 Subject: [PATCH 402/539] [SLP]Fix the minbitwidth analysis for alternate opcodes If the alternate operation is stricter than the main operation, we cannot rely on the analysis of the main operation. In such a case, it is better to avoid doing the analysis at all, since it may affect the overall result and lead to incorrect optimization. Fixes #165878 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 21 +++++++++++++++++++ ...ernate-opcode-strict-bitwidth-than-main.ll | 14 ++++++------- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 1b55a3b235228..34b405ced8c0a 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -22134,6 +22134,27 @@ bool BoUpSLP::collectValuesToDemote( {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(), VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()}); + if (E.isAltShuffle()) { + // Combining these opcodes may lead to incorrect analysis, skip for now. + auto IsDangerousOpcode = [](unsigned Opcode) { + switch (Opcode) { + case Instruction::Shl: + case Instruction::AShr: + case Instruction::LShr: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + return true; + default: + break; + } + return false; + }; + if (IsDangerousOpcode(E.getAltOpcode())) + return FinalAnalysis(); + } + switch (E.getOpcode()) { // We can always demote truncations and extensions.
Since truncations can diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll index cc2e16e2b099b..959b2350d9d78 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll @@ -6,14 +6,12 @@ define float @test(i8 %0) { ; CHECK-SAME: i8 [[TMP0:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> , i8 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i16> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i16> [[TMP2]], -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP9]] to i32 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = zext i16 [[TMP10]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[TMP6]], [[TMP7]] ; CHECK-NEXT: switch i32 [[TMP8]], label %[[EXIT:.*]] [ ; CHECK-NEXT: i32 0, label %[[EXIT]] From 7d3c209f9858b3d0bf17b96c2eea280d08efdc2f Mon Sep 17 00:00:00 2001 From: Yifei Xu Date: Fri, 31 Oct 2025 22:29:36 +0000 Subject: [PATCH 403/539] [MLIR][XeGPU] Remove an unused include and break circular dependency in bazel build (#165930) It will otherwise introduce a circular dependency XeGPUDialect 
-> XeGPUUtils -> XeGPUDialect. --- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 1 - utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 - 2 files changed, 2 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 6b4c185d7d897..83406c8c75dcf 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -11,7 +11,6 @@ #include "mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" -#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" #include "mlir/IR/Builders.h" #include "mlir/IR/DialectImplementation.h" diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 1385e1a802d5b..83414ceed5ca5 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3729,7 +3729,6 @@ cc_library( ":XeGPUAttrInterfaceIncGen", ":XeGPUEnumsIncGen", ":XeGPUIncGen", - ":XeGPUUtils", ":XeGPUuArch", ":XeVMDialect", "//llvm:Support", From 2ed791f83e50bcaac7815805a48bfcdac2458aea Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Fri, 31 Oct 2025 15:45:36 -0700 Subject: [PATCH 404/539] Add test cases to profcheck-xfail.txt in unfixed (yet) areas (#165933) A remaining failing one, under SimplifyCFG (which is pass that we did fix) is covered in #165931 --- llvm/utils/profcheck-xfail.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index 83bffc70574a8..380b162d8c58c 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -517,8 +517,11 @@ Instrumentation/TypeSanitizer/alloca-only.ll Instrumentation/TypeSanitizer/anon.ll Instrumentation/TypeSanitizer/basic.ll Instrumentation/TypeSanitizer/basic-nosan.ll +Instrumentation/TypeSanitizer/basic_outlined.ll 
+Instrumentation/TypeSanitizer/basic_verify_outlined.ll Instrumentation/TypeSanitizer/byval.ll Instrumentation/TypeSanitizer/globals.ll +Instrumentation/TypeSanitizer/globals_outlined.ll Instrumentation/TypeSanitizer/invalid-metadata.ll Instrumentation/TypeSanitizer/memintrinsics.ll Instrumentation/TypeSanitizer/nosanitize.ll @@ -729,6 +732,7 @@ Transforms/ExpandVariadics/expand-va-intrinsic-split-simple.ll Transforms/ExpandVariadics/intrinsics.ll Transforms/FixIrreducible/basic.ll Transforms/FixIrreducible/bug45623.ll +Transforms/FixIrreducible/callbr.ll Transforms/FixIrreducible/nested.ll Transforms/FixIrreducible/switch.ll Transforms/GCOVProfiling/atomic-counter.ll @@ -1106,6 +1110,7 @@ Transforms/LoopSimplifyCFG/update_parents.ll Transforms/LoopUnroll/peel-last-iteration-expansion-cost.ll Transforms/LoopUnroll/peel-last-iteration-with-guards.ll Transforms/LoopUnroll/peel-last-iteration-with-variable-trip-count.ll +Transforms/LoopUnroll/runtime-loop-multiple-exits.ll Transforms/LoopVersioning/add-phi-update-users.ll Transforms/LoopVersioning/basic.ll Transforms/LoopVersioning/bound-check-partially-known.ll From 80d317eea57b26adb757f0ac36b8dd481af19361 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Fri, 31 Oct 2025 16:10:59 -0700 Subject: [PATCH 405/539] [SimplifyCFG] Don't propagate weights to unconditional branches in `turnSwitchRangeIntoICmp` (#165931) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #161000 introduced a bug whereby the IR would become invalid by having an unconditional branch have `!prof`​attached to it. This only became evident in PR #165744, because the IR of `test/Transforms/SimplifyCFG/pr165301.ll`​was simple enough to both (1) introduce the unconditional branch, and (2) survive in that fashion until the end of the pass (simplifycfg) and thus trip the verifier. 
--- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 2 +- llvm/test/Transforms/SimplifyCFG/pr165301.ll | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 6addcfab15125..cbc604e87cf1a 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -5956,7 +5956,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, } // Update weight for the newly-created conditional branch. - if (hasBranchWeightMD(*SI)) { + if (hasBranchWeightMD(*SI) && NewBI->isConditional()) { SmallVector Weights; getBranchWeights(SI, Weights); if (Weights.size() == 1 + SI->getNumCases()) { diff --git a/llvm/test/Transforms/SimplifyCFG/pr165301.ll b/llvm/test/Transforms/SimplifyCFG/pr165301.ll index 4a539d77af3cb..1df655250f57e 100644 --- a/llvm/test/Transforms/SimplifyCFG/pr165301.ll +++ b/llvm/test/Transforms/SimplifyCFG/pr165301.ll @@ -1,11 +1,11 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; RUN: opt -S -passes="simplifycfg" < %s | FileCheck %s ; Make sure there's no use after free when removing incoming values from PHI nodes -define i32 @pr165301(i1 %cond) { +define i32 @pr165301(i1 %cond) !prof !0 { ; CHECK-LABEL: define i32 @pr165301( -; CHECK-SAME: i1 [[COND:%.*]]) { +; CHECK-SAME: i1 [[COND:%.*]]) !prof [[PROF0:![0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[SWITCHBB:.*]] ; CHECK: [[SWITCHBB]]: @@ -18,9 +18,14 @@ switchbb: switch i1 %cond, label %default [ i1 false, label %switchbb i1 true, label %switchbb - ] + ], !prof !1 default: %phi.lcssa = phi i32 [ 0, %switchbb ] ret i32 %phi.lcssa } +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 2, i32 3, i32 5} +;. 
+; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10} +;. From 9d092394e34060c567ab7dbd969258bf4394c889 Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Fri, 31 Oct 2025 16:16:43 -0700 Subject: [PATCH 406/539] [docs] Fix GlobalISel sync up gcal link to point to the new one. --- llvm/docs/GettingInvolved.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index 039d61624093d..0dba9412564d4 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -225,7 +225,7 @@ what to add to your calendar invite. - * - GlobalISel - Every 2nd Tuesday of the month - - `gcal `__ + - `gcal `__ - `Meeting details/agenda `__ From 8dd03ae844c6d6c790e0eede9a396650edc33cab Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Fri, 31 Oct 2025 16:48:20 -0700 Subject: [PATCH 407/539] [AMDGPU] Set VADDR4 field to NULL for tensor ops for gfx1250 (#165917) This is based on the latest spec. --- llvm/lib/Target/AMDGPU/MIMGInstructions.td | 4 ++- llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s | 16 +++++----- .../AMDGPU/gfx1250_dasm_vimage.txt | 32 +++++++++---------- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index d95013123aced..65dce74a1e894 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -2116,8 +2116,10 @@ class VIMAGE_TENSOR_Real op, VIMAGE_TENSOR_Pseudo ps, string opName = p let vaddr2 = !if(ps.UpTo2D, !cast(SGPR_NULL_gfx11plus.HWEncoding), ?); let vaddr3 = !if(ps.UpTo2D, !cast(SGPR_NULL_gfx11plus.HWEncoding), ?); + // Set VADDR4 to NULL + let vaddr4 = !cast(SGPR_NULL_gfx11plus.HWEncoding); + // set to 0 based on SPG. 
- let vaddr4 = 0; let rsrc = 0; let vdata = 0; let d16 = 0; diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s index fec8ba19f93fe..0a480a73cde5b 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s @@ -2,33 +2,33 @@ ; RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s tensor_load_to_lds s[0:3], s[4:11] -// GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c] +// GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS -// GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c] +// GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] -// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10] +// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV -// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10] +// GFX1250: 
tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_store_from_lds s[0:3], s[4:11] -// GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c] +// GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS -// GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c] +// GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] -// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10] +// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV -// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10] +// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10] // GFX12-ERR: 
:[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt index 9afaa075ea838..800579391d8eb 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt @@ -1,25 +1,25 @@ # RUN: llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s -# GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c] -0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c +# GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c] +0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c -# GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c] -0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c +# GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c] +0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c -# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10] -0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10 +# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10] +0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10 -# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10] -0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10 +# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] 
th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10] +0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10 -# GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c] -0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c +# GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c] +0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c -# GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c] -0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c +# GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c] +0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c -# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10] -0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10 +# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10] +0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10 -# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10] -0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10 +# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10] +0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10 From 6fb385952a28f5629bd8cb21834bc672de7ede63 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 31 Oct 2025 18:13:35 -0700 
Subject: [PATCH 408/539] [lldb-dap] Move Options.td into tool subdirectory (NFC) (#165925) --- lldb/tools/lldb-dap/CMakeLists.txt | 3 --- lldb/tools/lldb-dap/tool/CMakeLists.txt | 4 ++++ lldb/tools/lldb-dap/{ => tool}/Options.td | 0 3 files changed, 4 insertions(+), 3 deletions(-) rename lldb/tools/lldb-dap/{ => tool}/Options.td (100%) diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt index 7db334ca56bcf..dd1bbbdddfc59 100644 --- a/lldb/tools/lldb-dap/CMakeLists.txt +++ b/lldb/tools/lldb-dap/CMakeLists.txt @@ -1,9 +1,6 @@ # We need to include the llvm components we depend on manually, as liblldb does # not re-export those. set(LLVM_LINK_COMPONENTS Support) -set(LLVM_TARGET_DEFINITIONS Options.td) -tablegen(LLVM Options.inc -gen-opt-parser-defs) -add_public_tablegen_target(LLDBDAPOptionsTableGen) add_lldb_library(lldbDAP Breakpoint.cpp diff --git a/lldb/tools/lldb-dap/tool/CMakeLists.txt b/lldb/tools/lldb-dap/tool/CMakeLists.txt index b39a4ed9c40e7..5335d25c5d450 100644 --- a/lldb/tools/lldb-dap/tool/CMakeLists.txt +++ b/lldb/tools/lldb-dap/tool/CMakeLists.txt @@ -1,3 +1,7 @@ +set(LLVM_TARGET_DEFINITIONS Options.td) +tablegen(LLVM Options.inc -gen-opt-parser-defs) +add_public_tablegen_target(LLDBDAPOptionsTableGen) + add_lldb_tool(lldb-dap lldb-dap.cpp diff --git a/lldb/tools/lldb-dap/Options.td b/lldb/tools/lldb-dap/tool/Options.td similarity index 100% rename from lldb/tools/lldb-dap/Options.td rename to lldb/tools/lldb-dap/tool/Options.td From db4e94999280fdd68b038ff56489af9b16f8e190 Mon Sep 17 00:00:00 2001 From: Mike Date: Sat, 1 Nov 2025 04:25:18 +0300 Subject: [PATCH 409/539] [mlir][memref] Refine doc examples for operations (#165889) Some of the examples contain typos; some of them use outdated assembly format, and some annotations are missing. This is the best effort to keep them "parsable" (assuming that most of the types are already defined). 
--- .../mlir/Dialect/MemRef/IR/MemRefOps.td | 72 ++++++++++--------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td index b39207fc30dd7..e00f3c1526005 100644 --- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td +++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td @@ -323,8 +323,8 @@ def MemRef_ReallocOp : MemRef_Op<"realloc"> { ```mlir %new = memref.realloc %old : memref<64xf32> to memref<124xf32> - %4 = memref.load %new[%index] // ok - %5 = memref.load %old[%index] // undefined behavior + %4 = memref.load %new[%index] : memref<124xf32> // ok + %5 = memref.load %old[%index] : memref<64xf32> // undefined behavior ``` }]; @@ -445,9 +445,10 @@ def MemRef_AllocaScopeOp : MemRef_Op<"alloca_scope", operation: ```mlir - %result = memref.alloca_scope { + %result = memref.alloca_scope -> f32 { + %value = arith.constant 1.0 : f32 ... - memref.alloca_scope.return %value + memref.alloca_scope.return %value : f32 } ``` @@ -478,7 +479,7 @@ def MemRef_AllocaScopeReturnOp : MemRef_Op<"alloca_scope.return", to indicate which values are going to be returned. For example: ```mlir - memref.alloca_scope.return %value + memref.alloca_scope.return %value : f32 ``` }]; @@ -543,11 +544,11 @@ def MemRef_CastOp : MemRef_Op<"cast", [ Example: ```mlir - Cast to concrete shape. - %4 = memref.cast %1 : memref<*xf32> to memref<4x?xf32> + // Cast to concrete shape. + %4 = memref.cast %1 : memref<*xf32> to memref<4x?xf32> - Erase rank information. - %5 = memref.cast %1 : memref<4x?xf32> to memref<*xf32> + // Erase rank information. 
+ %5 = memref.cast %1 : memref<4x?xf32> to memref<*xf32> ``` }]; @@ -613,8 +614,8 @@ def MemRef_DeallocOp : MemRef_Op<"dealloc", [MemRefsNormalizable]> { Example: ```mlir - %0 = memref.alloc() : memref<8x64xf32, affine_map<(d0, d1) -> (d0, d1), 1>> - memref.dealloc %0 : memref<8x64xf32, affine_map<(d0, d1) -> (d0, d1), 1>> + %0 = memref.alloc() : memref<8x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1> + memref.dealloc %0 : memref<8x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1> ``` }]; @@ -728,13 +729,13 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> { space 1 at indices [%k, %l], would be specified as follows: ```mlir - %num_elements = arith.constant 256 + %num_elements = arith.constant 256 : index %idx = arith.constant 0 : index - %tag = memref.alloc() : memref<1 x i32, affine_map<(d0) -> (d0)>, 4> - dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx] : - memref<40 x 128 x f32>, affine_map<(d0) -> (d0)>, 0>, - memref<2 x 1024 x f32>, affine_map<(d0) -> (d0)>, 1>, - memref<1 x i32>, affine_map<(d0) -> (d0)>, 2> + %tag = memref.alloc() : memref<1 x i32, affine_map<(d0) -> (d0)>, 2> + memref.dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx] : + memref<40 x 128 x f32, affine_map<(d0, d1) -> (d0, d1)>, 0>, + memref<2 x 1024 x f32, affine_map<(d0, d1) -> (d0, d1)>, 1>, + memref<1 x i32, affine_map<(d0) -> (d0)>, 2> ``` If %stride and %num_elt_per_stride are specified, the DMA is expected to @@ -742,8 +743,8 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> { memory space 0 until %num_elements are transferred. 
```mlir - dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx], %stride, - %num_elt_per_stride : + memref.dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx], %stride, + %num_elt_per_stride : ``` * TODO: add additional operands to allow source and destination striding, and @@ -891,10 +892,10 @@ def MemRef_DmaWaitOp : MemRef_Op<"dma_wait"> { Example: ```mlir - dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%index] : - memref<2048 x f32>, affine_map<(d0) -> (d0)>, 0>, - memref<256 x f32>, affine_map<(d0) -> (d0)>, 1> - memref<1 x i32>, affine_map<(d0) -> (d0)>, 2> + memref.dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%index] : + memref<2048 x f32, affine_map<(d0) -> (d0)>, 0>, + memref<256 x f32, affine_map<(d0) -> (d0)>, 1>, + memref<1 x i32, affine_map<(d0) -> (d0)>, 2> ... ... dma_wait %tag[%index], %num_elements : memref<1 x i32, affine_map<(d0) -> (d0)>, 2> @@ -1004,8 +1005,8 @@ def MemRef_ExtractStridedMetadataOp : MemRef_Op<"extract_strided_metadata", [ ```mlir %base, %offset, %sizes:2, %strides:2 = - memref.extract_strided_metadata %memref : - memref<10x?xf32>, index, index, index, index, index + memref.extract_strided_metadata %memref : memref<10x?xf32> + -> memref, index, index, index, index, index // After folding, the type of %m2 can be memref<10x?xf32> and further // folded to %memref. @@ -1013,7 +1014,7 @@ def MemRef_ExtractStridedMetadataOp : MemRef_Op<"extract_strided_metadata", [ offset: [%offset], sizes: [%sizes#0, %sizes#1], strides: [%strides#0, %strides#1] - : memref to memref + : memref to memref> ``` }]; @@ -1182,10 +1183,10 @@ def MemRef_GlobalOp : MemRef_Op<"global", [Symbol]> { ```mlir // Private variable with an initial value. - memref.global "private" @x : memref<2xf32> = dense<0.0,2.0> + memref.global "private" @x : memref<2xf32> = dense<[0.0, 2.0]> // Private variable with an initial value and an alignment (power of 2). 
- memref.global "private" @x : memref<2xf32> = dense<0.0,2.0> {alignment = 64} + memref.global "private" @x : memref<2xf32> = dense<[0.0, 2.0]> {alignment = 64} // Declaration of an external variable. memref.global "private" @y : memref<4xi32> @@ -1194,7 +1195,7 @@ def MemRef_GlobalOp : MemRef_Op<"global", [Symbol]> { memref.global @z : memref<3xf16> = uninitialized // Externally visible constant variable. - memref.global constant @c : memref<2xi32> = dense<1, 4> + memref.global constant @c : memref<2xi32> = dense<[1, 4]> ``` }]; @@ -1555,7 +1556,8 @@ def MemRef_ReinterpretCastOp %dst = memref.reinterpret_cast %src to offset: [%offset], sizes: [%sizes], - strides: [%strides] + strides: [%strides] : + memref<*xf32> to memref> ``` means that `%dst`'s descriptor will be: ```mlir @@ -1695,12 +1697,12 @@ def MemRef_ReshapeOp: MemRef_Op<"reshape", [ ```mlir // Reshape statically-shaped memref. %dst = memref.reshape %src(%shape) - : (memref<4x1xf32>, memref<1xi32>) to memref<4xf32> + : (memref<4x1xf32>, memref<1xi32>) -> memref<4xf32> %dst0 = memref.reshape %src(%shape0) - : (memref<4x1xf32>, memref<2xi32>) to memref<2x2xf32> + : (memref<4x1xf32>, memref<2xi32>) -> memref<2x2xf32> // Flatten unranked memref. %dst = memref.reshape %src(%shape) - : (memref<*xf32>, memref<1xi32>) to memref + : (memref<*xf32>, memref<1xi32>) -> memref ``` b. Source type is ranked or unranked. Shape argument has dynamic size. @@ -1709,10 +1711,10 @@ def MemRef_ReshapeOp: MemRef_Op<"reshape", [ ```mlir // Reshape dynamically-shaped 1D memref. %dst = memref.reshape %src(%shape) - : (memref, memref) to memref<*xf32> + : (memref, memref) -> memref<*xf32> // Reshape unranked memref. 
%dst = memref.reshape %src(%shape) - : (memref<*xf32>, memref) to memref<*xf32> + : (memref<*xf32>, memref) -> memref<*xf32> ``` }]; From b53667ccfef9562adccb00d6219413d321945808 Mon Sep 17 00:00:00 2001 From: Mike Date: Sat, 1 Nov 2025 04:25:44 +0300 Subject: [PATCH 410/539] [mlir] Fix mlir-runner memref-reshape test with unranked inputs (#165902) Were using ranked before. --- mlir/test/mlir-runner/memref-reshape.mlir | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/test/mlir-runner/memref-reshape.mlir b/mlir/test/mlir-runner/memref-reshape.mlir index 8c17f1fd02358..b264e0285953f 100644 --- a/mlir/test/mlir-runner/memref-reshape.mlir +++ b/mlir/test/mlir-runner/memref-reshape.mlir @@ -65,8 +65,8 @@ func.func @reshape_ranked_memref_to_ranked(%input : memref<2x3xf32>, func.func @reshape_unranked_memref_to_ranked(%input : memref<2x3xf32>, %shape : memref<2xindex>) { %unranked_input = memref.cast %input : memref<2x3xf32> to memref<*xf32> - %output = memref.reshape %input(%shape) - : (memref<2x3xf32>, memref<2xindex>) -> memref + %output = memref.reshape %unranked_input(%shape) + : (memref<*xf32>, memref<2xindex>) -> memref %unranked_output = memref.cast %output : memref to memref<*xf32> call @printMemrefF32(%unranked_output) : (memref<*xf32>) -> () @@ -95,8 +95,8 @@ func.func @reshape_unranked_memref_to_unranked(%input : memref<2x3xf32>, %shape : memref<2xindex>) { %unranked_input = memref.cast %input : memref<2x3xf32> to memref<*xf32> %dyn_size_shape = memref.cast %shape : memref<2xindex> to memref - %output = memref.reshape %input(%dyn_size_shape) - : (memref<2x3xf32>, memref) -> memref<*xf32> + %output = memref.reshape %unranked_input(%dyn_size_shape) + : (memref<*xf32>, memref) -> memref<*xf32> call @printMemrefF32(%output) : (memref<*xf32>) -> () // CHECK: rank = 2 offset = 0 sizes = [3, 2] strides = [2, 1] data = From 8653ec190bb04f25b19250e7711673564b1de77d Mon Sep 17 00:00:00 2001 From: Zhaoxin Yang Date: Sat, 1 Nov 2025 09:29:00 
+0800 Subject: [PATCH 411/539] [LoongArch] Make ceil,floor,trunc,roundeven legal for lsx/lasx (#165217) --- .../LoongArch/LoongArchISelLowering.cpp | 8 + .../LoongArch/LoongArchLASXInstrInfo.td | 6 + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 5 + .../CodeGen/LoongArch/lasx/fp-rounding.ll | 200 ++---------------- .../test/CodeGen/LoongArch/lsx/fp-rounding.ll | 88 +------- 5 files changed, 35 insertions(+), 272 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index a6de839de7c28..904aabed9a843 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -371,6 +371,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, ISD::SETUGE, ISD::SETUGT}, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); } setOperationAction(ISD::CTPOP, GRLenVT, Legal); setOperationAction(ISD::FCEIL, {MVT::f32, MVT::f64}, Legal); @@ -453,6 +457,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, ISD::SETUGE, ISD::SETUGT}, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); } } diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index ca4ee5f89573a..610ba052fbdd5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -2424,6 +2424,12 @@ def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm), def : Pat<(int_loongarch_lasx_xvpickve_d_f v4f64:$xj, 
timm:$imm), (XVPICKVE_D v4f64:$xj, (to_valid_timm timm:$imm))>; +// Vector floating-point conversion +defm : PatXrF; +defm : PatXrF; +defm : PatXrF; +defm : PatXrF; + // load def : Pat<(int_loongarch_lasx_xvld GPR:$rj, timm:$imm), (XVLD GPR:$rj, (to_valid_timm timm:$imm))>; diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 92402baa0fa0f..64708421c4ed4 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -2552,6 +2552,11 @@ def : Pat<(f64 (froundeven FPR64:$fj)), (f64 (EXTRACT_SUBREG (VFRINTRNE_D (VREPLVEI_D (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), 0)), sub_64))>; +defm : PatVrF; +defm : PatVrF; +defm : PatVrF; +defm : PatVrF; + // load def : Pat<(int_loongarch_lsx_vld GPR:$rj, timm:$imm), (VLD GPR:$rj, (to_valid_timm timm:$imm))>; diff --git a/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll index 79407c3fd4c8b..fa5f27edf615e 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll @@ -7,38 +7,8 @@ define void @ceil_v8f32(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: ceil_v8f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.s $vr1, $vr1 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4 -; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrp.s $vr2, $vr2 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 48 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: 
vfrintrp.s $vr1, $vr1 -; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0 -; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0 -; CHECK-NEXT: vfrintrp.s $vr3, $vr3 -; CHECK-NEXT: vextrins.w $vr3, $vr1, 16 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr3, $vr1, 32 -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 -; CHECK-NEXT: vfrintrp.s $vr0, $vr0 -; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 -; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2 -; CHECK-NEXT: xvst $xr3, $a0, 0 +; CHECK-NEXT: xvfrintrp.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <8 x float>, ptr %a0 @@ -52,21 +22,7 @@ define void @ceil_v4f64(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: ceil_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.d $vr1, $vr1 -; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2 -; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrp.d $vr2, $vr2 -; CHECK-NEXT: vextrins.d $vr2, $vr1, 16 -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.d $vr1, $vr1 -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vfrintrp.d $vr0, $vr0 -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 -; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2 +; CHECK-NEXT: xvfrintrp.d $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -81,38 +37,8 @@ define void @floor_v8f32(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: floor_v8f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.s $vr1, $vr1 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4 -; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrm.s $vr2, $vr2 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 -; CHECK-NEXT: 
xvpickve.w $xr1, $xr0, 6 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 48 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.s $vr1, $vr1 -; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0 -; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0 -; CHECK-NEXT: vfrintrm.s $vr3, $vr3 -; CHECK-NEXT: vextrins.w $vr3, $vr1, 16 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr3, $vr1, 32 -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 -; CHECK-NEXT: vfrintrm.s $vr0, $vr0 -; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 -; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2 -; CHECK-NEXT: xvst $xr3, $a0, 0 +; CHECK-NEXT: xvfrintrm.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <8 x float>, ptr %a0 @@ -126,21 +52,7 @@ define void @floor_v4f64(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: floor_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.d $vr1, $vr1 -; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2 -; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrm.d $vr2, $vr2 -; CHECK-NEXT: vextrins.d $vr2, $vr1, 16 -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.d $vr1, $vr1 -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vfrintrm.d $vr0, $vr0 -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 -; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2 +; CHECK-NEXT: xvfrintrm.d $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -155,38 +67,8 @@ define void @trunc_v8f32(ptr %res, ptr 
%a0) nounwind { ; CHECK-LABEL: trunc_v8f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.s $vr1, $vr1 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4 -; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrz.s $vr2, $vr2 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 48 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.s $vr1, $vr1 -; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0 -; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0 -; CHECK-NEXT: vfrintrz.s $vr3, $vr3 -; CHECK-NEXT: vextrins.w $vr3, $vr1, 16 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr3, $vr1, 32 -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 -; CHECK-NEXT: vfrintrz.s $vr0, $vr0 -; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 -; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2 -; CHECK-NEXT: xvst $xr3, $a0, 0 +; CHECK-NEXT: xvfrintrz.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <8 x float>, ptr %a0 @@ -200,21 +82,7 @@ define void @trunc_v4f64(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: trunc_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.d $vr1, $vr1 -; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2 -; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrz.d $vr2, $vr2 -; CHECK-NEXT: vextrins.d $vr2, $vr1, 16 -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 
0 -; CHECK-NEXT: vfrintrz.d $vr1, $vr1 -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vfrintrz.d $vr0, $vr0 -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 -; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2 +; CHECK-NEXT: xvfrintrz.d $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -229,38 +97,8 @@ define void @roundeven_v8f32(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: roundeven_v8f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.s $vr1, $vr1 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4 -; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrne.s $vr2, $vr2 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 48 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.s $vr1, $vr1 -; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0 -; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0 -; CHECK-NEXT: vfrintrne.s $vr3, $vr3 -; CHECK-NEXT: vextrins.w $vr3, $vr1, 16 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr3, $vr1, 32 -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 -; CHECK-NEXT: vfrintrne.s $vr0, $vr0 -; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 -; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2 -; CHECK-NEXT: xvst $xr3, $a0, 0 +; CHECK-NEXT: xvfrintrne.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <8 x float>, ptr %a0 @@ -274,21 +112,7 @@ define void @roundeven_v4f64(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: 
roundeven_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.d $vr1, $vr1 -; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2 -; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrne.d $vr2, $vr2 -; CHECK-NEXT: vextrins.d $vr2, $vr1, 16 -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.d $vr1, $vr1 -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vfrintrne.d $vr0, $vr0 -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 -; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2 +; CHECK-NEXT: xvfrintrne.d $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll index 1ca6290a2239b..cb01ac0358ab3 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll @@ -7,22 +7,8 @@ define void @ceil_v4f32(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: ceil_v4f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.s $vr1, $vr1 -; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0 -; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrp.s $vr2, $vr2 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 ; CHECK-NEXT: vfrintrp.s $vr0, $vr0 -; CHECK-NEXT: vextrins.w $vr2, $vr0, 48 -; CHECK-NEXT: vst $vr2, $a0, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <4 x float>, ptr %a0 @@ -36,13 +22,7 @@ define void @ceil_v2f64(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: ceil_v2f64: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.d $vr1, $vr1 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 ; CHECK-NEXT: vfrintrp.d $vr0, $vr0 -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -57,22 +37,8 @@ define void @floor_v4f32(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: floor_v4f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.s $vr1, $vr1 -; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0 -; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrm.s $vr2, $vr2 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 ; CHECK-NEXT: vfrintrm.s $vr0, $vr0 -; CHECK-NEXT: vextrins.w $vr2, $vr0, 48 -; CHECK-NEXT: vst $vr2, $a0, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <4 x float>, ptr %a0 @@ -86,13 +52,7 @@ define void @floor_v2f64(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: floor_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.d $vr1, $vr1 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 ; CHECK-NEXT: vfrintrm.d $vr0, $vr0 -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -107,22 +67,8 @@ define void @trunc_v4f32(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: trunc_v4f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: 
vfrintrz.s $vr1, $vr1 -; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0 -; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrz.s $vr2, $vr2 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 ; CHECK-NEXT: vfrintrz.s $vr0, $vr0 -; CHECK-NEXT: vextrins.w $vr2, $vr0, 48 -; CHECK-NEXT: vst $vr2, $a0, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <4 x float>, ptr %a0 @@ -136,13 +82,7 @@ define void @trunc_v2f64(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: trunc_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.d $vr1, $vr1 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 ; CHECK-NEXT: vfrintrz.d $vr0, $vr0 -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -157,22 +97,8 @@ define void @roundeven_v4f32(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: roundeven_v4f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.s $vr1, $vr1 -; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0 -; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrne.s $vr2, $vr2 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 ; CHECK-NEXT: vfrintrne.s $vr0, $vr0 -; CHECK-NEXT: vextrins.w $vr2, $vr0, 48 -; CHECK-NEXT: vst $vr2, $a0, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <4 x float>, ptr %a0 @@ 
-186,13 +112,7 @@ define void @roundeven_v2f64(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: roundeven_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.d $vr1, $vr1 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 ; CHECK-NEXT: vfrintrne.d $vr0, $vr0 -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: From b3895c84ec351c3ff3515d56e70168b57fbbdc15 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 31 Oct 2025 20:05:02 -0700 Subject: [PATCH 412/539] [VPlan] Add VPRegionBlock::getCanonicalIVType (NFC). (#164127) Split off from https://github.com/llvm/llvm-project/pull/156262. Similar to VPRegionBlock::getCanonicalIV, add helper to get the type of the canonical IV, in preparation for removing VPCanonicalIVPHIRecipe. PR: https://github.com/llvm/llvm-project/pull/164127 --- llvm/lib/Transforms/Vectorize/VPlan.h | 6 ++++++ llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 3 +-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 14 +++++++------- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 3 +-- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 1f10058ab4a9a..1504acfcf7e52 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -4109,6 +4109,12 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase { const VPCanonicalIVPHIRecipe *getCanonicalIV() const { return const_cast(this)->getCanonicalIV(); } + + /// Return the type of the canonical IV for loop regions. 
+ Type *getCanonicalIVType() { return getCanonicalIV()->getScalarType(); } + const Type *getCanonicalIVType() const { + return getCanonicalIV()->getScalarType(); + } }; inline VPRegionBlock *VPRecipeBase::getRegion() { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index bde62dd6dd4bc..f9c15a31167fa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2372,9 +2372,8 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const { return false; auto *StepC = dyn_cast(getStepValue()->getLiveInIRValue()); auto *StartC = dyn_cast(getStartValue()->getLiveInIRValue()); - auto *CanIV = getRegion()->getCanonicalIV(); return StartC && StartC->isZero() && StepC && StepC->isOne() && - getScalarType() == CanIV->getScalarType(); + getScalarType() == getRegion()->getCanonicalIVType(); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 986c801abf684..d491d5669ef18 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -820,7 +820,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, // Calculate the final index. VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); auto *CanonicalIV = LoopRegion->getCanonicalIV(); - Type *CanonicalIVType = CanonicalIV->getScalarType(); + Type *CanonicalIVType = LoopRegion->getCanonicalIVType(); VPBuilder B(cast(PredVPBB)); DebugLoc DL = cast(Op)->getDebugLoc(); @@ -2402,8 +2402,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( "index.part.next"); // Create the active lane mask instruction in the VPlan preheader. 
- VPValue *ALMMultiplier = Plan.getOrAddLiveIn( - ConstantInt::get(TopRegion->getCanonicalIV()->getScalarType(), 1)); + VPValue *ALMMultiplier = + Plan.getOrAddLiveIn(ConstantInt::get(TopRegion->getCanonicalIVType(), 1)); auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC, ALMMultiplier}, DL, "active.lane.mask.entry"); @@ -2503,7 +2503,7 @@ void VPlanTransforms::addActiveLaneMask( } else { VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); VPValue *ALMMultiplier = Plan.getOrAddLiveIn( - ConstantInt::get(LoopRegion->getCanonicalIV()->getScalarType(), 1)); + ConstantInt::get(LoopRegion->getCanonicalIVType(), 1)); LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier}, @@ -2775,7 +2775,7 @@ void VPlanTransforms::addExplicitVectorLength( VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); auto *CanonicalIVPHI = LoopRegion->getCanonicalIV(); - auto *CanIVTy = CanonicalIVPHI->getScalarType(); + auto *CanIVTy = LoopRegion->getCanonicalIVType(); VPValue *StartV = CanonicalIVPHI->getStartValue(); // Create the ExplicitVectorLengthPhi recipe in the main loop. 
@@ -4336,10 +4336,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, VPBuilder PHBuilder(Plan.getVectorPreheader()); VPValue *UF = Plan.getOrAddLiveIn( - ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF())); + ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF())); if (VF.isScalable()) { VPValue *VScale = PHBuilder.createElementCount( - CanIV->getScalarType(), ElementCount::getScalable(1)); + VectorLoop->getCanonicalIVType(), ElementCount::getScalable(1)); VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF}); Inc->setOperand(1, VScaleUF); Plan.getVF().replaceAllUsesWith(VScale); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index cfd1a741ee841..f15113c6293bc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -69,8 +69,7 @@ class UnrollState { VPBasicBlock::iterator InsertPtForPhi); VPValue *getConstantVPV(unsigned Part) { - Type *CanIVIntTy = - Plan.getVectorLoopRegion()->getCanonicalIV()->getScalarType(); + Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType(); return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part)); } From 1c060a81aa534f10d994c2bfd2843a8d69ceb267 Mon Sep 17 00:00:00 2001 From: LU-JOHN Date: Fri, 31 Oct 2025 22:26:28 -0500 Subject: [PATCH 413/539] [AMDGPU][NFC] Refactor SCC optimization (#165871) Refactor SCC optimization --------- Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 84 +++++++++++++------------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d930a21c2d7f5..d9f76c9a59d00 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10618,6 +10618,42 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, return false; } +// SCC is already 
valid after SCCValid. +// SCCRedefine will redefine SCC to the same value already available after +// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and +// update kill/dead flags if necessary. +static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, + const SIRegisterInfo &RI) { + MachineInstr *KillsSCC = nullptr; + for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()), + SCCRedefine->getIterator())) { + if (MI.modifiesRegister(AMDGPU::SCC, &RI)) + return false; + if (MI.killsRegister(AMDGPU::SCC, &RI)) + KillsSCC = &MI; + } + if (MachineOperand *SccDef = + SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) + SccDef->setIsDead(false); + if (KillsSCC) + KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); + SCCRedefine->eraseFromParent(); + return true; +} + +static bool foldableSelect(const MachineInstr &Def) { + if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 && + Def.getOpcode() != AMDGPU::S_CSELECT_B64) + return false; + bool Op1IsNonZeroImm = + Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0; + bool Op2IsZeroImm = + Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0; + if (!Op1IsNonZeroImm || !Op2IsZeroImm) + return false; + return true; +} + bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, @@ -10637,19 +10673,6 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (!Def || Def->getParent() != CmpInstr.getParent()) return false; - const auto foldableSelect = [](MachineInstr *Def) -> bool { - if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 || - Def->getOpcode() == AMDGPU::S_CSELECT_B64) { - bool Op1IsNonZeroImm = - Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0; - bool Op2IsZeroImm = - Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0; - if (Op1IsNonZeroImm && Op2IsZeroImm) - return true; - } - return false; - }; - // For 
S_OP that set SCC = DST!=0, do the transformation // // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...) @@ -10660,24 +10683,12 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero // imm), 0) - if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def)) + if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def)) return false; - MachineInstr *KillsSCC = nullptr; - for (MachineInstr &MI : - make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { - if (MI.modifiesRegister(AMDGPU::SCC, &RI)) - return false; - if (MI.killsRegister(AMDGPU::SCC, &RI)) - KillsSCC = &MI; - } + if (!optimizeSCC(Def, &CmpInstr, RI)) + return false; - if (MachineOperand *SccDef = - Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) - SccDef->setIsDead(false); - if (KillsSCC) - KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); - CmpInstr.eraseFromParent(); return true; }; @@ -10755,21 +10766,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg)) return false; - MachineInstr *KillsSCC = nullptr; - for (MachineInstr &MI : - make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { - if (MI.modifiesRegister(AMDGPU::SCC, &RI)) - return false; - if (MI.killsRegister(AMDGPU::SCC, &RI)) - KillsSCC = &MI; - } - - MachineOperand *SccDef = - Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr); - SccDef->setIsDead(false); - if (KillsSCC) - KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); - CmpInstr.eraseFromParent(); + if (!optimizeSCC(Def, &CmpInstr, RI)) + return false; if (!MRI->use_nodbg_empty(DefReg)) { assert(!IsReversedCC); From 9194fecc6cf2394c8cded14c97a288d5139927ba Mon Sep 17 00:00:00 2001 From: Erick Velez Date: Fri, 31 Oct 2025 20:38:48 -0700 Subject: [PATCH 414/539] [clang-doc] create a separate cmake file for clang-doc's lit tests (#165935) To avoid 
depending on all of the tools in clang-tools-extra, the `check-clang-extra-clang-doc` target is specialized in its own CMake file in clang-tools-extra/test/clang-doc. This eliminates around 800 files to be processed when building that target, plus linking every tool. Similar to [#155929](https://github.com/llvm/llvm-project/pull/155929). --- clang-tools-extra/test/CMakeLists.txt | 3 +++ clang-tools-extra/test/clang-doc/CMakeLists.txt | 7 +++++++ 2 files changed, 10 insertions(+) create mode 100644 clang-tools-extra/test/clang-doc/CMakeLists.txt diff --git a/clang-tools-extra/test/CMakeLists.txt b/clang-tools-extra/test/CMakeLists.txt index a70d2ef2d92f2..78447e7a00db8 100644 --- a/clang-tools-extra/test/CMakeLists.txt +++ b/clang-tools-extra/test/CMakeLists.txt @@ -87,4 +87,7 @@ add_lit_testsuite(check-clang-extra "Running clang-tools-extra/test" add_lit_testsuites(CLANG-EXTRA ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${CLANG_TOOLS_TEST_DEPS} + SKIP "^clang-doc" ) + +add_subdirectory(clang-doc) diff --git a/clang-tools-extra/test/clang-doc/CMakeLists.txt b/clang-tools-extra/test/clang-doc/CMakeLists.txt new file mode 100644 index 0000000000000..4446b2a3c897f --- /dev/null +++ b/clang-tools-extra/test/clang-doc/CMakeLists.txt @@ -0,0 +1,7 @@ +# Specialize the clang-doc target to avoid building other projects +add_lit_testsuite(check-clang-extra-clang-doc "Running clang-doc tests" + ${CMAKE_CURRENT_BINARY_DIR} + EXCLUDE_FROM_CHECK_ALL + DEPENDS clang-doc + DEPENDS ${LLVM_UTILS_DEPS} +) From 9f7bf9abce7d4bfc33f7af2d9859a4beb1975057 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 1 Nov 2025 04:12:59 +0000 Subject: [PATCH 415/539] [VPlan] Add getConstantInt helpers for constant int creation (NFC). Add getConstantInt helper methods to VPlan to simplify the common pattern of creating constant integer live-ins. Suggested as follow-up in https://github.com/llvm/llvm-project/pull/164127. 
--- .../Vectorize/LoopVectorizationPlanner.h | 3 +- .../Transforms/Vectorize/LoopVectorize.cpp | 18 +++---- llvm/lib/Transforms/Vectorize/VPlan.h | 24 +++++++--- .../Vectorize/VPlanConstruction.cpp | 7 ++- .../Transforms/Vectorize/VPlanTransforms.cpp | 48 ++++++++----------- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 16 +++---- .../Vectorize/VPlanVerifierTest.cpp | 15 +++--- 7 files changed, 61 insertions(+), 70 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 3fed003282f2b..5298728e555ab 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -280,8 +280,7 @@ class VPBuilder { VPValue *createElementCount(Type *Ty, ElementCount EC) { VPlan &Plan = *getInsertBlock()->getPlan(); - VPValue *RuntimeEC = - Plan.getOrAddLiveIn(ConstantInt::get(Ty, EC.getKnownMinValue())); + VPValue *RuntimeEC = Plan.getConstantInt(Ty, EC.getKnownMinValue()); if (EC.isScalable()) { VPValue *VScale = createNaryOp(VPInstruction::VScale, {}, Ty); RuntimeEC = EC.getKnownMinValue() == 1 diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 25bf49db0e073..e5c3f17860103 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7752,8 +7752,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, if (CM.isPredicatedInst(I)) { SmallVector Ops(Operands); VPValue *Mask = getBlockInMask(Builder.getInsertBlock()); - VPValue *One = - Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false)); + VPValue *One = Plan.getConstantInt(I->getType(), 1u); auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc()); Ops[1] = SafeRHS; return new VPWidenRecipe(*I, Ops); @@ -7806,11 +7805,10 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, } case Instruction::ExtractValue: { 
SmallVector NewOps(Operands); - Type *I32Ty = IntegerType::getInt32Ty(I->getContext()); auto *EVI = cast(I); assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index"); unsigned Idx = EVI->getIndices()[0]; - NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false))); + NewOps.push_back(Plan.getConstantInt(32, Idx)); return new VPWidenRecipe(*I, NewOps); } }; @@ -8179,8 +8177,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, "Expected an ADD or SUB operation for predicated partial " "reductions (because the neutral element in the mask is zero)!"); Cond = getBlockInMask(Builder.getInsertBlock()); - VPValue *Zero = - Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0)); + VPValue *Zero = Plan.getConstantInt(Reduction->getType(), 0); BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc()); } return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond, @@ -8643,7 +8640,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs && CurrentLinkI->getOpcode() == Instruction::Sub) { Type *PhiTy = PhiR->getUnderlyingValue()->getType(); - auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0)); + auto *Zero = Plan->getConstantInt(PhiTy, 0); VPWidenRecipe *Sub = new VPWidenRecipe( Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {}, VPIRMetadata(), CurrentLinkI->getDebugLoc()); @@ -8857,8 +8854,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( ToDelete.push_back(Select); // Convert the reduction phi to operate on bools. 
- PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( - OrigLoop->getHeader()->getContext()))); + PhiR->setOperand(0, Plan->getFalse()); continue; } @@ -8880,9 +8876,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( unsigned ScaleFactor = RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr()) .value_or(1); - Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext()); - auto *ScaleFactorVPV = - Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor)); + auto *ScaleFactorVPV = Plan->getConstantInt(32, ScaleFactor); VPValue *StartV = PHBuilder.createNaryOp( VPInstruction::ReductionStartVector, {PhiR->getStartValue(), Iden, ScaleFactorVPV}, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 1504acfcf7e52..08c9c15a6b0fc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -4393,15 +4393,25 @@ class VPlan { } /// Return a VPValue wrapping i1 true. - VPValue *getTrue() { - LLVMContext &Ctx = getContext(); - return getOrAddLiveIn(ConstantInt::getTrue(Ctx)); - } + VPValue *getTrue() { return getConstantInt(1, 1); } /// Return a VPValue wrapping i1 false. - VPValue *getFalse() { - LLVMContext &Ctx = getContext(); - return getOrAddLiveIn(ConstantInt::getFalse(Ctx)); + VPValue *getFalse() { return getConstantInt(1, 0); } + + /// Return a VPValue wrapping a ConstantInt with the given type and value. + VPValue *getConstantInt(Type *Ty, uint64_t Val, bool IsSigned = false) { + return getOrAddLiveIn(ConstantInt::get(Ty, Val, IsSigned)); + } + + /// Return a VPValue wrapping a ConstantInt with the given bitwidth and value. + VPValue *getConstantInt(unsigned BitWidth, uint64_t Val, + bool IsSigned = false) { + return getConstantInt(APInt(BitWidth, Val, IsSigned)); + } + + /// Return a VPValue wrapping a ConstantInt with the given APInt value. 
+ VPValue *getConstantInt(const APInt &Val) { + return getOrAddLiveIn(ConstantInt::get(getContext(), Val)); } /// Return the live-in VPValue for \p V, if there is one or nullptr otherwise. diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 65688a3f0b6be..1a66d2049a8db 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -612,8 +612,7 @@ void VPlanTransforms::addMiddleCheck(VPlan &Plan, if (!RequiresScalarEpilogueCheck) Cmp = Plan.getFalse(); else if (TailFolded) - Cmp = Plan.getOrAddLiveIn( - ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext()))); + Cmp = Plan.getTrue(); else Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(), &Plan.getVectorTripCount(), LatchDL, "cmp.n"); @@ -712,8 +711,8 @@ void VPlanTransforms::addMinimumIterationCheck( // additional overflow check is required before entering the vector loop. // Get the maximum unsigned value for the type. 
- VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get( - TripCountTy, cast(TripCountTy)->getMask())); + VPValue *MaxUIntTripCount = + Plan.getConstantInt(cast(TripCountTy)->getMask()); VPValue *DistanceToMax = Builder.createNaryOp( Instruction::Sub, {MaxUIntTripCount, TripCountVPV}, DebugLoc::getUnknown()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index d491d5669ef18..6a8231bd34a11 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -699,8 +699,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { continue; const InductionDescriptor &ID = PtrIV->getInductionDescriptor(); - VPValue *StartV = - Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0)); + VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0); VPValue *StepV = PtrIV->getOperand(1); VPScalarIVStepsRecipe *Steps = createScalarIVSteps( Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr, @@ -836,7 +835,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, // changed it means the exit is using the incremented value, so we need to // add the step. 
if (Incoming != WideIV) { - VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1)); + VPValue *One = Plan.getConstantInt(CanonicalIVType, 1); EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL); } @@ -882,7 +881,7 @@ static VPValue *optimizeLatchExitInductionUser( return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape"); if (ScalarTy->isPointerTy()) { Type *StepTy = TypeInfo.inferScalarType(Step); - auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0)); + auto *Zero = Plan.getConstantInt(StepTy, 0); return B.createPtrAdd(EndValue, B.createNaryOp(Instruction::Sub, {Zero, Step}), DebugLoc::getUnknown(), "ind.escape"); @@ -1574,9 +1573,9 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, continue; // Update IV operands and comparison bound to use new narrower type. - auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0)); + auto *NewStart = Plan.getConstantInt(NewIVTy, 0); WideIV->setStartValue(NewStart); - auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1)); + auto *NewStep = Plan.getConstantInt(NewIVTy, 1); WideIV->setStepValue(NewStep); auto *NewBTC = new VPWidenCastRecipe( @@ -1695,8 +1694,7 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, // When using wide lane masks, the return type of the get.active.lane.mask // intrinsic is VF x UF (last operand). - VPValue *ALMMultiplier = - Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF)); + VPValue *ALMMultiplier = Plan.getConstantInt(64, UF); EntryALM->setOperand(2, ALMMultiplier); LoopALM->setOperand(2, ALMMultiplier); @@ -2403,7 +2401,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( // Create the active lane mask instruction in the VPlan preheader. 
VPValue *ALMMultiplier = - Plan.getOrAddLiveIn(ConstantInt::get(TopRegion->getCanonicalIVType(), 1)); + Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1); auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC, ALMMultiplier}, DL, "active.lane.mask.entry"); @@ -2790,8 +2788,7 @@ void VPlanTransforms::addExplicitVectorLength( if (MaxSafeElements) { // Support for MaxSafeDist for correct loop emission. - VPValue *AVLSafe = - Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements)); + VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements); VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe); AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(), "safe_avl"); @@ -2904,9 +2901,8 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext); VPBuilder Builder(LatchExitingBr); - VPValue *Cmp = - Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, - Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy))); + VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, + Plan.getConstantInt(AVLTy, 0)); Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp); LatchExitingBr->eraseFromParent(); } @@ -2930,8 +2926,7 @@ void VPlanTransforms::replaceSymbolicStrides( // Only handle constant strides for now. continue; - auto *CI = - Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst)); + auto *CI = Plan.getConstantInt(*StrideConst); if (VPValue *StrideVPV = Plan.getLiveIn(StrideV)) StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); @@ -2946,7 +2941,7 @@ void VPlanTransforms::replaceSymbolicStrides( unsigned BW = U->getType()->getScalarSizeInBits(); APInt C = isa(U) ? 
StrideConst->sext(BW) : StrideConst->zext(BW); - VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C)); + VPValue *CI = Plan.getConstantInt(C); StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); } RewriteMap[StrideV] = PSE.getSCEV(StrideV); @@ -3125,8 +3120,7 @@ void VPlanTransforms::createInterleaveGroups( DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) * IG->getIndex(IRInsertPos), /*IsSigned=*/true); - VPValue *OffsetVPV = - Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset)); + VPValue *OffsetVPV = Plan.getConstantInt(-Offset); VPBuilder B(InsertPos); Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW); } @@ -3867,8 +3861,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan, VPBuilder Builder(VectorPH, VectorPH->begin()); auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount()); auto *TCMO = Builder.createNaryOp( - Instruction::Sub, - {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))}, + Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)}, DebugLoc::getCompilerGenerated(), "trip.count.minus.1"); BTC->replaceAllUsesWith(TCMO); } @@ -3993,9 +3986,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, if (TailByMasking) { TC = Builder.createNaryOp( Instruction::Add, - {TC, Builder.createNaryOp( - Instruction::Sub, - {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})}, + {TC, Builder.createNaryOp(Instruction::Sub, + {Step, Plan.getConstantInt(TCTy, 1)})}, DebugLoc::getCompilerGenerated(), "n.rnd.up"); } @@ -4017,8 +4009,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, if (RequiresScalarEpilogue) { assert(!TailByMasking && "requiring scalar epilogue is not supported with fail folding"); - VPValue *IsZero = Builder.createICmp( - CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0))); + VPValue *IsZero = + Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0)); R = 
Builder.createSelect(IsZero, Step, R); } @@ -4056,7 +4048,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, } VF.replaceAllUsesWith(RuntimeVF); - VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF())); + VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF()); VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF}); VFxUF.replaceAllUsesWith(MulByUF); } @@ -4346,7 +4338,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, } else { Inc->setOperand(1, UF); Plan.getVF().replaceAllUsesWith( - Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1))); + Plan.getConstantInt(CanIV->getScalarType(), 1)); } removeDeadRecipes(Plan); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index f15113c6293bc..d6a002825e38d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -68,9 +68,9 @@ class UnrollState { void unrollWidenInductionByUF(VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi); - VPValue *getConstantVPV(unsigned Part) { + VPValue *getConstantInt(unsigned Part) { Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType(); - return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part)); + return Plan.getConstantInt(CanIVIntTy, Part); } public: @@ -137,7 +137,7 @@ void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) { for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) { remapOperands(&PartIR, Part); if (auto *ScalarIVSteps = dyn_cast(&PartIR)) { - ScalarIVSteps->addOperand(getConstantVPV(Part)); + ScalarIVSteps->addOperand(getConstantInt(Part)); } addRecipeForPart(&Part0R, &PartIR, Part); @@ -249,7 +249,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, for (unsigned Part = 1; Part != UF; ++Part) VPV2Parts[VPI][Part - 1] = StartV; } - Copy->addOperand(getConstantVPV(Part)); + 
Copy->addOperand(getConstantInt(Part)); } else { assert(isa(R) && "unexpected header phi recipe not needing unrolled part"); @@ -318,7 +318,7 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Copy) || match(Copy, m_VPInstruction())) - Copy->addOperand(getConstantVPV(Part)); + Copy->addOperand(getConstantInt(Part)); if (isa(R)) Copy->setOperand(0, R.getOperand(0)); @@ -474,8 +474,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, if (LaneDefs != Def2LaneDefs.end()) return LaneDefs->second[Lane.getKnownLane()]; - VPValue *Idx = - Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane()); return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); } @@ -509,8 +508,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, cast(Op)->getOperand(Lane.getKnownLane())); continue; } - VPValue *Idx = - Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane()); VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); NewOps.push_back(Ext); } diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp index 50ad4d5fa61ff..46802826fe090 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp @@ -21,7 +21,7 @@ using VPVerifierTest = VPlanTestBase; namespace { TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) { VPlan &Plan = getPlan(); - VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0)); + VPValue *Zero = Plan.getConstantInt(32, 0); VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero}); VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI}); auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {}); @@ -56,7 +56,7 @@ 
TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) { TEST_F(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) { VPlan &Plan = getPlan(); - VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0)); + VPValue *Zero = Plan.getConstantInt(32, 0); VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero}); VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI}); auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {}); @@ -184,7 +184,7 @@ TEST_F(VPVerifierTest, VPPhiIncomingValueDoesntDominateIncomingBlock) { TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { VPlan &Plan = getPlan(); - VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0)); + VPValue *Zero = Plan.getConstantInt(32, 0); VPInstruction *I1 = new VPInstruction(Instruction::Add, {Zero}); auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {}); VPInstruction *BranchOnCond = @@ -218,7 +218,7 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) { VPlan &Plan = getPlan(); - VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0)); + VPValue *Zero = Plan.getConstantInt(32, 0); VPInstruction *I1 = new VPInstruction(Instruction::Add, {Zero}); auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {}); VPInstruction *BranchOnCond = @@ -259,7 +259,7 @@ TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) { VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""); - VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0)); + VPValue *Zero = Plan.getConstantInt(32, 0); auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {}); VPBB2->appendRecipe(CanIV); @@ -288,7 +288,7 @@ TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) { TEST_F(VPVerifierTest, NonHeaderPHIInHeader) { VPlan &Plan = getPlan(); - VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0)); + VPValue *Zero = Plan.getConstantInt(32, 0); auto 
*CanIV = new VPCanonicalIVPHIRecipe(Zero, {}); auto *BranchOnCond = new VPInstruction(VPInstruction::BranchOnCond, {CanIV}); @@ -351,8 +351,7 @@ TEST_F(VPIRVerifierTest, testVerifyIRPhi) { BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); auto Plan = buildVPlan(LoopHeader); - Plan->getExitBlocks()[0]->front().addOperand( - Plan->getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(*Ctx), 0))); + Plan->getExitBlocks()[0]->front().addOperand(Plan->getConstantInt(32, 0)); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); From 91daacc1a4980c43c02c41b0bead79c12e17382c Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 31 Oct 2025 21:41:53 -0700 Subject: [PATCH 416/539] AMDGPU/GlobalISel: Fix vgpr abs tests using SGPR return (#165965) Fix the calling convention to use normal functions instead of amdgpu_cs --- .../CodeGen/AMDGPU/GlobalISel/llvm.abs.ll | 199 ++++++++---------- 1 file changed, 83 insertions(+), 116 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll index 02d0e521e3b00..6facdfdec64ae 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll @@ -104,109 +104,110 @@ define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) { ret <4 x i32> %res } -define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) { +define i16 @abs_vgpr_i16(i16 %arg) { ; GFX6-LABEL: abs_vgpr_i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_i16: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0 ; GFX8-NEXT: v_max_i16_e32 v0, v0, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: 
; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_i16: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_nc_u16 v1, 0, v0 ; GFX10-NEXT: v_max_i16 v0, v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_max_i16 v0, v0, v1 -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call i16 @llvm.abs.i16(i16 %arg, i1 false) ret i16 %res } -define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) { +define i32 @abs_vgpr_i32(i32 %arg) { ; GFX6-LABEL: abs_vgpr_i32: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_i32: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0, v0 ; GFX8-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call i32 @llvm.abs.i32(i32 %arg, i1 false) ret i32 %res } -define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) { +define i64 @abs_vgpr_i64(i64 %arg) { ; GFX6-LABEL: abs_vgpr_i64: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_i64: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: 
abs_vgpr_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_mov_b32_e32 v3, v2 @@ -214,17 +215,15 @@ define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX1250-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call i64 @llvm.abs.i64(i64 %arg, i1 false) ret i64 %res } -define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { +define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX6-LABEL: abs_vgpr_v4i32: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v4 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 @@ -233,14 +232,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX6-NEXT: v_max_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 ; GFX6-NEXT: v_max_i32_e32 v3, v3, v4 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 -; GFX6-NEXT: v_readfirstlane_b32 s3, v3 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v4i32: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v0 ; GFX8-NEXT: v_max_i32_e32 v0, v0, v4 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v1 @@ -249,14 +245,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX8-NEXT: v_max_i32_e32 v2, v2, v4 ; GFX8-NEXT: v_sub_u32_e32 
v4, vcc, 0, v3 ; GFX8-NEXT: v_max_i32_e32 v3, v3, v4 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s2, v2 -; GFX8-NEXT: v_readfirstlane_b32 s3, v3 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v4i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v4, 0, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, 0, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 0, v2 @@ -265,14 +258,12 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX10-NEXT: v_max_i32_e32 v1, v1, v5 ; GFX10-NEXT: v_max_i32_e32 v2, v2, v6 ; GFX10-NEXT: v_max_i32_e32 v3, v3, v7 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v4i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1 ; GFX1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -281,13 +272,7 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_max_i32_e32 v2, v2, v6 ; GFX1250-NEXT: v_max_i32_e32 v3, v3, v7 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1250-NEXT: v_readfirstlane_b32 s3, v3 -; GFX1250-NEXT: ; return to shader part epilog +; 
GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false) ret <4 x i32> %res } @@ -304,44 +289,43 @@ define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) { ret <2 x i8> %res } -define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { +define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { ; GFX6-LABEL: abs_vgpr_v2i8: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GFX6-NEXT: v_max_i32_e32 v1, v1, v2 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v2i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v2i8: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-NEXT: v_sub_nc_u16 v2, 0, v0 ; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1 ; GFX10-NEXT: v_max_i16 v0, v0, v2 ; GFX10-NEXT: v_max_i16 v1, v1, v3 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: ; 
return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v2i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -350,10 +334,7 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_max_i16 v0, v0, v2 ; GFX1250-NEXT: v_max_i16 v1, v1, v3 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false) ret <2 x i8> %res } @@ -372,9 +353,10 @@ define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) { ret <3 x i8> %res } -define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { +define <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX6-LABEL: abs_vgpr_v3i8: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 @@ -384,13 +366,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX6-NEXT: v_max_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, v2, v3 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v3i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -398,13 +378,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX8-NEXT: v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s2, v2 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v3i8: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-NEXT: v_bfe_i32 v2, v2, 0, 8 @@ -414,13 +392,12 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX10-NEXT: v_max_i16 v0, v0, v3 ; GFX10-NEXT: v_max_i16 v1, v1, v4 ; GFX10-NEXT: v_max_i16 v2, v2, v5 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v3i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8 @@ -433,12 +410,7 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_max_i16 v1, v1, v4 ; GFX1250-NEXT: v_max_i16 v2, v2, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; 
GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1250-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false) ret <3 x i8> %res } @@ -485,44 +457,44 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) { ret <2 x i16> %res } -define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) { +define <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) { ; GFX6-LABEL: abs_vgpr_v2i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GFX6-NEXT: v_max_i32_e32 v1, v1, v2 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v2i16: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0 ; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v1, v0, v1 ; GFX8-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v2i16: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, 0, v0 ; GFX10-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v2i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_pk_sub_i16 v1, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false) ret <2 x i16> %res } @@ -576,9 +548,10 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) { ret <3 x i16> %res } -define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { +define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; GFX6-LABEL: abs_vgpr_v3i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 @@ -588,13 +561,11 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; GFX6-NEXT: v_max_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, v2, v3 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v3i16: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0 ; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -603,31 +574,27 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; GFX8-NEXT: v_max_i16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v4 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: ; return to shader part 
epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v3i16: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v2, 0, v0 ; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1 ; GFX10-NEXT: v_pk_max_i16 v0, v0, v2 ; GFX10-NEXT: v_max_i16 v1, v1, v3 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v3i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0 ; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2 ; GFX1250-NEXT: v_max_i16 v1, v1, v3 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false) ret <3 x i16> %res } From 15ef2a50238ae29bb04072de5c9256174fde0c40 Mon Sep 17 00:00:00 2001 From: Daniel Chen Date: Sat, 1 Nov 2025 01:16:19 -0400 Subject: [PATCH 417/539] Use the RHS of a pointer assignment inside of FORALL if it is already of `boxType` instead of `convertToBox` again. (#165771) Fixes #165055. 
--- .../Optimizer/Builder/TemporaryStorage.cpp | 8 +--- .../forall-pointer-assignment-codegen.fir | 6 +-- ...phic.f90 => forall-pointer-assignment.f90} | 46 ++++++++++++++++++- 3 files changed, 49 insertions(+), 11 deletions(-) rename flang/test/Lower/{forall-polymorphic.f90 => forall-pointer-assignment.f90} (85%) diff --git a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp index 7e329e357d7b3..5db40aff91878 100644 --- a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp +++ b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp @@ -258,13 +258,9 @@ void fir::factory::AnyVariableStack::pushValue(mlir::Location loc, fir::FirOpBuilder &builder, mlir::Value variable) { hlfir::Entity entity{variable}; - mlir::Type storageElementType = - hlfir::getFortranElementType(retValueBox.getType()); - auto [box, maybeCleanUp] = - hlfir::convertToBox(loc, builder, entity, storageElementType); + mlir::Value box = + hlfir::genVariableBox(loc, builder, entity, entity.getBoxType()); fir::runtime::genPushDescriptor(loc, builder, opaquePtr, fir::getBase(box)); - if (maybeCleanUp) - (*maybeCleanUp)(); } void fir::factory::AnyVariableStack::resetFetchPosition( diff --git a/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir b/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir index 1d198765aff9e..855b62ca0ed39 100644 --- a/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir +++ b/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir @@ -91,10 +91,8 @@ func.func @test_need_to_save_rhs(%n: i64, %arg1: !fir.box>>}>>>, i64) -> !fir.ref>>}>> // CHECK: %[[VAL_22:.*]] = hlfir.designate %[[VAL_21]]{"p"} {fortran_attrs = #fir.var_attrs} : (!fir.ref>>}>>) -> !fir.ref>>> // CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref>>> -// CHECK: %[[VAL_24:.*]] = fir.box_addr %[[VAL_23]] : (!fir.box>>) -> !fir.ptr> -// CHECK: %[[VAL_25:.*]] = fir.embox %[[VAL_24]] : 
(!fir.ptr>) -> !fir.box> -// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (!fir.box>) -> !fir.box -// CHECK: fir.call @_FortranAPushDescriptor(%[[VAL_16]], %[[VAL_26]]) : (!fir.llvm_ptr, !fir.box) -> () +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (!fir.box>>) -> !fir.box +// CHECK: fir.call @_FortranAPushDescriptor(%[[VAL_16]], %[[VAL_24]]) : (!fir.llvm_ptr, !fir.box) -> () // CHECK: } // CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_4]] : (i64) -> index // CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_0]] : (i64) -> index diff --git a/flang/test/Lower/forall-polymorphic.f90 b/flang/test/Lower/forall-pointer-assignment.f90 similarity index 85% rename from flang/test/Lower/forall-polymorphic.f90 rename to flang/test/Lower/forall-pointer-assignment.f90 index 656b6ecf00628..ec142e3f13ebc 100644 --- a/flang/test/Lower/forall-polymorphic.f90 +++ b/flang/test/Lower/forall-pointer-assignment.f90 @@ -1,4 +1,4 @@ -! Test lower of FORALL polymorphic pointer assignment +! Test lower of FORALL pointer assignment ! RUN: bbc -emit-fir %s -o - | FileCheck %s @@ -128,3 +128,47 @@ subroutine forallPolymorphic3() ! CHECK: } end subroutine forallPolymorphic3 + + +!! Test the LHS of a pointer assignment gets the isPointer flag from the +!! RHS that is a reference to a function that returns a pointer. +! CHECK-LABEL: c.func @_QPforallpointerassignment1 + subroutine forallPointerAssignment1() + type base + real, pointer :: data => null() + end type + + interface + pure function makeData (i) + real, pointer :: makeData + integer*4, intent(in) :: i + end function + end interface + + type(base) :: co1(10) + + forall (i=1:10) + co1(i)%data => makeData (i) + end forall + +! CHECK: %[[V_3:[0-9]+]] = fir.alloca i64 +! CHECK: %[[V_3:[0-9]+]] = fir.alloca i32 {bindc_name = "i"} +! CHECK: %[[V_4:[0-9]+]] = fir.alloca !fir.box> {bindc_name = ".result"} +! CHECK: %[[V_25:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index +! CHECK: %[[V_26:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index +! 
CHECK: %[[V_27:[0-9]+]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref> +! CHECK: %[[V_28:[0-9]+]] = fir.convert %[[V_27]] : (!fir.ref>) -> !fir.ref +! CHECK: %[[V_29:[0-9]+]] = fir.call @_FortranACreateDescriptorStack(%[[V_28]], %c{{.*}}) : (!fir.ref, i32) -> !fir.llvm_ptr +! CHECK: fir.do_loop %arg0 = %[[V_25]] to %[[V_26]] step %c1 +! CHECK: { +! CHECK: %[[V_32:[0-9]+]] = fir.convert %arg0 : (index) -> i32 +! CHECK: fir.store %[[V_32]] to %[[V_3]] : !fir.ref +! CHECK: %[[V_33:[0-9]+]] = fir.call @_QPmakedata(%[[V_3]]) proc_attrs fastmath : (!fir.ref) -> !fir.box> +! CHECK: fir.save_result %[[V_33]] to %[[V_4]] : !fir.box>, !fir.ref>> +! CHECK: %[[V_34:[0-9]+]] = fir.declare %[[V_4]] {uniq_name = ".tmp.func_result"} : (!fir.ref>>) -> !fir.ref>> +! CHECK: %[[V_35:[0-9]+]] = fir.load %[[V_34]] : !fir.ref>> +! CHECK: %[[V_36:[0-9]+]] = fir.convert %[[V_35]] : (!fir.box>) -> !fir.box +! CHECK: fir.call @_FortranAPushDescriptor(%[[V_29]], %[[V_36]]) : (!fir.llvm_ptr, !fir.box) -> () +! CHECK: } + + end subroutine forallPointerAssignment1 From d6ec918fc9a1e8d0307ef1464188869f4c0304d4 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 31 Oct 2025 22:41:52 -0700 Subject: [PATCH 418/539] [M68k] Use non-deprecated CasesLower --- llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp index fe83dc6e1abfb..51bafe4a4c56c 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp @@ -49,7 +49,7 @@ class M68kAsmBackend : public MCAsmBackend { M68kAsmBackend(const Target &T, const MCSubtargetInfo &STI) : MCAsmBackend(llvm::endianness::big), Allows32BitBranch(llvm::StringSwitch(STI.getCPU()) - .CasesLower("m68020", "m68030", "m68040", true) + .CasesLower({"m68020", "m68030", "m68040"}, true) .Default(false)) {} void applyFixup(const MCFragment &, 
const MCFixup &, const MCValue &, From 2507163e1f6372a53a1d2033116b3fe9dac716eb Mon Sep 17 00:00:00 2001 From: SahilPatidar Date: Sat, 1 Nov 2025 11:19:28 +0530 Subject: [PATCH 419/539] REAPPLY [ORC] Add automatic shared library resolver for unresolved symbols. (attempt 2) (#165360) This PR reapplies the changes previously introduced in https://github.com/llvm/llvm-project/pull/148410. It introduces a redesigned and rebuilt Cling-based auto-loading workaround that enables scanning libraries and resolving unresolved symbols within those libraries. Fix build failures in LibraryResolverTest and silence symlink warning This commit resolves issues observed in the build bots: 1. Silences the -Wunused-result warning by handling the return value of ::symlink in LibraryResolverTest.cpp. Previously, ignoring the return value triggered compiler warnings. 2. Fixes a linker error in OrcJITTests caused by an undefined symbol: llvm::yaml::convertYAML. The test setup in LibraryResolverTest.cpp now correctly links against the required LLVM YAML library symbols. 3. Fixes persistent build bot failure caused by a path difference issue. This resolves the build failures for PR https://github.com/llvm/llvm-project/pull/148410 on the affected bots. 
--- .../ExecutionEngine/Orc/Shared/SymbolFilter.h | 173 +++ .../Orc/TargetProcess/LibraryResolver.h | 511 ++++++++ .../Orc/TargetProcess/LibraryScanner.h | 474 +++++++ .../Orc/TargetProcess/CMakeLists.txt | 4 + .../Orc/TargetProcess/LibraryResolver.cpp | 370 ++++++ .../Orc/TargetProcess/LibraryScanner.cpp | 1161 +++++++++++++++++ .../ExecutionEngine/Orc/CMakeLists.txt | 1 + .../ExecutionEngine/Orc/Inputs/A/A_linux.yaml | 460 +++++++ .../ExecutionEngine/Orc/Inputs/A/A_macho.yaml | 723 ++++++++++ .../ExecutionEngine/Orc/Inputs/B/B_linux.yaml | 460 +++++++ .../ExecutionEngine/Orc/Inputs/B/B_macho.yaml | 723 ++++++++++ .../ExecutionEngine/Orc/Inputs/C/C_linux.yaml | 450 +++++++ .../ExecutionEngine/Orc/Inputs/C/C_macho.yaml | 870 ++++++++++++ .../ExecutionEngine/Orc/Inputs/Z/Z_linux.yaml | 460 +++++++ .../ExecutionEngine/Orc/Inputs/Z/Z_macho.yaml | 723 ++++++++++ .../Orc/LibraryResolverTest.cpp | 762 +++++++++++ 16 files changed, 8325 insertions(+) create mode 100644 llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h create mode 100644 llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h create mode 100644 llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h create mode 100644 llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp create mode 100644 llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp create mode 100644 llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_linux.yaml create mode 100644 llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_macho.yaml create mode 100644 llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_linux.yaml create mode 100644 llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_macho.yaml create mode 100644 llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_linux.yaml create mode 100644 llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_macho.yaml create mode 100644 llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_linux.yaml create mode 100644 llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_macho.yaml create mode 
100644 llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h new file mode 100644 index 0000000000000..517089341978a --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h @@ -0,0 +1,173 @@ +//===- SymbolFilter.h - Utilities for Symbol Filtering ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H +#define LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H + +#include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h" + +#include +#include +#include + +namespace llvm { +namespace orc { + +namespace shared { +using SPSBloomFilter = + SPSTuple>; +} + +class BloomFilter { +public: + using HashFunc = std::function; + + BloomFilter() = default; + BloomFilter(BloomFilter &&) noexcept = default; + BloomFilter &operator=(BloomFilter &&) noexcept = default; + BloomFilter(const BloomFilter &) = delete; + BloomFilter &operator=(const BloomFilter &) = delete; + + BloomFilter(uint32_t SymbolCount, float FalsePositiveRate, HashFunc hashFn) + : HashFn(std::move(hashFn)) { + initialize(SymbolCount, FalsePositiveRate); + } + bool isInitialized() const { return Initialized; } + + void add(StringRef Sym) { + assert(Initialized); + addHash(HashFn(Sym)); + } + + bool mayContain(StringRef Sym) const { + return !isEmpty() && testHash(HashFn(Sym)); + } + + bool isEmpty() const { return SymbolCount == 0; } + +private: + friend class shared::SPSSerializationTraits; + static constexpr uint32_t BitsPerEntry = 64; + + bool Initialized = false; + uint32_t SymbolCount = 0; + uint32_t BloomSize = 0; + uint32_t 
BloomShift = 0; + std::vector BloomTable; + HashFunc HashFn; + + void initialize(uint32_t SymCount, float FalsePositiveRate) { + assert(SymCount > 0); + SymbolCount = SymCount; + Initialized = true; + + float ln2 = std::log(2.0f); + float M = -1.0f * SymbolCount * std::log(FalsePositiveRate) / (ln2 * ln2); + BloomSize = static_cast(std::ceil(M / BitsPerEntry)); + BloomShift = std::min(6u, log2ceil(SymbolCount)); + BloomTable.resize(BloomSize, 0); + } + + void addHash(uint32_t Hash) { + uint32_t Hash2 = Hash >> BloomShift; + uint32_t N = (Hash / BitsPerEntry) % BloomSize; + uint64_t Mask = + (1ULL << (Hash % BitsPerEntry)) | (1ULL << (Hash2 % BitsPerEntry)); + BloomTable[N] |= Mask; + } + + bool testHash(uint32_t Hash) const { + uint32_t Hash2 = Hash >> BloomShift; + uint32_t N = (Hash / BitsPerEntry) % BloomSize; + uint64_t Mask = + (1ULL << (Hash % BitsPerEntry)) | (1ULL << (Hash2 % BitsPerEntry)); + return (BloomTable[N] & Mask) == Mask; + } + + static constexpr uint32_t log2ceil(uint32_t V) { + return V <= 1 ? 
0 : 32 - countl_zero(V - 1); + } +}; + +class BloomFilterBuilder { +public: + using HashFunc = BloomFilter::HashFunc; + + BloomFilterBuilder() = default; + + BloomFilterBuilder &setFalsePositiveRate(float Rate) { + assert(Rate > 0.0f && Rate < 1.0f); + FalsePositiveRate = Rate; + return *this; + } + + BloomFilterBuilder &setHashFunction(HashFunc Fn) { + HashFn = std::move(Fn); + return *this; + } + + BloomFilter build(ArrayRef Symbols) const { + assert(!Symbols.empty() && "Cannot build filter from empty symbol list."); + BloomFilter F(static_cast(Symbols.size()), FalsePositiveRate, + HashFn); + for (const auto &Sym : Symbols) + F.add(Sym); + + return F; + } + +private: + float FalsePositiveRate = 0.02f; + HashFunc HashFn = [](StringRef S) -> uint32_t { + uint32_t H = 5381; + for (char C : S) + H = ((H << 5) + H) + static_cast(C); // H * 33 + C + return H; + }; +}; + +namespace shared { + +template <> class SPSSerializationTraits { +public: + static size_t size(const BloomFilter &Filter) { + return SPSBloomFilter::AsArgList::size( + Filter.Initialized, Filter.SymbolCount, Filter.BloomSize, + Filter.BloomShift, Filter.BloomTable); + } + + static bool serialize(SPSOutputBuffer &OB, const BloomFilter &Filter) { + return SPSBloomFilter::AsArgList::serialize( + OB, Filter.Initialized, Filter.SymbolCount, Filter.BloomSize, + Filter.BloomShift, Filter.BloomTable); + } + + static bool deserialize(SPSInputBuffer &IB, BloomFilter &Filter) { + bool IsInitialized; + uint32_t SymbolCount = 0, BloomSize = 0, BloomShift = 0; + std::vector BloomTable; + + if (!SPSBloomFilter::AsArgList::deserialize( + IB, IsInitialized, SymbolCount, BloomSize, BloomShift, BloomTable)) + return false; + + Filter.Initialized = IsInitialized; + Filter.SymbolCount = SymbolCount; + Filter.BloomSize = BloomSize; + Filter.BloomShift = BloomShift; + Filter.BloomTable = std::move(BloomTable); + + return true; + } +}; + +} // end namespace shared +} // end namespace orc +} // end namespace llvm +#endif // 
LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h new file mode 100644 index 0000000000000..91829953a2405 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h @@ -0,0 +1,511 @@ +//===- LibraryResolver.h - Automatic Library Symbol Resolution -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides support for automatically searching symbols across +// dynamic libraries that have not yet been loaded. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H +#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H + +#include "llvm/ADT/FunctionExtras.h" +#include "llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h" +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h" +#include "llvm/Support/Path.h" + +#include +#include +#include + +namespace llvm { +namespace orc { + +/// Manages library metadata and state for symbol resolution. +/// +/// Tracks libraries by load state and kind (user/system), and stores +/// associated Bloom filters and hash maps to speed up symbol lookups. +/// Thread-safe for concurrent access. 
+class LibraryManager { +public: + enum class LibState : uint8_t { Unloaded = 0, Loaded = 1, Queried = 2 }; + + class LibraryInfo { + public: + LibraryInfo(const LibraryInfo &) = delete; + LibraryInfo &operator=(const LibraryInfo &) = delete; + + LibraryInfo(std::string FilePath, LibState S, PathType K, + std::optional Filter = std::nullopt) + : FilePath(std::move(FilePath)), S(S), K(K), Filter(std::move(Filter)) { + } + + StringRef getBasePath() const { return sys::path::parent_path(FilePath); } + StringRef getFileName() const { return sys::path::filename(FilePath); } + + std::string getFullPath() const { return FilePath; } + + void setFilter(BloomFilter F) { + std::lock_guard Lock(Mtx); + if (Filter) + return; + Filter.emplace(std::move(F)); + } + + void ensureFilterBuilt(const BloomFilterBuilder &FB, + ArrayRef Symbols) { + std::lock_guard Lock(Mtx); + if (Filter) + return; + Filter.emplace(FB.build(Symbols)); + } + + bool mayContain(StringRef Symbol) const { + assert(hasFilter()); + std::shared_lock Lock(Mtx); + return Filter->mayContain(Symbol); + } + + bool hasFilter() const { + std::shared_lock Lock(Mtx); + return Filter.has_value(); + } + + LibState getState() const { return S.load(); } + PathType getKind() const { return K; } + + void setState(LibState s) { S.store(s); } + + bool operator==(const LibraryInfo &other) const { + return FilePath == other.FilePath; + } + + private: + std::string FilePath; + std::atomic S; + PathType K; + std::optional Filter; + mutable std::shared_mutex Mtx; + }; + + /// A read-only view of libraries filtered by state and kind. + /// + /// Lets you loop over only the libraries in a map that match a given State + /// and PathType. 
+ class FilteredView { + public: + using Map = StringMap>; + using Iterator = typename Map::const_iterator; + class FilterIterator { + public: + FilterIterator(Iterator it_, Iterator end_, LibState S, PathType K) + : it(it_), end(end_), S(S), K(K) { + advance(); + } + + bool operator!=(const FilterIterator &other) const { + return it != other.it; + } + + const std::shared_ptr &operator*() const { + return it->second; + } + + FilterIterator &operator++() { + ++it; + advance(); + return *this; + } + + private: + void advance() { + for (; it != end; ++it) + if (it->second->getState() == S && it->second->getKind() == K) + break; + } + Iterator it; + Iterator end; + LibState S; + PathType K; + }; + FilteredView(Iterator begin, Iterator end, LibState s, PathType k) + : mapBegin(begin), mapEnd(end), state(s), kind(k) {} + + FilterIterator begin() const { + return FilterIterator(mapBegin, mapEnd, state, kind); + } + + FilterIterator end() const { + return FilterIterator(mapEnd, mapEnd, state, kind); + } + + private: + Iterator mapBegin; + Iterator mapEnd; + LibState state; + PathType kind; + }; + +private: + StringMap> Libraries; + mutable std::shared_mutex Mtx; + +public: + using LibraryVisitor = std::function; + + LibraryManager() = default; + ~LibraryManager() = default; + + bool addLibrary(std::string Path, PathType Kind, + std::optional Filter = std::nullopt) { + std::unique_lock Lock(Mtx); + if (Libraries.count(Path) > 0) + return false; + Libraries.insert({std::move(Path), + std::make_shared(Path, LibState::Unloaded, + Kind, std::move(Filter))}); + return true; + } + + bool hasLibrary(StringRef Path) const { + std::shared_lock Lock(Mtx); + if (Libraries.count(Path) > 0) + return true; + return false; + } + + void removeLibrary(StringRef Path) { + std::unique_lock Lock(Mtx); + auto I = Libraries.find(Path); + if (I == Libraries.end()) + return; + Libraries.erase(I); + } + + void markLoaded(StringRef Path) { + std::unique_lock Lock(Mtx); + if (auto It = 
Libraries.find(Path); It != Libraries.end()) + It->second->setState(LibState::Loaded); + } + + void markQueried(StringRef Path) { + std::unique_lock Lock(Mtx); + if (auto It = Libraries.find(Path); It != Libraries.end()) + It->second->setState(LibState::Queried); + } + + std::shared_ptr getLibrary(StringRef Path) { + std::shared_lock Lock(Mtx); + if (auto It = Libraries.find(Path); It != Libraries.end()) + return It->second; + return nullptr; + } + + FilteredView getView(LibState S, PathType K) const { + std::shared_lock Lock(Mtx); + return FilteredView(Libraries.begin(), Libraries.end(), S, K); + } + + void forEachLibrary(const LibraryVisitor &visitor) const { + std::unique_lock Lock(Mtx); + for (const auto &[_, entry] : Libraries) { + if (!visitor(*entry)) + break; + } + } + + bool isLoaded(StringRef Path) const { + std::unique_lock Lock(Mtx); + if (auto It = Libraries.find(Path.str()); It != Libraries.end()) + return It->second->getState() == LibState::Loaded; + return false; + } + + bool isQueried(StringRef Path) const { + std::unique_lock Lock(Mtx); + if (auto It = Libraries.find(Path.str()); It != Libraries.end()) + return It->second->getState() == LibState::Queried; + return false; + } + + void clear() { + std::unique_lock Lock(Mtx); + Libraries.clear(); + } +}; + +using LibraryInfo = LibraryManager::LibraryInfo; + +struct SearchPlanEntry { + LibraryManager::LibState State; // Loaded, Queried, Unloaded + PathType Type; // User, System +}; + +struct SearchPolicy { + std::vector Plan; + + static SearchPolicy defaultPlan() { + return {{{LibraryManager::LibState::Loaded, PathType::User}, + {LibraryManager::LibState::Queried, PathType::User}, + {LibraryManager::LibState::Unloaded, PathType::User}, + {LibraryManager::LibState::Loaded, PathType::System}, + {LibraryManager::LibState::Queried, PathType::System}, + {LibraryManager::LibState::Unloaded, PathType::System}}}; + } +}; + +struct SymbolEnumeratorOptions { + enum Filter : uint32_t { + None = 0, + 
IgnoreUndefined = 1 << 0, + IgnoreWeak = 1 << 1, + IgnoreIndirect = 1 << 2, + IgnoreHidden = 1 << 3, + IgnoreNonGlobal = 1 << 4 + }; + + static SymbolEnumeratorOptions defaultOptions() { + return {Filter::IgnoreUndefined | Filter::IgnoreWeak | + Filter::IgnoreIndirect}; + } + uint32_t FilterFlags = Filter::None; +}; + +struct SearchConfig { + SearchPolicy Policy; + SymbolEnumeratorOptions Options; + + SearchConfig() + : Policy(SearchPolicy::defaultPlan()), // default plan + Options(SymbolEnumeratorOptions::defaultOptions()) {} +}; + +/// Scans libraries and resolves Symbols across user and system paths. +/// +/// Supports symbol enumeration and filtering via SymbolEnumerator, and tracks +/// symbol resolution results through SymbolQuery. Thread-safe and uses +/// LibraryScanHelper for efficient path resolution and caching. +class LibraryResolver { + friend class LibraryResolutionDriver; + +public: + class SymbolEnumerator { + public: + enum class EnumerateResult { Continue, Stop, Error }; + + using OnEachSymbolFn = std::function; + + static bool enumerateSymbols(StringRef Path, OnEachSymbolFn OnEach, + const SymbolEnumeratorOptions &Opts); + }; + + /// Tracks a set of symbols and the libraries where they are resolved. + /// + /// SymbolQuery is used to keep track of which symbols have been resolved + /// to which libraries. It supports concurrent read/write access using a + /// shared mutex, allowing multiple readers or a single writer at a time. + class SymbolQuery { + public: + /// Holds the result for a single symbol. 
+ struct Result { + std::string Name; + std::string ResolvedLibPath; + }; + + private: + mutable std::shared_mutex Mtx; + StringMap Results; + std::atomic ResolvedCount = 0; + + public: + explicit SymbolQuery(const std::vector &Symbols) { + for (const auto &s : Symbols) { + if (!Results.contains(s)) + Results.insert({s, Result{s, ""}}); + } + } + + SmallVector getUnresolvedSymbols() const { + SmallVector Unresolved; + std::shared_lock Lock(Mtx); + for (const auto &[name, res] : Results) { + if (res.ResolvedLibPath.empty()) + Unresolved.push_back(name); + } + return Unresolved; + } + + void resolve(StringRef Sym, const std::string &LibPath) { + std::unique_lock Lock(Mtx); + auto It = Results.find(Sym); + if (It != Results.end() && It->second.ResolvedLibPath.empty()) { + It->second.ResolvedLibPath = LibPath; + ResolvedCount.fetch_add(1, std::memory_order_relaxed); + } + } + + bool allResolved() const { + return ResolvedCount.load(std::memory_order_relaxed) == Results.size(); + } + + bool hasUnresolved() const { + return ResolvedCount.load(std::memory_order_relaxed) < Results.size(); + } + + std::optional getResolvedLib(StringRef Sym) const { + std::shared_lock Lock(Mtx); + auto It = Results.find(Sym); + if (It != Results.end() && !It->second.ResolvedLibPath.empty()) + return StringRef(It->second.ResolvedLibPath); + return std::nullopt; + } + + bool isResolved(StringRef Sym) const { + std::shared_lock Lock(Mtx); + auto It = Results.find(Sym.str()); + return It != Results.end() && !It->second.ResolvedLibPath.empty(); + } + + std::vector getAllResults() const { + std::shared_lock Lock(Mtx); + std::vector Out; + Out.reserve(Results.size()); + for (const auto &[_, res] : Results) + Out.push_back(&res); + return Out; + } + }; + + struct Setup { + std::vector BasePaths; + std::shared_ptr Cache; + std::shared_ptr PResolver; + + size_t ScanBatchSize = 0; + + LibraryScanner::ShouldScanFn ShouldScanCall = [](StringRef) { + return true; + }; + + BloomFilterBuilder FilterBuilder 
= BloomFilterBuilder(); + + static Setup + create(std::vector BasePaths, + std::shared_ptr existingCache = nullptr, + std::shared_ptr existingResolver = nullptr, + LibraryScanner::ShouldScanFn customShouldScan = nullptr) { + Setup S; + S.BasePaths = std::move(BasePaths); + + S.Cache = + existingCache ? existingCache : std::make_shared(); + + S.PResolver = existingResolver ? existingResolver + : std::make_shared(S.Cache); + + if (customShouldScan) + S.ShouldScanCall = std::move(customShouldScan); + + return S; + } + }; + + LibraryResolver() = delete; + explicit LibraryResolver(const Setup &S); + ~LibraryResolver() = default; + + using OnSearchComplete = unique_function; + + void dump() { + int i = 0; + LibMgr.forEachLibrary([&](const LibraryInfo &Lib) -> bool { + dbgs() << ++i << ". Library Path : " << Lib.getFullPath() << " -> \n\t\t:" + << " ({Type : (" + << (Lib.getKind() == PathType::User ? "User" : "System") + << ") }, { State : " + << (Lib.getState() == LibraryManager::LibState::Loaded + ? 
"Loaded" + : "Unloaded") + << "})\n"; + return true; + }); + } + + void searchSymbolsInLibraries(std::vector &SymList, + OnSearchComplete OnComplete, + const SearchConfig &Config = SearchConfig()); + +private: + bool scanLibrariesIfNeeded(PathType K, size_t BatchSize = 0); + void resolveSymbolsInLibrary(LibraryInfo &Lib, SymbolQuery &Q, + const SymbolEnumeratorOptions &Opts); + bool + symbolExistsInLibrary(const LibraryInfo &Lib, StringRef Sym, + std::vector *MatchedSymbols = nullptr); + + bool symbolExistsInLibrary(const LibraryInfo &Lib, StringRef SymName, + std::vector *AllSymbols, + const SymbolEnumeratorOptions &Opts); + + std::shared_ptr LibPathCache; + std::shared_ptr LibPathResolver; + LibraryScanHelper ScanHelper; + BloomFilterBuilder FB; + LibraryManager LibMgr; + LibraryScanner::ShouldScanFn ShouldScanCall; + size_t scanBatchSize; +}; + +using SymbolEnumerator = LibraryResolver::SymbolEnumerator; +using SymbolQuery = LibraryResolver::SymbolQuery; +using EnumerateResult = SymbolEnumerator::EnumerateResult; + +class LibraryResolutionDriver { +public: + static std::unique_ptr + create(const LibraryResolver::Setup &S); + + void addScanPath(const std::string &Path, PathType Kind); + bool markLibraryLoaded(StringRef Path); + bool markLibraryUnLoaded(StringRef Path); + bool isLibraryLoaded(StringRef Path) const { + return LR->LibMgr.isLoaded(Path); + } + + void resetAll() { + LR->LibMgr.clear(); + LR->ScanHelper.resetToScan(); + LR->LibPathCache->clear(); + } + + void scanAll(size_t BatchSize = 0) { + LR->scanLibrariesIfNeeded(PathType::User, BatchSize); + LR->scanLibrariesIfNeeded(PathType::System, BatchSize); + } + + void scan(PathType PK, size_t BatchSize = 0) { + LR->scanLibrariesIfNeeded(PK, BatchSize); + } + + void resolveSymbols(std::vector Symbols, + LibraryResolver::OnSearchComplete OnCompletion, + const SearchConfig &Config = SearchConfig()); + + ~LibraryResolutionDriver() = default; + +private: + LibraryResolutionDriver(std::unique_ptr L) + : 
LR(std::move(L)) {} + + std::unique_ptr LR; +}; + +} // end namespace orc +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h new file mode 100644 index 0000000000000..d1c201306bf54 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h @@ -0,0 +1,474 @@ +//===- LibraryScanner.h - Scanner for Shared Libraries ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides functionality for scanning dynamic (shared) libraries. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H +#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H + +#include "llvm/ADT/FunctionExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/StringSaver.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace llvm { +namespace orc { + +class LibraryManager; + +class LibraryPathCache { + friend class PathResolver; + +public: + LibraryPathCache() = default; + + void clear(bool isRealPathCache = false) { + std::unique_lock lock(Mtx); + Seen.clear(); + if (isRealPathCache) { + RealPathCache.clear(); +#ifndef _WIN32 + ReadlinkCache.clear(); + LstatCache.clear(); +#endif + } + } + + void markSeen(const std::string &CanonPath) { + std::unique_lock lock(Mtx); + 
Seen.insert(CanonPath); + } + + bool hasSeen(StringRef CanonPath) const { + std::shared_lock lock(Mtx); + return Seen.contains(CanonPath); + } + + bool hasSeenOrMark(StringRef CanonPath) { + std::string s = CanonPath.str(); + { + std::shared_lock lock(Mtx); + if (Seen.contains(s)) + return true; + } + { + std::unique_lock lock(Mtx); + Seen.insert(s); + } + return false; + } + +private: + mutable std::shared_mutex Mtx; + + struct PathInfo { + std::string canonicalPath; + std::error_code ErrnoCode; + }; + + void insert_realpath(StringRef Path, const PathInfo &Info) { + std::unique_lock lock(Mtx); + RealPathCache.insert({Path, Info}); + } + + std::optional read_realpath(StringRef Path) const { + std::shared_lock lock(Mtx); + auto It = RealPathCache.find(Path); + if (It != RealPathCache.end()) + return It->second; + + return std::nullopt; + } + + StringSet<> Seen; + StringMap RealPathCache; + +#ifndef _WIN32 + StringMap ReadlinkCache; + StringMap LstatCache; + + void insert_link(StringRef Path, const std::string &s) { + std::unique_lock lock(Mtx); + ReadlinkCache.insert({Path, s}); + } + + std::optional read_link(StringRef Path) const { + std::shared_lock lock(Mtx); + auto It = ReadlinkCache.find(Path); + if (It != ReadlinkCache.end()) + return It->second; + + return std::nullopt; + } + + void insert_lstat(StringRef Path, mode_t m) { + std::unique_lock lock(Mtx); + LstatCache.insert({Path, m}); + } + + std::optional read_lstat(StringRef Path) const { + std::shared_lock lock(Mtx); + auto It = LstatCache.find(Path); + if (It != LstatCache.end()) + return It->second; + + return std::nullopt; + } + +#endif +}; + +/// Resolves file system paths with optional caching of results. +/// +/// Supports lstat, readlink, and realpath operations. Can resolve paths +/// relative to a base and handle symbolic links. Caches results to reduce +/// repeated system calls when enabled. 
+class PathResolver { +private: + std::shared_ptr LibPathCache; + +public: + PathResolver(std::shared_ptr cache) + : LibPathCache(std::move(cache)) {} + + std::optional resolve(StringRef Path, std::error_code &ec) { + return realpathCached(Path, ec); + } +#ifndef _WIN32 + mode_t lstatCached(StringRef Path); + std::optional readlinkCached(StringRef Path); +#endif + std::optional realpathCached(StringRef Path, std::error_code &ec, + StringRef base = "", + bool baseIsResolved = false, + long symloopLevel = 40); +}; + +/// Performs placeholder substitution in dynamic library paths. +/// +/// Configures known placeholders (like @loader_path) and replaces them +/// in input paths with their resolved values. +class DylibSubstitutor { +public: + void configure(StringRef loaderPath); + + std::string substitute(StringRef input) const { + for (const auto &[ph, value] : Placeholders) { + if (input.starts_with_insensitive(ph)) + return (Twine(value) + input.drop_front(ph.size())).str(); + } + return input.str(); + } + +private: + StringMap Placeholders; +}; + +/// Validates and normalizes dynamic library paths. +/// +/// Uses a `PathResolver` to resolve paths to their canonical form and +/// checks whether they point to valid shared libraries. +class DylibPathValidator { +public: + DylibPathValidator(PathResolver &PR) : LibPathResolver(PR) {} + + static bool isSharedLibrary(StringRef Path); + + std::optional normalize(StringRef Path) const { + std::error_code ec; + auto real = LibPathResolver.resolve(Path, ec); + if (!real || ec) + return std::nullopt; + + return real; + } + + /// Validate the given path as a shared library. 
+ std::optional validate(StringRef Path) const { + auto realOpt = normalize(Path); + if (!realOpt) + return std::nullopt; + + if (!isSharedLibrary(*realOpt)) + return std::nullopt; + + return realOpt; + } + +private: + PathResolver &LibPathResolver; +}; + +enum class SearchPathType { + RPath, + UsrOrSys, + RunPath, +}; + +struct SearchPathConfig { + ArrayRef Paths; + SearchPathType type; +}; + +class SearchPathResolver { +public: + SearchPathResolver(const SearchPathConfig &Cfg, + StringRef PlaceholderPrefix = "") + : Kind(Cfg.type), PlaceholderPrefix(PlaceholderPrefix) { + for (auto &path : Cfg.Paths) + Paths.emplace_back(path.str()); + } + + std::optional resolve(StringRef libStem, + const DylibSubstitutor &Subst, + DylibPathValidator &Validator) const; + SearchPathType searchPathType() const { return Kind; } + +private: + std::vector Paths; + SearchPathType Kind; + std::string PlaceholderPrefix; +}; + +class DylibResolverImpl { +public: + DylibResolverImpl(DylibSubstitutor Substitutor, DylibPathValidator &Validator, + std::vector Resolvers) + : Substitutor(std::move(Substitutor)), Validator(Validator), + Resolvers(std::move(Resolvers)) {} + + std::optional resolve(StringRef Stem, + bool VariateLibStem = false) const; + +private: + std::optional tryWithExtensions(StringRef libstem) const; + + DylibSubstitutor Substitutor; + DylibPathValidator &Validator; + std::vector Resolvers; +}; + +class DylibResolver { +public: + DylibResolver(DylibPathValidator &Validator) : Validator(Validator) {} + + void configure(StringRef loaderPath, + ArrayRef SearchPathCfg) { + DylibSubstitutor Substitutor; + Substitutor.configure(loaderPath); + + std::vector Resolvers; + for (const auto &cfg : SearchPathCfg) { + Resolvers.emplace_back(cfg, + cfg.type == SearchPathType::RPath ? 
"@rpath" : ""); + } + + impl_ = std::make_unique( + std::move(Substitutor), Validator, std::move(Resolvers)); + } + + std::optional resolve(StringRef libStem, + bool VariateLibStem = false) const { + if (!impl_) + return std::nullopt; + return impl_->resolve(libStem, VariateLibStem); + } + + static std::string resolvelinkerFlag(StringRef libStem, + StringRef loaderPath) { + DylibSubstitutor Substitutor; + Substitutor.configure(loaderPath); + return Substitutor.substitute(libStem); + } + +private: + DylibPathValidator &Validator; + std::unique_ptr impl_; +}; + +enum class PathType : uint8_t { User, System, Unknown }; + +enum class ScanState : uint8_t { NotScanned, Scanning, Scanned }; + +struct LibrarySearchPath { + std::string BasePath; // Canonical base directory path + PathType Kind; // User or System + std::atomic State; + + LibrarySearchPath(std::string Base, PathType K) + : BasePath(std::move(Base)), Kind(K), State(ScanState::NotScanned) {} +}; + +/// Scans and tracks libraries for symbol resolution. +/// +/// Maintains a list of library paths to scan, caches scanned units, +/// and resolves paths canonically for consistent tracking. 
+class LibraryScanHelper { +public: + explicit LibraryScanHelper(const std::vector &SPaths, + std::shared_ptr LibPathCache, + std::shared_ptr LibPathResolver) + : LibPathCache(std::move(LibPathCache)), + LibPathResolver(std::move(LibPathResolver)) { + DEBUG_WITH_TYPE( + "orc", dbgs() << "LibraryScanHelper::LibraryScanHelper: base paths : " + << SPaths.size() << "\n";); + for (const auto &p : SPaths) + addBasePath(p); + } + + void + addBasePath(const std::string &P, + PathType Kind = + PathType::Unknown); // Add a canonical directory for scanning + std::vector> + getNextBatch(PathType Kind, size_t batchSize); + + bool leftToScan(PathType K) const; + void resetToScan(); + + bool isTrackedBasePath(StringRef P) const; + std::vector> getAllUnits() const; + + SmallVector getSearchPaths() const { + SmallVector SearchPaths; + for (const auto &[_, SP] : LibSearchPaths) + SearchPaths.push_back(SP->BasePath); + return SearchPaths; + } + + PathResolver &getPathResolver() const { return *LibPathResolver; } + + LibraryPathCache &getCache() const { return *LibPathCache; } + + bool hasSeenOrMark(StringRef P) const { + return LibPathCache->hasSeenOrMark(P); + } + + std::optional resolve(StringRef P, std::error_code &ec) const { + return LibPathResolver->resolve(P.str(), ec); + } + +private: + std::string resolveCanonical(StringRef P, std::error_code &ec) const; + PathType classifyKind(StringRef P) const; + + mutable std::shared_mutex Mtx; + std::shared_ptr LibPathCache; + std::shared_ptr LibPathResolver; + + StringMap> + LibSearchPaths; // key: canonical path + std::deque UnscannedUsr; + std::deque UnscannedSys; +}; + +/// Loads an object file and provides access to it. +/// +/// Owns the underlying `ObjectFile` and ensures it is valid. +/// Any errors encountered during construction are stored and +/// returned when attempting to access the file. +class ObjectFileLoader { +public: + /// Construct an object file loader from the given path. 
+ explicit ObjectFileLoader(StringRef Path) { + auto ObjOrErr = loadObjectFileWithOwnership(Path); + if (ObjOrErr) + Obj = std::move(*ObjOrErr); + else { + consumeError(std::move(Err)); + Err = ObjOrErr.takeError(); + } + } + + ObjectFileLoader(const ObjectFileLoader &) = delete; + ObjectFileLoader &operator=(const ObjectFileLoader &) = delete; + + ObjectFileLoader(ObjectFileLoader &&) = default; + ObjectFileLoader &operator=(ObjectFileLoader &&) = default; + + /// Get the loaded object file, or return an error if loading failed. + Expected getObjectFile() { + if (Err) + return std::move(Err); + return *Obj.getBinary(); + } + + static bool isArchitectureCompatible(const object::ObjectFile &Obj); + +private: + object::OwningBinary Obj; + Error Err = Error::success(); + + static Expected> + loadObjectFileWithOwnership(StringRef FilePath); +}; + +/// Scans libraries, resolves dependencies, and registers them. +class LibraryScanner { +public: + using ShouldScanFn = std::function; + + LibraryScanner( + LibraryScanHelper &H, LibraryManager &LibMgr, + ShouldScanFn ShouldScanCall = [](StringRef path) { return true; }) + : ScanHelper(H), LibMgr(LibMgr), + ShouldScanCall(std::move(ShouldScanCall)) {} + + void scanNext(PathType Kind, size_t batchSize = 1); + + /// Dependency info for a library. 
+ struct LibraryDepsInfo { + llvm::BumpPtrAllocator Alloc; + llvm::StringSaver Saver{Alloc}; + + SmallVector rpath; + SmallVector runPath; + SmallVector deps; + bool isPIE = false; + + void addRPath(StringRef s) { rpath.push_back(Saver.save(s)); } + + void addRunPath(StringRef s) { runPath.push_back(Saver.save(s)); } + + void addDep(StringRef s) { deps.push_back(Saver.save(s)); } + }; + +private: + LibraryScanHelper &ScanHelper; + LibraryManager &LibMgr; + ShouldScanFn ShouldScanCall; + + std::optional shouldScan(StringRef FilePath); + Expected extractDeps(StringRef FilePath); + + void handleLibrary(StringRef P, PathType K, int level = 1); + + void scanBaseDir(std::shared_ptr U); +}; + +using LibraryDepsInfo = LibraryScanner::LibraryDepsInfo; + +} // end namespace orc +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt index 927558649eb4d..ca8192bb99492 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt @@ -16,9 +16,11 @@ add_llvm_component_library(LLVMOrcTargetProcess ExecutorSharedMemoryMapperService.cpp DefaultHostBootstrapValues.cpp ExecutorResolver.cpp + LibraryResolver.cpp JITLoaderGDB.cpp JITLoaderPerf.cpp JITLoaderVTune.cpp + LibraryScanner.cpp OrcRTBootstrap.cpp RegisterEHFrames.cpp SimpleExecutorDylibManager.cpp @@ -36,6 +38,8 @@ add_llvm_component_library(LLVMOrcTargetProcess LINK_COMPONENTS ${intel_jit_profiling} + BinaryFormat + Object OrcShared Support TargetParser diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp new file mode 100644 index 0000000000000..35da82a10306a --- /dev/null +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp @@ -0,0 +1,370 @@ +//===- LibraryResolver.cpp - Library 
Resolution of Unresolved Symbols ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Library resolution impl for unresolved symbols +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h" +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h" + +#include "llvm/ADT/StringSet.h" + +#include "llvm/BinaryFormat/MachO.h" +#include "llvm/Object/COFF.h" +#include "llvm/Object/ELF.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/MachO.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Error.h" + +#include +#include + +#define DEBUG_TYPE "orc-resolver" + +namespace llvm::orc { + +LibraryResolver::LibraryResolver(const LibraryResolver::Setup &S) + : LibPathCache(S.Cache ? S.Cache : std::make_shared()), + LibPathResolver(S.PResolver + ? S.PResolver + : std::make_shared(LibPathCache)), + ScanHelper(S.BasePaths, LibPathCache, LibPathResolver), + FB(S.FilterBuilder), LibMgr(), + ShouldScanCall(S.ShouldScanCall ? 
S.ShouldScanCall + : [](StringRef) -> bool { return true; }), + scanBatchSize(S.ScanBatchSize) { + + if (ScanHelper.getAllUnits().empty()) { + LLVM_DEBUG(dbgs() << "Warning: No base paths provided for scanning.\n"); + } +} + +std::unique_ptr +LibraryResolutionDriver::create(const LibraryResolver::Setup &S) { + auto LR = std::make_unique(S); + return std::unique_ptr( + new LibraryResolutionDriver(std::move(LR))); +} + +void LibraryResolutionDriver::addScanPath(const std::string &Path, PathType K) { + LR->ScanHelper.addBasePath(Path, K); +} + +bool LibraryResolutionDriver::markLibraryLoaded(StringRef Path) { + auto Lib = LR->LibMgr.getLibrary(Path); + if (!Lib) + return false; + + Lib->setState(LibraryManager::LibState::Loaded); + + return true; +} + +bool LibraryResolutionDriver::markLibraryUnLoaded(StringRef Path) { + auto Lib = LR->LibMgr.getLibrary(Path); + if (!Lib) + return false; + + Lib->setState(LibraryManager::LibState::Unloaded); + + return true; +} + +void LibraryResolutionDriver::resolveSymbols( + std::vector Syms, + LibraryResolver::OnSearchComplete OnCompletion, + const SearchConfig &Config) { + LR->searchSymbolsInLibraries(Syms, std::move(OnCompletion), Config); +} + +static bool shouldIgnoreSymbol(const object::SymbolRef &Sym, + uint32_t IgnoreFlags) { + Expected FlagsOrErr = Sym.getFlags(); + if (!FlagsOrErr) { + consumeError(FlagsOrErr.takeError()); + return true; + } + + uint32_t Flags = *FlagsOrErr; + + using Filter = SymbolEnumeratorOptions; + if ((IgnoreFlags & Filter::IgnoreUndefined) && + (Flags & object::SymbolRef::SF_Undefined)) + return true; + if ((IgnoreFlags & Filter::IgnoreIndirect) && + (Flags & object::SymbolRef::SF_Indirect)) + return true; + if ((IgnoreFlags & Filter::IgnoreWeak) && + (Flags & object::SymbolRef::SF_Weak)) + return true; + + return false; +} + +bool SymbolEnumerator::enumerateSymbols(StringRef Path, OnEachSymbolFn OnEach, + const SymbolEnumeratorOptions &Opts) { + if (Path.empty()) + return false; + + 
ObjectFileLoader ObjLoader(Path); + + auto ObjOrErr = ObjLoader.getObjectFile(); + if (!ObjOrErr) { + std::string ErrMsg; + handleAllErrors(ObjOrErr.takeError(), + [&](const ErrorInfoBase &EIB) { ErrMsg = EIB.message(); }); + LLVM_DEBUG(dbgs() << "Failed loading object file: " << Path + << "\nError: " << ErrMsg << "\n"); + return false; + } + + object::ObjectFile *Obj = &ObjOrErr.get(); + + auto processSymbolRange = + [&](object::ObjectFile::symbol_iterator_range Range) -> EnumerateResult { + for (const auto &Sym : Range) { + if (shouldIgnoreSymbol(Sym, Opts.FilterFlags)) + continue; + + auto NameOrErr = Sym.getName(); + if (!NameOrErr) { + consumeError(NameOrErr.takeError()); + continue; + } + + StringRef Name = *NameOrErr; + if (Name.empty()) + continue; + + EnumerateResult Res = OnEach(Name); + if (Res != EnumerateResult::Continue) + return Res; + } + return EnumerateResult::Continue; + }; + + EnumerateResult Res = processSymbolRange(Obj->symbols()); + if (Res != EnumerateResult::Continue) + return Res == EnumerateResult::Stop; + + if (Obj->isELF()) { + const auto *ElfObj = cast(Obj); + Res = processSymbolRange(ElfObj->getDynamicSymbolIterators()); + if (Res != EnumerateResult::Continue) + return Res == EnumerateResult::Stop; + } else if (Obj->isCOFF()) { + const auto *CoffObj = cast(Obj); + for (auto I = CoffObj->export_directory_begin(), + E = CoffObj->export_directory_end(); + I != E; ++I) { + StringRef Name; + if (I->getSymbolName(Name)) + continue; + if (Name.empty()) + continue; + + EnumerateResult Res = OnEach(Name); + if (Res != EnumerateResult::Continue) + return Res == EnumerateResult::Stop; + } + } else if (Obj->isMachO()) { + } + + return true; +} + +class SymbolSearchContext { +public: + SymbolSearchContext(SymbolQuery &Q) : Q(Q) {} + + bool hasSearched(LibraryInfo *Lib) const { return Searched.count(Lib); } + + void markSearched(LibraryInfo *Lib) { Searched.insert(Lib); } + + inline bool allResolved() const { return Q.allResolved(); } + + 
SymbolQuery &query() { return Q; } + +private: + SymbolQuery &Q; + DenseSet Searched; +}; + +void LibraryResolver::resolveSymbolsInLibrary( + LibraryInfo &Lib, SymbolQuery &UnresolvedSymbols, + const SymbolEnumeratorOptions &Opts) { + LLVM_DEBUG(dbgs() << "Checking unresolved symbols " + << " in library : " << Lib.getFileName() << "\n";); + StringSet<> DiscoveredSymbols; + + if (!UnresolvedSymbols.hasUnresolved()) { + LLVM_DEBUG(dbgs() << "Skipping library: " << Lib.getFullPath() + << " — unresolved symbols exist.\n";); + return; + } + + bool HasEnumerated = false; + auto enumerateSymbolsIfNeeded = [&]() { + if (HasEnumerated) + return; + + HasEnumerated = true; + + LLVM_DEBUG(dbgs() << "Enumerating symbols in library: " << Lib.getFullPath() + << "\n";); + SymbolEnumerator::enumerateSymbols( + Lib.getFullPath(), + [&](StringRef sym) { + DiscoveredSymbols.insert(sym); + return EnumerateResult::Continue; + }, + Opts); + + if (DiscoveredSymbols.empty()) { + LLVM_DEBUG(dbgs() << " No symbols and remove library : " + << Lib.getFullPath() << "\n";); + LibMgr.removeLibrary(Lib.getFullPath()); + return; + } + }; + + if (!Lib.hasFilter()) { + LLVM_DEBUG(dbgs() << "Building filter for library: " << Lib.getFullPath() + << "\n";); + enumerateSymbolsIfNeeded(); + SmallVector SymbolVec; + SymbolVec.reserve(DiscoveredSymbols.size()); + for (const auto &KV : DiscoveredSymbols) + SymbolVec.push_back(KV.first()); + + Lib.ensureFilterBuilt(FB, SymbolVec); + LLVM_DEBUG({ + dbgs() << "DiscoveredSymbols : " << DiscoveredSymbols.size() << "\n"; + for (const auto &KV : DiscoveredSymbols) + dbgs() << "DiscoveredSymbols : " << KV.first() << "\n"; + }); + } + + const auto &Unresolved = UnresolvedSymbols.getUnresolvedSymbols(); + bool HadAnySym = false; + LLVM_DEBUG(dbgs() << "Total unresolved symbols : " << Unresolved.size() + << "\n";); + for (const auto &Sym : Unresolved) { + if (Lib.mayContain(Sym)) { + LLVM_DEBUG(dbgs() << "Checking symbol '" << Sym + << "' in library: " << 
Lib.getFullPath() << "\n";); + enumerateSymbolsIfNeeded(); + if (DiscoveredSymbols.count(Sym) > 0) { + LLVM_DEBUG(dbgs() << " Resolved symbol: " << Sym + << " in library: " << Lib.getFullPath() << "\n";); + UnresolvedSymbols.resolve(Sym, Lib.getFullPath()); + HadAnySym = true; + } + } + } + + using LibraryState = LibraryManager::LibState; + if (HadAnySym && Lib.getState() != LibraryState::Loaded) + Lib.setState(LibraryState::Queried); +} + +void LibraryResolver::searchSymbolsInLibraries( + std::vector &SymbolList, OnSearchComplete OnComplete, + const SearchConfig &Config) { + SymbolQuery Q(SymbolList); + + using LibraryState = LibraryManager::LibState; + using LibraryType = PathType; + auto tryResolveFrom = [&](LibraryState S, LibraryType K) { + LLVM_DEBUG(dbgs() << "Trying resolve from state=" << static_cast(S) + << " type=" << static_cast(K) << "\n";); + + SymbolSearchContext Ctx(Q); + while (!Ctx.allResolved()) { + + for (auto &Lib : LibMgr.getView(S, K)) { + if (Ctx.hasSearched(Lib.get())) + continue; + + // can use Async here? + resolveSymbolsInLibrary(*Lib, Ctx.query(), Config.Options); + Ctx.markSearched(Lib.get()); + + if (Ctx.allResolved()) + return; + } + + if (Ctx.allResolved()) + return; + + if (!scanLibrariesIfNeeded(K, scanBatchSize)) + break; // no more new libs to scan + } + }; + + for (const auto &[St, Ty] : Config.Policy.Plan) { + tryResolveFrom(St, Ty); + if (Q.allResolved()) + break; + } + + // done: + LLVM_DEBUG({ + dbgs() << "Search complete.\n"; + for (const auto &r : Q.getAllResults()) + dbgs() << "Resolved Symbol:" << r->Name << " -> " << r->ResolvedLibPath + << "\n"; + }); + + OnComplete(Q); +} + +bool LibraryResolver::scanLibrariesIfNeeded(PathType PK, size_t BatchSize) { + LLVM_DEBUG(dbgs() << "LibraryResolver::scanLibrariesIfNeeded: Scanning for " + << (PK == PathType::User ? 
"User" : "System") + << " libraries\n";); + if (!ScanHelper.leftToScan(PK)) + return false; + + LibraryScanner Scanner(ScanHelper, LibMgr, ShouldScanCall); + Scanner.scanNext(PK, BatchSize); + return true; +} + +bool LibraryResolver::symbolExistsInLibrary(const LibraryInfo &Lib, + StringRef SymName, + std::vector *AllSyms) { + SymbolEnumeratorOptions Opts; + return symbolExistsInLibrary(Lib, SymName, AllSyms, Opts); +} + +bool LibraryResolver::symbolExistsInLibrary( + const LibraryInfo &Lib, StringRef SymName, + std::vector *AllSyms, const SymbolEnumeratorOptions &Opts) { + bool Found = false; + + SymbolEnumerator::enumerateSymbols( + Lib.getFullPath(), + [&](StringRef Sym) { + if (AllSyms) + AllSyms->emplace_back(Sym.str()); + + if (Sym == SymName) { + Found = true; + } + + return EnumerateResult::Continue; + }, + Opts); + + return Found; +} + +} // end namespace llvm::orc diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp new file mode 100644 index 0000000000000..d93f68622fcc2 --- /dev/null +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp @@ -0,0 +1,1161 @@ +//===- LibraryScanner.cpp - Provide Library Scanning Implementation ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h" +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h" + +#include "llvm/ADT/StringExtras.h" +#include "llvm/Object/COFF.h" +#include "llvm/Object/ELF.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/ELFTypes.h" +#include "llvm/Object/MachO.h" +#include "llvm/Object/MachOUniversal.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/Triple.h" + +#ifdef LLVM_ON_UNIX +#include +#include +#endif // LLVM_ON_UNIX + +#ifdef __APPLE__ +#include +#undef LC_LOAD_DYLIB +#undef LC_RPATH +#endif // __APPLE__ + +#define DEBUG_TYPE "orc-scanner" + +namespace llvm::orc { + +void handleError(Error Err, StringRef context = "") { + consumeError(handleErrors(std::move(Err), [&](const ErrorInfoBase &EIB) { + dbgs() << "LLVM Error"; + if (!context.empty()) + dbgs() << " [" << context << "]"; + dbgs() << ": " << EIB.message() << "\n"; + })); +} + +bool ObjectFileLoader::isArchitectureCompatible(const object::ObjectFile &Obj) { + Triple HostTriple(sys::getDefaultTargetTriple()); + Triple ObjTriple = Obj.makeTriple(); + + LLVM_DEBUG({ + dbgs() << "Host triple: " << HostTriple.str() + << ", Object triple: " << ObjTriple.str() << "\n"; + }); + + if (ObjTriple.getArch() != Triple::UnknownArch && + HostTriple.getArch() != ObjTriple.getArch()) + return false; + + if (ObjTriple.getOS() != Triple::UnknownOS && + HostTriple.getOS() != ObjTriple.getOS()) + return false; + + if (ObjTriple.getEnvironment() != Triple::UnknownEnvironment && + HostTriple.getEnvironment() != Triple::UnknownEnvironment && + HostTriple.getEnvironment() 
!= ObjTriple.getEnvironment()) + return false; + + return true; +} + +Expected> +ObjectFileLoader::loadObjectFileWithOwnership(StringRef FilePath) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Attempting to open file " << FilePath + << "\n";); + auto BinOrErr = object::createBinary(FilePath); + if (!BinOrErr) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Failed to open file " << FilePath + << "\n";); + return BinOrErr.takeError(); + } + + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Successfully opened file " << FilePath + << "\n";); + + auto OwningBin = BinOrErr->takeBinary(); + object::Binary *Bin = OwningBin.first.get(); + + if (Bin->isArchive()) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: File is an archive, not supported: " + << FilePath << "\n";); + return createStringError(std::errc::invalid_argument, + "Archive files are not supported: %s", + FilePath.str().c_str()); + } + +#if defined(__APPLE__) + if (auto *UB = dyn_cast(Bin)) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Detected Mach-O universal binary: " + << FilePath << "\n";); + for (auto ObjForArch : UB->objects()) { + auto ObjOrErr = ObjForArch.getAsObjectFile(); + if (!ObjOrErr) { + LLVM_DEBUG( + dbgs() + << "ObjectFileLoader: Skipping invalid architecture slice\n";); + + consumeError(ObjOrErr.takeError()); + continue; + } + + std::unique_ptr Obj = std::move(ObjOrErr.get()); + if (isArchitectureCompatible(*Obj)) { + LLVM_DEBUG( + dbgs() << "ObjectFileLoader: Found compatible object slice\n";); + + return object::OwningBinary( + std::move(Obj), std::move(OwningBin.second)); + + } else { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Incompatible architecture " + "slice skipped\n";); + } + } + LLVM_DEBUG(dbgs() << "ObjectFileLoader: No compatible slices found in " + "universal binary\n";); + return createStringError(inconvertibleErrorCode(), + "No compatible object found in fat binary: %s", + FilePath.str().c_str()); + } +#endif + + auto ObjOrErr = + object::ObjectFile::createObjectFile(Bin->getMemoryBufferRef()); + 
if (!ObjOrErr) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Failed to create object file\n";); + return ObjOrErr.takeError(); + } + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Detected object file\n";); + + std::unique_ptr Obj = std::move(*ObjOrErr); + if (!isArchitectureCompatible(*Obj)) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Incompatible architecture: " + << FilePath << "\n";); + return createStringError(inconvertibleErrorCode(), + "Incompatible object file: %s", + FilePath.str().c_str()); + } + + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Object file is compatible\n";); + + return object::OwningBinary(std::move(Obj), + std::move(OwningBin.second)); +} + +template +bool isELFSharedLibrary(const object::ELFFile &ELFObj) { + if (ELFObj.getHeader().e_type != ELF::ET_DYN) + return false; + + auto PHOrErr = ELFObj.program_headers(); + if (!PHOrErr) { + consumeError(PHOrErr.takeError()); + return true; + } + + for (auto Phdr : *PHOrErr) { + if (Phdr.p_type == ELF::PT_INTERP) + return false; + } + + return true; +} + +bool isSharedLibraryObject(object::ObjectFile &Obj) { + if (Obj.isELF()) { + if (auto *ELF32LE = dyn_cast(&Obj)) + return isELFSharedLibrary(ELF32LE->getELFFile()); + if (auto *ELF64LE = dyn_cast(&Obj)) + return isELFSharedLibrary(ELF64LE->getELFFile()); + if (auto *ELF32BE = dyn_cast(&Obj)) + return isELFSharedLibrary(ELF32BE->getELFFile()); + if (auto *ELF64BE = dyn_cast(&Obj)) + return isELFSharedLibrary(ELF64BE->getELFFile()); + } else if (Obj.isMachO()) { + const object::MachOObjectFile *MachO = + dyn_cast(&Obj); + if (!MachO) { + LLVM_DEBUG(dbgs() << "Failed to cast to MachOObjectFile.\n";); + return false; + } + LLVM_DEBUG({ + bool Result = + MachO->getHeader().filetype == MachO::HeaderFileType::MH_DYLIB; + dbgs() << "Mach-O filetype: " << MachO->getHeader().filetype + << " (MH_DYLIB == " << MachO::HeaderFileType::MH_DYLIB + << "), shared: " << Result << "\n"; + }); + + return MachO->getHeader().filetype == MachO::HeaderFileType::MH_DYLIB; + } else if 
(Obj.isCOFF()) { + const object::COFFObjectFile *coff = dyn_cast(&Obj); + if (!coff) + return false; + return coff->getCharacteristics() & COFF::IMAGE_FILE_DLL; + } else { + LLVM_DEBUG(dbgs() << "Binary is not an ObjectFile.\n";); + } + + return false; +} + +bool DylibPathValidator::isSharedLibrary(StringRef Path) { + LLVM_DEBUG(dbgs() << "Checking if path is a shared library: " << Path + << "\n";); + + auto FileType = sys::fs::get_file_type(Path, /*Follow*/ true); + if (FileType != sys::fs::file_type::regular_file) { + LLVM_DEBUG(dbgs() << "File type is not a regular file for path: " << Path + << "\n";); + return false; + } + + file_magic MagicCode; + identify_magic(Path, MagicCode); + + // Skip archives. + if (MagicCode == file_magic::archive) + return false; + + // Universal binary handling. +#if defined(__APPLE__) + if (MagicCode == file_magic::macho_universal_binary) { + ObjectFileLoader ObjLoader(Path); + auto ObjOrErr = ObjLoader.getObjectFile(); + if (!ObjOrErr) { + consumeError(ObjOrErr.takeError()); + return false; + } + return isSharedLibraryObject(ObjOrErr.get()); + } +#endif + + // Object file inspection for PE/COFF, ELF, and Mach-O + bool NeedsObjectInspection = +#if defined(_WIN32) + (MagicCode == file_magic::pecoff_executable); +#elif defined(__APPLE__) + (MagicCode == file_magic::macho_fixed_virtual_memory_shared_lib || + MagicCode == file_magic::macho_dynamically_linked_shared_lib || + MagicCode == file_magic::macho_dynamically_linked_shared_lib_stub); +#elif defined(LLVM_ON_UNIX) +#ifdef __CYGWIN__ + (MagicCode == file_magic::pecoff_executable); +#else + (MagicCode == file_magic::elf_shared_object); +#endif +#else +#error "Unsupported platform." 
+#endif + + if (NeedsObjectInspection) { + ObjectFileLoader ObjLoader(Path); + auto ObjOrErr = ObjLoader.getObjectFile(); + if (!ObjOrErr) { + consumeError(ObjOrErr.takeError()); + return false; + } + return isSharedLibraryObject(ObjOrErr.get()); + } + + LLVM_DEBUG(dbgs() << "Path is not identified as a shared library: " << Path + << "\n";); + return false; +} + +void DylibSubstitutor::configure(StringRef LoaderPath) { + SmallString<512> ExecPath(sys::fs::getMainExecutable(nullptr, nullptr)); + sys::path::remove_filename(ExecPath); + + SmallString<512> LoaderDir; + if (LoaderPath.empty()) { + LoaderDir = ExecPath; + } else { + LoaderDir = LoaderPath.str(); + if (!sys::fs::is_directory(LoaderPath)) + sys::path::remove_filename(LoaderDir); + } + +#ifdef __APPLE__ + Placeholders["@loader_path"] = std::string(LoaderDir); + Placeholders["@executable_path"] = std::string(ExecPath); +#else + Placeholders["$origin"] = std::string(LoaderDir); +#endif +} + +std::optional +SearchPathResolver::resolve(StringRef Stem, const DylibSubstitutor &Subst, + DylibPathValidator &Validator) const { + for (const auto &SP : Paths) { + std::string Base = Subst.substitute(SP); + + SmallString<512> FullPath(Base); + if (!PlaceholderPrefix.empty() && + Stem.starts_with_insensitive(PlaceholderPrefix)) + FullPath.append(Stem.drop_front(PlaceholderPrefix.size())); + else + sys::path::append(FullPath, Stem); + + LLVM_DEBUG(dbgs() << "SearchPathResolver::resolve FullPath = " << FullPath + << "\n";); + + if (auto Valid = Validator.validate(FullPath.str())) + return Valid; + } + + return std::nullopt; +} + +std::optional +DylibResolverImpl::tryWithExtensions(StringRef LibStem) const { + LLVM_DEBUG(dbgs() << "tryWithExtensions: baseName = " << LibStem << "\n";); + SmallVector, 8> Candidates; + + // Add extensions by platform +#if defined(__APPLE__) + Candidates.emplace_back(LibStem); + Candidates.back() += ".dylib"; +#elif defined(_WIN32) + Candidates.emplace_back(LibStem); + Candidates.back() += 
".dll"; +#else + Candidates.emplace_back(LibStem); + Candidates.back() += ".so"; +#endif + + // Optionally try "lib" prefix if not already there + StringRef FileName = sys::path::filename(LibStem); + StringRef Base = sys::path::parent_path(LibStem); + if (!FileName.starts_with("lib")) { + SmallString<256> WithPrefix(Base); + if (!WithPrefix.empty()) + sys::path::append(WithPrefix, ""); // ensure separator if needed + WithPrefix += "lib"; + WithPrefix += FileName; + +#if defined(__APPLE__) + WithPrefix += ".dylib"; +#elif defined(_WIN32) + WithPrefix += ".dll"; +#else + WithPrefix += ".so"; +#endif + + Candidates.push_back(std::move(WithPrefix)); + } + + LLVM_DEBUG({ + dbgs() << " Candidates to try:\n"; + for (const auto &C : Candidates) + dbgs() << " " << C << "\n"; + }); + + // Try all variants using tryAllPaths + for (const auto &Name : Candidates) { + + LLVM_DEBUG(dbgs() << " Trying candidate: " << Name << "\n";); + + for (const auto &R : Resolvers) { + if (auto Res = R.resolve(Name, Substitutor, Validator)) + return Res; + } + } + + LLVM_DEBUG(dbgs() << " -> No candidate Resolved.\n";); + + return std::nullopt; +} + +std::optional +DylibResolverImpl::resolve(StringRef LibStem, bool VariateLibStem) const { + LLVM_DEBUG(dbgs() << "Resolving library stem: " << LibStem << "\n";); + + // If it is an absolute path, don't try iterate over the paths. + if (sys::path::is_absolute(LibStem)) { + LLVM_DEBUG(dbgs() << " -> Absolute path detected.\n";); + return Validator.validate(LibStem); + } + + if (!LibStem.starts_with_insensitive("@rpath")) { + if (auto norm = Validator.validate(Substitutor.substitute(LibStem))) { + LLVM_DEBUG(dbgs() << " -> Resolved after substitution: " << *norm + << "\n";); + + return norm; + } + } + + for (const auto &R : Resolvers) { + LLVM_DEBUG(dbgs() << " -> Resolving via search path ... 
\n";); + if (auto Result = R.resolve(LibStem, Substitutor, Validator)) { + LLVM_DEBUG(dbgs() << " -> Resolved via search path: " << *Result + << "\n";); + + return Result; + } + } + + // Expand libStem with paths, extensions, etc. + // std::string foundName; + if (VariateLibStem) { + LLVM_DEBUG(dbgs() << " -> Trying with extensions...\n";); + + if (auto Norm = tryWithExtensions(LibStem)) { + LLVM_DEBUG(dbgs() << " -> Resolved via tryWithExtensions: " << *Norm + << "\n";); + + return Norm; + } + } + + LLVM_DEBUG(dbgs() << " -> Could not resolve: " << LibStem << "\n";); + + return std::nullopt; +} + +#ifndef _WIN32 +mode_t PathResolver::lstatCached(StringRef Path) { + // If already cached - retun cached result + if (auto Cache = LibPathCache->read_lstat(Path)) + return *Cache; + + // Not cached: perform lstat and store + struct stat buf{}; + mode_t st_mode = (lstat(Path.str().c_str(), &buf) == -1) ? 0 : buf.st_mode; + + LibPathCache->insert_lstat(Path, st_mode); + + return st_mode; +} + +std::optional PathResolver::readlinkCached(StringRef Path) { + // If already cached - retun cached result + if (auto Cache = LibPathCache->read_link(Path)) + return Cache; + + // If result not in cache - call system function and cache result + char buf[PATH_MAX]; + ssize_t len; + if ((len = readlink(Path.str().c_str(), buf, sizeof(buf))) != -1) { + buf[len] = '\0'; + std::string s(buf); + LibPathCache->insert_link(Path, s); + return s; + } + return std::nullopt; +} + +void createComponent(StringRef Path, StringRef BasePath, bool BaseIsResolved, + SmallVector &Component) { + StringRef Separator = sys::path::get_separator(); + if (!BaseIsResolved) { + if (Path[0] == '~' && + (Path.size() == 1 || sys::path::is_separator(Path[1]))) { + static SmallString<128> HomeP; + if (HomeP.str().empty()) + sys::path::home_directory(HomeP); + StringRef(HomeP).split(Component, Separator, /*MaxSplit*/ -1, + /*KeepEmpty*/ false); + } else if (BasePath.empty()) { + static SmallString<256> CurrentPath; + 
if (CurrentPath.str().empty()) + sys::fs::current_path(CurrentPath); + StringRef(CurrentPath) + .split(Component, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false); + } else { + BasePath.split(Component, Separator, /*MaxSplit*/ -1, + /*KeepEmpty*/ false); + } + } + + Path.split(Component, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false); +} + +void normalizePathSegments(SmallVector &PathParts) { + SmallVector NormalizedPath; + for (auto &Part : PathParts) { + if (Part == ".") { + continue; + } else if (Part == "..") { + if (!NormalizedPath.empty() && NormalizedPath.back() != "..") { + NormalizedPath.pop_back(); + } else { + NormalizedPath.push_back(".."); + } + } else { + NormalizedPath.push_back(Part); + } + } + PathParts.swap(NormalizedPath); +} +#endif + +std::optional PathResolver::realpathCached(StringRef Path, + std::error_code &EC, + StringRef Base, + bool BaseIsResolved, + long SymLoopLevel) { + EC.clear(); + + if (Path.empty()) { + EC = std::make_error_code(std::errc::no_such_file_or_directory); + LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Empty path\n";); + + return std::nullopt; + } + + if (SymLoopLevel <= 0) { + EC = std::make_error_code(std::errc::too_many_symbolic_link_levels); + LLVM_DEBUG( + dbgs() << "PathResolver::realpathCached: Too many Symlink levels: " + << Path << "\n";); + + return std::nullopt; + } + + // If already cached - retun cached result + bool isRelative = sys::path::is_relative(Path); + if (!isRelative) { + if (auto Cached = LibPathCache->read_realpath(Path)) { + EC = Cached->ErrnoCode; + if (EC) { + LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Cached (error) for " + << Path << "\n";); + } else { + LLVM_DEBUG( + dbgs() << "PathResolver::realpathCached: Cached (success) for " + << Path << " => " << Cached->canonicalPath << "\n";); + } + return Cached->canonicalPath.empty() + ? 
std::nullopt + : std::make_optional(Cached->canonicalPath); + } + } + + LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Resolving path: " << Path + << "\n";); + + // If result not in cache - call system function and cache result + + StringRef Separator(sys::path::get_separator()); + SmallString<256> Resolved(Separator); +#ifndef _WIN32 + SmallVector Components; + + if (isRelative) { + if (BaseIsResolved) { + Resolved.assign(Base); + LLVM_DEBUG(dbgs() << " Using Resolved base: " << Base << "\n";); + } + createComponent(Path, Base, BaseIsResolved, Components); + } else { + Path.split(Components, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false); + } + + normalizePathSegments(Components); + LLVM_DEBUG({ + for (auto &C : Components) + dbgs() << " " << C << " "; + + dbgs() << "\n"; + }); + + // Handle path list items + for (const auto &Component : Components) { + if (Component == ".") + continue; + if (Component == "..") { + // collapse "a/b/../c" to "a/c" + size_t S = Resolved.rfind(Separator); + if (S != llvm::StringRef::npos) + Resolved.resize(S); + if (Resolved.empty()) + Resolved = Separator; + continue; + } + + size_t oldSize = Resolved.size(); + sys::path::append(Resolved, Component); + const char *ResolvedPath = Resolved.c_str(); + LLVM_DEBUG(dbgs() << " Processing Component: " << Component << " => " + << ResolvedPath << "\n";); + mode_t st_mode = lstatCached(ResolvedPath); + + if (S_ISLNK(st_mode)) { + LLVM_DEBUG(dbgs() << " Found symlink: " << ResolvedPath << "\n";); + + auto SymlinkOpt = readlinkCached(ResolvedPath); + if (!SymlinkOpt) { + EC = std::make_error_code(std::errc::no_such_file_or_directory); + LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC}); + LLVM_DEBUG(dbgs() << " Failed to read symlink: " << ResolvedPath + << "\n";); + + return std::nullopt; + } + + StringRef Symlink = *SymlinkOpt; + LLVM_DEBUG(dbgs() << " Symlink points to: " << Symlink << "\n";); + + std::string resolvedBase = ""; + if 
(sys::path::is_relative(Symlink)) { + Resolved.resize(oldSize); + resolvedBase = Resolved.str().str(); + } + + auto RealSymlink = + realpathCached(Symlink, EC, resolvedBase, + /*BaseIsResolved=*/true, SymLoopLevel - 1); + if (!RealSymlink) { + LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC}); + LLVM_DEBUG(dbgs() << " Failed to resolve symlink target: " << Symlink + << "\n";); + + return std::nullopt; + } + + Resolved.assign(*RealSymlink); + LLVM_DEBUG(dbgs() << " Symlink Resolved to: " << Resolved << "\n";); + + } else if (st_mode == 0) { + EC = std::make_error_code(std::errc::no_such_file_or_directory); + LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC}); + LLVM_DEBUG(dbgs() << " Component does not exist: " << ResolvedPath + << "\n";); + + return std::nullopt; + } + } +#else + EC = sys::fs::real_path(Path, Resolved); // Windows fallback +#endif + + std::string Canonical = Resolved.str().str(); + { + LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{ + Canonical, + std::error_code() // success + }); + } + LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Final Resolved: " << Path + << " => " << Canonical << "\n";); + return Canonical; +} + +void LibraryScanHelper::addBasePath(const std::string &Path, PathType K) { + std::error_code EC; + std::string Canon = resolveCanonical(Path, EC); + if (EC) { + LLVM_DEBUG( + dbgs() + << "LibraryScanHelper::addBasePath: Failed to canonicalize path: " + << Path << "\n";); + return; + } + std::unique_lock Lock(Mtx); + if (LibSearchPaths.count(Canon)) { + LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Already added: " + << Canon << "\n";); + return; + } + K = K == PathType::Unknown ? 
classifyKind(Canon) : K; + auto SP = std::make_shared(Canon, K); + LibSearchPaths[Canon] = SP; + + if (K == PathType::User) { + LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Added User path: " + << Canon << "\n";); + UnscannedUsr.push_back(StringRef(SP->BasePath)); + } else { + LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Added System path: " + << Canon << "\n";); + UnscannedSys.push_back(StringRef(SP->BasePath)); + } +} + +std::vector> +LibraryScanHelper::getNextBatch(PathType K, size_t BatchSize) { + std::vector> Result; + auto &Queue = (K == PathType::User) ? UnscannedUsr : UnscannedSys; + + std::unique_lock Lock(Mtx); + + while (!Queue.empty() && (BatchSize == 0 || Result.size() < BatchSize)) { + StringRef Base = Queue.front(); + auto It = LibSearchPaths.find(Base); + if (It != LibSearchPaths.end()) { + auto &SP = It->second; + ScanState Expected = ScanState::NotScanned; + if (SP->State.compare_exchange_strong(Expected, ScanState::Scanning)) { + Result.push_back(SP); + } + } + Queue.pop_front(); + } + + return Result; +} + +bool LibraryScanHelper::isTrackedBasePath(StringRef Path) const { + std::error_code EC; + std::string Canon = resolveCanonical(Path, EC); + if (EC) + return false; + + std::shared_lock Lock(Mtx); + return LibSearchPaths.count(Canon) > 0; +} + +bool LibraryScanHelper::leftToScan(PathType K) const { + std::shared_lock Lock(Mtx); + for (const auto &KV : LibSearchPaths) { + const auto &SP = KV.second; + if (SP->Kind == K && SP->State == ScanState::NotScanned) + return true; + } + return false; +} + +void LibraryScanHelper::resetToScan() { + std::shared_lock Lock(Mtx); + + for (auto &[_, SP] : LibSearchPaths) { + ScanState Expected = ScanState::Scanned; + + if (!SP->State.compare_exchange_strong(Expected, ScanState::NotScanned)) + continue; + + auto &TargetList = + (SP->Kind == PathType::User) ? 
UnscannedUsr : UnscannedSys; + TargetList.emplace_back(SP->BasePath); + } +} + +std::vector> +LibraryScanHelper::getAllUnits() const { + std::shared_lock Lock(Mtx); + std::vector> Result; + Result.reserve(LibSearchPaths.size()); + for (const auto &[_, SP] : LibSearchPaths) { + Result.push_back(SP); + } + return Result; +} + +std::string LibraryScanHelper::resolveCanonical(StringRef Path, + std::error_code &EC) const { + auto Canon = LibPathResolver->resolve(Path, EC); + return EC ? Path.str() : *Canon; +} + +PathType LibraryScanHelper::classifyKind(StringRef Path) const { + // Detect home directory + const char *Home = getenv("HOME"); + if (Home && Path.find(Home) == 0) + return PathType::User; + + static const std::array UserPrefixes = { + "/usr/local", // often used by users for manual installs + "/opt/homebrew", // common on macOS + "/opt/local", // MacPorts + "/home", // Linux home dirs + "/Users", // macOS user dirs + }; + + for (const auto &Prefix : UserPrefixes) { + if (Path.find(Prefix) == 0) + return PathType::User; + } + + return PathType::System; +} + +Expected parseMachODeps(const object::MachOObjectFile &Obj) { + LibraryDepsInfo Libdeps; + LLVM_DEBUG(dbgs() << "Parsing Mach-O dependencies...\n";); + for (const auto &Command : Obj.load_commands()) { + switch (Command.C.cmd) { + case MachO::LC_LOAD_DYLIB: { + MachO::dylib_command dylibCmd = Obj.getDylibIDLoadCommand(Command); + const char *name = Command.Ptr + dylibCmd.dylib.name; + Libdeps.addDep(name); + LLVM_DEBUG(dbgs() << " Found LC_LOAD_DYLIB: " << name << "\n";); + } break; + case MachO::LC_LOAD_WEAK_DYLIB: + case MachO::LC_REEXPORT_DYLIB: + case MachO::LC_LOAD_UPWARD_DYLIB: + case MachO::LC_LAZY_LOAD_DYLIB: + break; + case MachO::LC_RPATH: { + // Extract RPATH + MachO::rpath_command rpathCmd = Obj.getRpathCommand(Command); + const char *rpath = Command.Ptr + rpathCmd.path; + LLVM_DEBUG(dbgs() << " Found LC_RPATH: " << rpath << "\n";); + + SmallVector RawPaths; + SplitString(StringRef(rpath), 
RawPaths, + sys::EnvPathSeparator == ':' ? ":" : ";"); + + for (const auto &raw : RawPaths) { + Libdeps.addRPath(raw.str()); // Convert to std::string + LLVM_DEBUG(dbgs() << " Parsed RPATH entry: " << raw << "\n";); + } + break; + } + } + } + + return Expected(std::move(Libdeps)); +} + +template +static Expected getDynamicStrTab(const object::ELFFile &Elf) { + auto DynamicEntriesOrError = Elf.dynamicEntries(); + if (!DynamicEntriesOrError) + return DynamicEntriesOrError.takeError(); + + for (const typename ELFT::Dyn &Dyn : *DynamicEntriesOrError) { + if (Dyn.d_tag == ELF::DT_STRTAB) { + auto MappedAddrOrError = Elf.toMappedAddr(Dyn.getPtr()); + if (!MappedAddrOrError) + return MappedAddrOrError.takeError(); + return StringRef(reinterpret_cast(*MappedAddrOrError)); + } + } + + // If the dynamic segment is not present, we fall back on the sections. + auto SectionsOrError = Elf.sections(); + if (!SectionsOrError) + return SectionsOrError.takeError(); + + for (const typename ELFT::Shdr &Sec : *SectionsOrError) { + if (Sec.sh_type == ELF::SHT_DYNSYM) + return Elf.getStringTableForSymtab(Sec); + } + + return make_error("dynamic string table not found", + inconvertibleErrorCode()); +} + +template +Expected parseELF(const object::ELFFile &Elf) { + LibraryDepsInfo Deps; + Expected StrTabOrErr = getDynamicStrTab(Elf); + if (!StrTabOrErr) + return StrTabOrErr.takeError(); + + const char *Data = StrTabOrErr->data(); + + auto DynamicEntriesOrError = Elf.dynamicEntries(); + if (!DynamicEntriesOrError) { + return DynamicEntriesOrError.takeError(); + } + + for (const typename ELFT::Dyn &Dyn : *DynamicEntriesOrError) { + switch (Dyn.d_tag) { + case ELF::DT_NEEDED: + Deps.addDep(Data + Dyn.d_un.d_val); + break; + case ELF::DT_RPATH: { + SmallVector RawPaths; + SplitString(Data + Dyn.d_un.d_val, RawPaths, + sys::EnvPathSeparator == ':' ? 
":" : ";"); + for (const auto &raw : RawPaths) + Deps.addRPath(raw.str()); + break; + } + case ELF::DT_RUNPATH: { + SmallVector RawPaths; + SplitString(Data + Dyn.d_un.d_val, RawPaths, + sys::EnvPathSeparator == ':' ? ":" : ";"); + for (const auto &raw : RawPaths) + Deps.addRunPath(raw.str()); + break; + } + case ELF::DT_FLAGS_1: + // Check if this is not a pie executable. + if (Dyn.d_un.d_val & ELF::DF_1_PIE) + Deps.isPIE = true; + break; + // (Dyn.d_tag == ELF::DT_NULL) continue; + // (Dyn.d_tag == ELF::DT_AUXILIARY || Dyn.d_tag == ELF::DT_FILTER) + default: + break; + } + } + + return Expected(std::move(Deps)); +} + +Expected parseELFDeps(const object::ELFObjectFileBase &Obj) { + using namespace object; + LLVM_DEBUG(dbgs() << "parseELFDeps: Detected ELF object\n";); + if (const auto *ELF = dyn_cast(&Obj)) + return parseELF(ELF->getELFFile()); + else if (const auto *ELF = dyn_cast(&Obj)) + return parseELF(ELF->getELFFile()); + else if (const auto *ELF = dyn_cast(&Obj)) + return parseELF(ELF->getELFFile()); + else if (const auto *ELF = dyn_cast(&Obj)) + return parseELF(ELF->getELFFile()); + + LLVM_DEBUG(dbgs() << "parseELFDeps: Unknown ELF format\n";); + return createStringError(std::errc::not_supported, "Unknown ELF format"); +} + +Expected LibraryScanner::extractDeps(StringRef FilePath) { + LLVM_DEBUG(dbgs() << "extractDeps: Attempting to open file " << FilePath + << "\n";); + + ObjectFileLoader ObjLoader(FilePath); + auto ObjOrErr = ObjLoader.getObjectFile(); + if (!ObjOrErr) { + LLVM_DEBUG(dbgs() << "extractDeps: Failed to open " << FilePath << "\n";); + return ObjOrErr.takeError(); + } + + object::ObjectFile *Obj = &ObjOrErr.get(); + + if (auto *elfObj = dyn_cast(Obj)) { + LLVM_DEBUG(dbgs() << "extractDeps: File " << FilePath + << " is an ELF object\n";); + + return parseELFDeps(*elfObj); + } + + if (auto *macho = dyn_cast(Obj)) { + LLVM_DEBUG(dbgs() << "extractDeps: File " << FilePath + << " is a Mach-O object\n";); + return parseMachODeps(*macho); + } + + 
if (Obj->isCOFF()) { + // TODO: COFF support + return LibraryDepsInfo(); + } + + LLVM_DEBUG(dbgs() << "extractDeps: Unsupported binary format for file " + << FilePath << "\n";); + return createStringError(inconvertibleErrorCode(), + "Unsupported binary format: %s", + FilePath.str().c_str()); +} + +std::optional LibraryScanner::shouldScan(StringRef FilePath) { + std::error_code EC; + + LLVM_DEBUG(dbgs() << "[shouldScan] Checking: " << FilePath << "\n";); + + // [1] Check file existence early + if (!sys::fs::exists(FilePath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: file does not exist.\n";); + + return std::nullopt; + } + + // [2] Resolve to canonical path + auto CanonicalPathOpt = ScanHelper.resolve(FilePath, EC); + if (EC || !CanonicalPathOpt) { + LLVM_DEBUG(dbgs() << " -> Skipped: failed to resolve path (EC=" + << EC.message() << ").\n";); + + return std::nullopt; + } + + const std::string &CanonicalPath = *CanonicalPathOpt; + LLVM_DEBUG(dbgs() << " -> Canonical path: " << CanonicalPath << "\n"); + + // [3] Check if it's a directory — skip directories + if (sys::fs::is_directory(CanonicalPath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: path is a directory.\n";); + + return std::nullopt; + } + + // [4] Skip if it's not a shared library. + if (!DylibPathValidator::isSharedLibrary(CanonicalPath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: not a shared library.\n";); + return std::nullopt; + } + + // [5] Skip if we've already seen this path (via cache) + if (ScanHelper.hasSeenOrMark(CanonicalPath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: already seen.\n";); + + return std::nullopt; + } + + // [6] Already tracked in LibraryManager? 
+ if (LibMgr.hasLibrary(CanonicalPath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: already tracked by LibraryManager.\n";); + + return std::nullopt; + } + + // [7] Run user-defined hook (default: always true) + if (!ShouldScanCall(CanonicalPath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: user-defined hook rejected.\n";); + + return std::nullopt; + } + + LLVM_DEBUG(dbgs() << " -> Accepted: ready to scan " << CanonicalPath + << "\n";); + return CanonicalPath; +} + +void LibraryScanner::handleLibrary(StringRef FilePath, PathType K, int level) { + LLVM_DEBUG(dbgs() << "LibraryScanner::handleLibrary: Scanning: " << FilePath + << ", level=" << level << "\n";); + auto CanonPathOpt = shouldScan(FilePath); + if (!CanonPathOpt) { + LLVM_DEBUG(dbgs() << " Skipped (shouldScan returned false): " << FilePath + << "\n";); + + return; + } + const std::string CanonicalPath = *CanonPathOpt; + + auto DepsOrErr = extractDeps(CanonicalPath); + if (!DepsOrErr) { + LLVM_DEBUG(dbgs() << " Failed to extract deps for: " << CanonicalPath + << "\n";); + handleError(DepsOrErr.takeError()); + return; + } + + LibraryDepsInfo &Deps = *DepsOrErr; + + LLVM_DEBUG({ + dbgs() << " Found deps : \n"; + for (const auto &dep : Deps.deps) + dbgs() << " : " << dep << "\n"; + dbgs() << " Found @rpath : " << Deps.rpath.size() << "\n"; + for (const auto &r : Deps.rpath) + dbgs() << " : " << r << "\n"; + dbgs() << " Found @runpath : \n"; + for (const auto &r : Deps.runPath) + dbgs() << " : " << r << "\n"; + }); + + if (Deps.isPIE && level == 0) { + LLVM_DEBUG(dbgs() << " Skipped PIE executable at top level: " + << CanonicalPath << "\n";); + + return; + } + + bool Added = LibMgr.addLibrary(CanonicalPath, K); + if (!Added) { + LLVM_DEBUG(dbgs() << " Already added: " << CanonicalPath << "\n";); + return; + } + + // Heuristic 1: No RPATH/RUNPATH, skip deps + if (Deps.rpath.empty() && Deps.runPath.empty()) { + LLVM_DEBUG( + dbgs() << "LibraryScanner::handleLibrary: Skipping deps (Heuristic1): " + << CanonicalPath << 
"\n";); + return; + } + + // Heuristic 2: All RPATH and RUNPATH already tracked + auto allTracked = [&](const auto &Paths) { + LLVM_DEBUG(dbgs() << " Checking : " << Paths.size() << "\n";); + return std::all_of(Paths.begin(), Paths.end(), [&](StringRef P) { + LLVM_DEBUG(dbgs() << " Checking isTrackedBasePath : " << P << "\n";); + return ScanHelper.isTrackedBasePath( + DylibResolver::resolvelinkerFlag(P, CanonicalPath)); + }); + }; + + if (allTracked(Deps.rpath) && allTracked(Deps.runPath)) { + LLVM_DEBUG( + dbgs() << "LibraryScanner::handleLibrary: Skipping deps (Heuristic2): " + << CanonicalPath << "\n";); + return; + } + + DylibPathValidator Validator(ScanHelper.getPathResolver()); + DylibResolver Resolver(Validator); + Resolver.configure(CanonicalPath, + {{Deps.rpath, SearchPathType::RPath}, + {ScanHelper.getSearchPaths(), SearchPathType::UsrOrSys}, + {Deps.runPath, SearchPathType::RunPath}}); + for (StringRef Dep : Deps.deps) { + LLVM_DEBUG(dbgs() << " Resolving dep: " << Dep << "\n";); + auto DepFullOpt = Resolver.resolve(Dep); + if (!DepFullOpt) { + LLVM_DEBUG(dbgs() << " Failed to resolve dep: " << Dep << "\n";); + + continue; + } + LLVM_DEBUG(dbgs() << " Resolved dep to: " << *DepFullOpt << "\n";); + + handleLibrary(*DepFullOpt, K, level + 1); + } +} + +void LibraryScanner::scanBaseDir(std::shared_ptr SP) { + if (!sys::fs::is_directory(SP->BasePath) || SP->BasePath.empty()) { + LLVM_DEBUG( + dbgs() << "LibraryScanner::scanBaseDir: Invalid or empty basePath: " + << SP->BasePath << "\n";); + return; + } + + LLVM_DEBUG(dbgs() << "LibraryScanner::scanBaseDir: Scanning directory: " + << SP->BasePath << "\n";); + std::error_code EC; + + SP->State.store(ScanState::Scanning); + + for (sys::fs::directory_iterator It(SP->BasePath, EC), end; It != end && !EC; + It.increment(EC)) { + auto Entry = *It; + if (!Entry.status()) + continue; + + auto Status = *Entry.status(); + if (sys::fs::is_regular_file(Status) || sys::fs::is_symlink_file(Status)) { + LLVM_DEBUG(dbgs() << 
" Found file: " << Entry.path() << "\n";); + // async support ? + handleLibrary(Entry.path(), SP->Kind); + } + } + + SP->State.store(ScanState::Scanned); +} + +void LibraryScanner::scanNext(PathType K, size_t BatchSize) { + LLVM_DEBUG(dbgs() << "LibraryScanner::scanNext: Scanning next batch of size " + << BatchSize << " for kind " + << (K == PathType::User ? "User" : "System") << "\n";); + + auto SearchPaths = ScanHelper.getNextBatch(K, BatchSize); + for (auto &SP : SearchPaths) { + LLVM_DEBUG(dbgs() << " Scanning unit with basePath: " << SP->BasePath + << "\n";); + + scanBaseDir(SP); + } +} + +} // end namespace llvm::orc diff --git a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt index b06aa2565bb04..7b563d7bcc68c 100644 --- a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt +++ b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt @@ -26,6 +26,7 @@ add_llvm_unittest(OrcJITTests IndirectionUtilsTest.cpp JITTargetMachineBuilderTest.cpp LazyCallThroughAndReexportsTest.cpp + LibraryResolverTest.cpp LookupAndRecordAddrsTest.cpp MachOPlatformTest.cpp MapperJITLinkMemoryManagerTest.cpp diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_linux.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_linux.yaml new file mode 100644 index 0000000000000..afd1d9e69448d --- /dev/null +++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_linux.yaml @@ -0,0 +1,460 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .rela.plt + Align: 0x1000 + Offset: 0x0 + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + FirstSec: .init + LastSec: .fini + VAddr: 0x1000 + Align: 0x1000 + Offset: 0x1000 + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .rodata + LastSec: .eh_frame + VAddr: 0x2000 + Align: 0x1000 + Offset: 0x2000 + - Type: PT_LOAD + Flags: [ PF_W, PF_R ] + FirstSec: .init_array + LastSec: 
.bss + VAddr: 0x3E10 + Align: 0x1000 + Offset: 0x2E10 + - Type: PT_DYNAMIC + Flags: [ PF_W, PF_R ] + FirstSec: .dynamic + LastSec: .dynamic + VAddr: 0x3E20 + Align: 0x8 + Offset: 0x2E20 + - Type: PT_NOTE + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .note.gnu.property + VAddr: 0x2A8 + Align: 0x8 + Offset: 0x2A8 + - Type: PT_NOTE + Flags: [ PF_R ] + FirstSec: .note.gnu.build-id + LastSec: .note.gnu.build-id + VAddr: 0x2C8 + Align: 0x4 + Offset: 0x2C8 + - Type: PT_GNU_PROPERTY + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .note.gnu.property + VAddr: 0x2A8 + Align: 0x8 + Offset: 0x2A8 + - Type: PT_GNU_EH_FRAME + Flags: [ PF_R ] + FirstSec: .eh_frame_hdr + LastSec: .eh_frame_hdr + VAddr: 0x2010 + Align: 0x4 + Offset: 0x2010 + - Type: PT_GNU_STACK + Flags: [ PF_W, PF_R ] + Align: 0x10 + Offset: 0x0 + - Type: PT_GNU_RELRO + Flags: [ PF_R ] + FirstSec: .init_array + LastSec: .got + VAddr: 0x3E10 + Offset: 0x2E10 +Sections: + - Name: .note.gnu.property + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Address: 0x2A8 + AddressAlign: 0x8 + Notes: + - Name: GNU + Desc: 020000C0040000000300000000000000 + Type: NT_GNU_PROPERTY_TYPE_0 + - Name: .note.gnu.build-id + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Address: 0x2C8 + AddressAlign: 0x4 + Notes: + - Name: GNU + Desc: 73604396C95840D5C380A0950F085A778F94EE7C + Type: NT_PRPSINFO + - Name: .gnu.hash + Type: SHT_GNU_HASH + Flags: [ SHF_ALLOC ] + Address: 0x2F0 + Link: .dynsym + AddressAlign: 0x8 + Header: + SymNdx: 0x6 + Shift2: 0x6 + BloomFilter: [ 0x400000080000 ] + HashBuckets: [ 0x0, 0x6 ] + HashValues: [ 0x7C9DCB93 ] + - Name: .dynsym + Type: SHT_DYNSYM + Flags: [ SHF_ALLOC ] + Address: 0x318 + Link: .dynstr + AddressAlign: 0x8 + - Name: .dynstr + Type: SHT_STRTAB + Flags: [ SHF_ALLOC ] + Address: 0x3C0 + AddressAlign: 0x1 + - Name: .gnu.version + Type: SHT_GNU_versym + Flags: [ SHF_ALLOC ] + Address: 0x436 + Link: .dynsym + AddressAlign: 0x2 + Entries: [ 0, 1, 2, 1, 1, 2, 1 ] + - Name: .gnu.version_r + 
Type: SHT_GNU_verneed + Flags: [ SHF_ALLOC ] + Address: 0x448 + Link: .dynstr + AddressAlign: 0x8 + Dependencies: + - Version: 1 + File: libc.so.6 + Entries: + - Name: GLIBC_2.2.5 + Hash: 157882997 + Flags: 0 + Other: 2 + - Name: .rela.dyn + Type: SHT_RELA + Flags: [ SHF_ALLOC ] + Address: 0x468 + Link: .dynsym + AddressAlign: 0x8 + Relocations: + - Offset: 0x3E10 + Type: R_X86_64_RELATIVE + Addend: 4368 + - Offset: 0x3E18 + Type: R_X86_64_RELATIVE + Addend: 4304 + - Offset: 0x4020 + Type: R_X86_64_RELATIVE + Addend: 16416 + - Offset: 0x3FE0 + Symbol: _ITM_deregisterTMCloneTable + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FE8 + Symbol: __gmon_start__ + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FF0 + Symbol: _ITM_registerTMCloneTable + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FF8 + Symbol: __cxa_finalize + Type: R_X86_64_GLOB_DAT + - Name: .rela.plt + Type: SHT_RELA + Flags: [ SHF_ALLOC, SHF_INFO_LINK ] + Address: 0x510 + Link: .dynsym + AddressAlign: 0x8 + Info: .got.plt + Relocations: + - Offset: 0x4018 + Symbol: puts + Type: R_X86_64_JUMP_SLOT + - Name: .init + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1000 + AddressAlign: 0x4 + Offset: 0x1000 + Content: F30F1EFA4883EC08488B05D92F00004885C07402FFD04883C408C3 + - Name: .plt + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1020 + AddressAlign: 0x10 + EntSize: 0x10 + Content: FF35E22F0000F2FF25E32F00000F1F00F30F1EFA6800000000F2E9E1FFFFFF90 + - Name: .plt.got + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1040 + AddressAlign: 0x10 + EntSize: 0x10 + Content: F30F1EFAF2FF25AD2F00000F1F440000 + - Name: .plt.sec + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1050 + AddressAlign: 0x10 + EntSize: 0x10 + Content: F30F1EFAF2FF25BD2F00000F1F440000 + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1060 + AddressAlign: 0x10 + Content: 
488D3DC12F0000488D05BA2F00004839F87415488B05662F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D912F0000488D358A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05352F00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D4D2F000000752B5548833D122F0000004889E5740C488B3D2E2F0000E849FFFFFFE864FFFFFFC605252F0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5488D05D80E00004889C7E820FFFFFF905DC3 + - Name: .fini + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1134 + AddressAlign: 0x4 + Content: F30F1EFA4883EC084883C408C3 + - Name: .rodata + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x2000 + AddressAlign: 0x1 + Offset: 0x2000 + Content: 48656C6C6F2066726F6D204100 + - Name: .eh_frame_hdr + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x2010 + AddressAlign: 0x4 + Content: 011B033B2C0000000400000010F0FFFF4800000030F0FFFF7000000040F0FFFF8800000009F1FFFFA0000000 + - Name: .eh_frame + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x2040 + AddressAlign: 0x8 + Content: 1400000000000000017A5200017810011B0C070890010000240000001C000000C0EFFFFF20000000000E10460E184A0F0B770880003F1A3A2A332422000000001400000044000000B8EFFFFF100000000000000000000000140000005C000000B0EFFFFF1000000000000000000000001C0000007400000061F0FFFF1A00000000450E108602430D06510C070800000000000000 + - Name: .init_array + Type: SHT_INIT_ARRAY + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3E10 + AddressAlign: 0x8 + EntSize: 0x8 + Offset: 0x2E10 + Content: '1011000000000000' + - Name: .fini_array + Type: SHT_FINI_ARRAY + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3E18 + AddressAlign: 0x8 + EntSize: 0x8 + Content: D010000000000000 + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3E20 + Link: .dynstr + AddressAlign: 0x8 + Entries: + - Tag: DT_NEEDED + Value: 0x5F + - Tag: DT_INIT + Value: 0x1000 + - Tag: DT_FINI + Value: 0x1134 + - Tag: DT_INIT_ARRAY + Value: 0x3E10 + - Tag: DT_INIT_ARRAYSZ + Value: 0x8 + 
- Tag: DT_FINI_ARRAY + Value: 0x3E18 + - Tag: DT_FINI_ARRAYSZ + Value: 0x8 + - Tag: DT_GNU_HASH + Value: 0x2F0 + - Tag: DT_STRTAB + Value: 0x3C0 + - Tag: DT_SYMTAB + Value: 0x318 + - Tag: DT_STRSZ + Value: 0x75 + - Tag: DT_SYMENT + Value: 0x18 + - Tag: DT_PLTGOT + Value: 0x4000 + - Tag: DT_PLTRELSZ + Value: 0x18 + - Tag: DT_PLTREL + Value: 0x7 + - Tag: DT_JMPREL + Value: 0x510 + - Tag: DT_RELA + Value: 0x468 + - Tag: DT_RELASZ + Value: 0xA8 + - Tag: DT_RELAENT + Value: 0x18 + - Tag: DT_VERNEED + Value: 0x448 + - Tag: DT_VERNEEDNUM + Value: 0x1 + - Tag: DT_VERSYM + Value: 0x436 + - Tag: DT_RELACOUNT + Value: 0x3 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Name: .got + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3FE0 + AddressAlign: 0x8 + EntSize: 0x8 + Content: '0000000000000000000000000000000000000000000000000000000000000000' + - Name: .got.plt + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4000 + AddressAlign: 0x8 + EntSize: 0x8 + Content: '203E000000000000000000000000000000000000000000003010000000000000' + - Name: .data + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4020 + AddressAlign: 0x8 + Content: '2040000000000000' + - Name: .bss + Type: SHT_NOBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4028 + AddressAlign: 0x1 + Size: 0x8 + - Name: .comment + Type: SHT_PROGBITS + Flags: [ SHF_MERGE, SHF_STRINGS ] + AddressAlign: 0x1 + EntSize: 0x1 + Content: 4743433A20285562756E74752031312E342E302D317562756E7475317E32322E30342E32292031312E342E3000 +Symbols: + - Name: crtstuff.c + Type: STT_FILE + Index: SHN_ABS + - Name: deregister_tm_clones + Type: STT_FUNC + Section: .text + Value: 0x1060 + - Name: register_tm_clones + Type: STT_FUNC + Section: .text + Value: 0x1090 + - Name: __do_global_dtors_aux + Type: STT_FUNC + Section: .text + Value: 0x10D0 + - Name: completed.0 + Type: STT_OBJECT + 
Section: .bss + Value: 0x4028 + Size: 0x1 + - Name: __do_global_dtors_aux_fini_array_entry + Type: STT_OBJECT + Section: .fini_array + Value: 0x3E18 + - Name: frame_dummy + Type: STT_FUNC + Section: .text + Value: 0x1110 + - Name: __frame_dummy_init_array_entry + Type: STT_OBJECT + Section: .init_array + Value: 0x3E10 + - Name: libA.c + Type: STT_FILE + Index: SHN_ABS + - Name: 'crtstuff.c (1)' + Type: STT_FILE + Index: SHN_ABS + - Name: __FRAME_END__ + Type: STT_OBJECT + Section: .eh_frame + Value: 0x20D0 + - Type: STT_FILE + Index: SHN_ABS + - Name: _fini + Type: STT_FUNC + Section: .fini + Value: 0x1134 + - Name: __dso_handle + Type: STT_OBJECT + Section: .data + Value: 0x4020 + - Name: _DYNAMIC + Type: STT_OBJECT + Section: .dynamic + Value: 0x3E20 + - Name: __GNU_EH_FRAME_HDR + Section: .eh_frame_hdr + Value: 0x2010 + - Name: __TMC_END__ + Type: STT_OBJECT + Section: .data + Value: 0x4028 + - Name: _GLOBAL_OFFSET_TABLE_ + Type: STT_OBJECT + Section: .got.plt + Value: 0x4000 + - Name: _init + Type: STT_FUNC + Section: .init + Value: 0x1000 + - Name: _ITM_deregisterTMCloneTable + Binding: STB_WEAK + - Name: 'puts@GLIBC_2.2.5' + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: sayA + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1119 + Size: 0x1A + - Name: __gmon_start__ + Binding: STB_WEAK + - Name: _ITM_registerTMCloneTable + Binding: STB_WEAK + - Name: '__cxa_finalize@GLIBC_2.2.5' + Type: STT_FUNC + Binding: STB_WEAK +DynamicSymbols: + - Name: _ITM_deregisterTMCloneTable + Binding: STB_WEAK + - Name: puts + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: __gmon_start__ + Binding: STB_WEAK + - Name: _ITM_registerTMCloneTable + Binding: STB_WEAK + - Name: __cxa_finalize + Type: STT_FUNC + Binding: STB_WEAK + - Name: sayA + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1119 + Size: 0x1A +... 
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_macho.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_macho.yaml new file mode 100644 index 0000000000000..2e851a90c21ed --- /dev/null +++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_macho.yaml @@ -0,0 +1,723 @@ +--- !fat-mach-o +FatHeader: + magic: 0xCAFEBABE + nfat_arch: 3 +FatArchs: + - cputype: 0x1000007 + cpusubtype: 0x3 + offset: 0x1000 + size: 8376 + align: 12 + - cputype: 0x100000C + cpusubtype: 0x0 + offset: 0x4000 + size: 33376 + align: 14 + - cputype: 0x100000C + cpusubtype: 0x80000002 + offset: 0x10000 + size: 33376 + align: 14 +Slices: + - !mach-o + FileHeader: + magic: 0xFEEDFACF + cputype: 0x1000007 + cpusubtype: 0x3 + filetype: 0x6 + ncmds: 14 + sizeofcmds: 960 + flags: 0x100085 + reserved: 0x0 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 392 + segname: __TEXT + vmaddr: 0 + vmsize: 4096 + fileoff: 0 + filesize: 4096 + maxprot: 5 + initprot: 5 + nsects: 4 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0xF80 + size: 20 + offset: 0xF80 + align: 4 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 554889E5488D3D0F000000B000E8020000005DC3 + - sectname: __stubs + segname: __TEXT + addr: 0xF94 + size: 6 + offset: 0xF94 + align: 1 + reloff: 0x0 + nreloc: 0 + flags: 0x80000408 + reserved1: 0x0 + reserved2: 0x6 + reserved3: 0x0 + content: FF2566000000 + - sectname: __cstring + segname: __TEXT + addr: 0xF9A + size: 14 + offset: 0xF9A + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 48656C6C6F2066726F6D20410A00 + - sectname: __unwind_info + segname: __TEXT + addr: 0xFA8 + size: 88 + offset: 0xFA8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 
010000001C000000000000001C000000000000001C00000002000000800F00004000000040000000940F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000100000000 + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __DATA_CONST + vmaddr: 4096 + vmsize: 4096 + fileoff: 4096 + filesize: 4096 + maxprot: 3 + initprot: 3 + nsects: 1 + flags: 16 + Sections: + - sectname: __got + segname: __DATA_CONST + addr: 0x1000 + size: 8 + offset: 0x1000 + align: 3 + reloff: 0x0 + nreloc: 0 + flags: 0x6 + reserved1: 0x1 + reserved2: 0x0 + reserved3: 0x0 + content: '0000000000000080' + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 8192 + vmsize: 4096 + fileoff: 8192 + filesize: 184 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_ID_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 1 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libA.dylib' + ZeroPadBytes: 7 + - cmd: LC_DYLD_CHAINED_FIXUPS + cmdsize: 16 + dataoff: 8192 + datasize: 96 + - cmd: LC_DYLD_EXPORTS_TRIE + cmdsize: 16 + dataoff: 8288 + datasize: 24 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 8320 + nsyms: 2 + stroff: 8360 + strsize: 16 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 1 + iundefsym: 1 + nundefsym: 1 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 8352 + nindirectsyms: 2 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 + - cmd: LC_UUID + cmdsize: 24 + uuid: ADFFA141-C3EE-37CD-B1E7-906D69F81BCB + - cmd: LC_BUILD_VERSION + cmdsize: 32 + platform: 1 + minos: 983040 + sdk: 983552 + ntools: 1 + Tools: + - tool: 3 + version: 73074435 + - cmd: LC_SOURCE_VERSION + cmdsize: 16 + version: 0 + - cmd: LC_LOAD_DYLIB + cmdsize: 56 + dylib: + name: 24 + timestamp: 2 + current_version: 88539136 + compatibility_version: 65536 + Content: '/usr/lib/libSystem.B.dylib' + ZeroPadBytes: 6 + - cmd: LC_FUNCTION_STARTS + cmdsize: 16 + dataoff: 8312 + datasize: 8 + 
- cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 8320 + datasize: 0 + LinkEditData: + ExportTrie: + TerminalSize: 0 + NodeOffset: 0 + Name: '' + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + Children: + - TerminalSize: 3 + NodeOffset: 13 + Name: _sayA + Flags: 0x0 + Address: 0xF80 + Other: 0x0 + ImportName: '' + NameList: + - n_strx: 2 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 3968 + - n_strx: 8 + n_type: 0x1 + n_sect: 0 + n_desc: 256 + n_value: 0 + StringTable: + - ' ' + - _sayA + - _printf + IndirectSymbols: [ 0x1, 0x1 ] + FunctionStarts: [ 0xF80 ] + ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, + 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, + 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0, + 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, + 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0 ] + - !mach-o + FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x6 + ncmds: 15 + sizeofcmds: 976 + flags: 0x100085 + reserved: 0x0 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 392 + segname: __TEXT + vmaddr: 0 + vmsize: 16384 + fileoff: 0 + filesize: 16384 + maxprot: 5 + initprot: 5 + nsects: 4 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x3F70 + size: 28 + offset: 0x3F70 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8C0035FD6 + - sectname: __stubs + segname: __TEXT + addr: 0x3F8C + size: 12 + offset: 0x3F8C + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000408 + reserved1: 0x0 + reserved2: 0xC + reserved3: 0x0 + content: 100000B0100240F900021FD6 + - sectname: __cstring + segname: __TEXT + 
addr: 0x3F98 + size: 14 + offset: 0x3F98 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 48656C6C6F2066726F6D20410A00 + - sectname: __unwind_info + segname: __TEXT + addr: 0x3FA8 + size: 88 + offset: 0x3FA8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 010000001C000000000000001C000000000000001C00000002000000703F000040000000400000008C3F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000 + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __DATA_CONST + vmaddr: 16384 + vmsize: 16384 + fileoff: 16384 + filesize: 16384 + maxprot: 3 + initprot: 3 + nsects: 1 + flags: 16 + Sections: + - sectname: __got + segname: __DATA_CONST + addr: 0x4000 + size: 8 + offset: 0x4000 + align: 3 + reloff: 0x0 + nreloc: 0 + flags: 0x6 + reserved1: 0x1 + reserved2: 0x0 + reserved3: 0x0 + content: '0000000000000080' + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 32768 + vmsize: 16384 + fileoff: 32768 + filesize: 608 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_ID_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 1 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libA.dylib' + ZeroPadBytes: 7 + - cmd: LC_DYLD_CHAINED_FIXUPS + cmdsize: 16 + dataoff: 32768 + datasize: 96 + - cmd: LC_DYLD_EXPORTS_TRIE + cmdsize: 16 + dataoff: 32864 + datasize: 24 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 32896 + nsyms: 2 + stroff: 32936 + strsize: 16 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 1 + iundefsym: 1 + nundefsym: 1 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 32928 + nindirectsyms: 2 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 + - cmd: LC_UUID + cmdsize: 24 + uuid: C45227E0-C6C0-3137-969B-36AABF9D5487 + - cmd: LC_BUILD_VERSION + cmdsize: 32 + platform: 1 + 
minos: 983040 + sdk: 983552 + ntools: 1 + Tools: + - tool: 3 + version: 73074435 + - cmd: LC_SOURCE_VERSION + cmdsize: 16 + version: 0 + - cmd: LC_LOAD_DYLIB + cmdsize: 56 + dylib: + name: 24 + timestamp: 2 + current_version: 88539136 + compatibility_version: 65536 + Content: '/usr/lib/libSystem.B.dylib' + ZeroPadBytes: 6 + - cmd: LC_FUNCTION_STARTS + cmdsize: 16 + dataoff: 32888 + datasize: 8 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 32896 + datasize: 0 + - cmd: LC_CODE_SIGNATURE + cmdsize: 16 + dataoff: 32960 + datasize: 416 + LinkEditData: + ExportTrie: + TerminalSize: 0 + NodeOffset: 0 + Name: '' + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + Children: + - TerminalSize: 3 + NodeOffset: 13 + Name: _sayA + Flags: 0x0 + Address: 0x3F70 + Other: 0x0 + ImportName: '' + NameList: + - n_strx: 2 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 16240 + - n_strx: 8 + n_type: 0x1 + n_sect: 0 + n_desc: 256 + n_value: 0 + StringTable: + - ' ' + - _sayA + - _printf + IndirectSymbols: [ 0x1, 0x1 ] + FunctionStarts: [ 0x3F70 ] + ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, + 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, + 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0x6, 0x0, + 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, + 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0 ] + - !mach-o + FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x80000002 + filetype: 0x6 + ncmds: 15 + sizeofcmds: 976 + flags: 0x100085 + reserved: 0x0 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 392 + segname: __TEXT + vmaddr: 0 + vmsize: 16384 + fileoff: 0 + filesize: 16384 + maxprot: 5 + initprot: 5 + nsects: 4 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 
0x3F68 + size: 32 + offset: 0x3F68 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 7F2303D5FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8FF0F5FD6 + - sectname: __auth_stubs + segname: __TEXT + addr: 0x3F88 + size: 16 + offset: 0x3F88 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000408 + reserved1: 0x0 + reserved2: 0x10 + reserved3: 0x0 + content: 110000B031020091300240F9110A1FD7 + - sectname: __cstring + segname: __TEXT + addr: 0x3F98 + size: 14 + offset: 0x3F98 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 48656C6C6F2066726F6D20410A00 + - sectname: __unwind_info + segname: __TEXT + addr: 0x3FA8 + size: 88 + offset: 0x3FA8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 010000001C000000000000001C000000000000001C00000002000000683F00004000000040000000883F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000 + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __DATA_CONST + vmaddr: 16384 + vmsize: 16384 + fileoff: 16384 + filesize: 16384 + maxprot: 3 + initprot: 3 + nsects: 1 + flags: 16 + Sections: + - sectname: __auth_got + segname: __DATA_CONST + addr: 0x4000 + size: 8 + offset: 0x4000 + align: 3 + reloff: 0x0 + nreloc: 0 + flags: 0x6 + reserved1: 0x1 + reserved2: 0x0 + reserved3: 0x0 + content: 00000000000001C0 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 32768 + vmsize: 16384 + fileoff: 32768 + filesize: 608 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_ID_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 1 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libA.dylib' + ZeroPadBytes: 7 + - cmd: LC_DYLD_CHAINED_FIXUPS + cmdsize: 16 + dataoff: 32768 + datasize: 96 + - cmd: LC_DYLD_EXPORTS_TRIE + cmdsize: 16 + dataoff: 32864 + datasize: 24 + - cmd: LC_SYMTAB + 
cmdsize: 24 + symoff: 32896 + nsyms: 2 + stroff: 32936 + strsize: 16 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 1 + iundefsym: 1 + nundefsym: 1 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 32928 + nindirectsyms: 2 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 + - cmd: LC_UUID + cmdsize: 24 + uuid: C9DC00C2-E721-365C-9C2D-E9FDB7C838BB + - cmd: LC_BUILD_VERSION + cmdsize: 32 + platform: 1 + minos: 983040 + sdk: 983552 + ntools: 1 + Tools: + - tool: 3 + version: 73074435 + - cmd: LC_SOURCE_VERSION + cmdsize: 16 + version: 0 + - cmd: LC_LOAD_DYLIB + cmdsize: 56 + dylib: + name: 24 + timestamp: 2 + current_version: 88539136 + compatibility_version: 65536 + Content: '/usr/lib/libSystem.B.dylib' + ZeroPadBytes: 6 + - cmd: LC_FUNCTION_STARTS + cmdsize: 16 + dataoff: 32888 + datasize: 8 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 32896 + datasize: 0 + - cmd: LC_CODE_SIGNATURE + cmdsize: 16 + dataoff: 32960 + datasize: 416 + LinkEditData: + ExportTrie: + TerminalSize: 0 + NodeOffset: 0 + Name: '' + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + Children: + - TerminalSize: 3 + NodeOffset: 13 + Name: _sayA + Flags: 0x0 + Address: 0x3F68 + Other: 0x0 + ImportName: '' + NameList: + - n_strx: 2 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 16232 + - n_strx: 8 + n_type: 0x1 + n_sect: 0 + n_desc: 256 + n_value: 0 + StringTable: + - ' ' + - _sayA + - _printf + IndirectSymbols: [ 0x1, 0x1 ] + FunctionStarts: [ 0x3F68 ] + ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, + 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, + 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0xC, 0x0, + 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, + 0x0, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, + 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0 ] +... diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_linux.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_linux.yaml new file mode 100644 index 0000000000000..fe4393e108d96 --- /dev/null +++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_linux.yaml @@ -0,0 +1,460 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .rela.plt + Align: 0x1000 + Offset: 0x0 + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + FirstSec: .init + LastSec: .fini + VAddr: 0x1000 + Align: 0x1000 + Offset: 0x1000 + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .rodata + LastSec: .eh_frame + VAddr: 0x2000 + Align: 0x1000 + Offset: 0x2000 + - Type: PT_LOAD + Flags: [ PF_W, PF_R ] + FirstSec: .init_array + LastSec: .bss + VAddr: 0x3E10 + Align: 0x1000 + Offset: 0x2E10 + - Type: PT_DYNAMIC + Flags: [ PF_W, PF_R ] + FirstSec: .dynamic + LastSec: .dynamic + VAddr: 0x3E20 + Align: 0x8 + Offset: 0x2E20 + - Type: PT_NOTE + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .note.gnu.property + VAddr: 0x2A8 + Align: 0x8 + Offset: 0x2A8 + - Type: PT_NOTE + Flags: [ PF_R ] + FirstSec: .note.gnu.build-id + LastSec: .note.gnu.build-id + VAddr: 0x2C8 + Align: 0x4 + Offset: 0x2C8 + - Type: PT_GNU_PROPERTY + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .note.gnu.property + VAddr: 0x2A8 + Align: 0x8 + Offset: 0x2A8 + - Type: PT_GNU_EH_FRAME + Flags: [ PF_R ] + FirstSec: .eh_frame_hdr + LastSec: .eh_frame_hdr + VAddr: 0x2010 + Align: 0x4 + Offset: 0x2010 + - Type: PT_GNU_STACK + Flags: [ PF_W, PF_R ] + Align: 0x10 + Offset: 0x0 + - Type: PT_GNU_RELRO + Flags: [ PF_R ] + FirstSec: .init_array + LastSec: .got + VAddr: 0x3E10 + Offset: 0x2E10 +Sections: + - Name: .note.gnu.property + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Address: 0x2A8 + 
AddressAlign: 0x8 + Notes: + - Name: GNU + Desc: 020000C0040000000300000000000000 + Type: NT_GNU_PROPERTY_TYPE_0 + - Name: .note.gnu.build-id + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Address: 0x2C8 + AddressAlign: 0x4 + Notes: + - Name: GNU + Desc: 6337F7C1BF21A1DE17630C55602EB4CAC50435BB + Type: NT_PRPSINFO + - Name: .gnu.hash + Type: SHT_GNU_HASH + Flags: [ SHF_ALLOC ] + Address: 0x2F0 + Link: .dynsym + AddressAlign: 0x8 + Header: + SymNdx: 0x6 + Shift2: 0x6 + BloomFilter: [ 0x400000100000 ] + HashBuckets: [ 0x6, 0x0 ] + HashValues: [ 0x7C9DCB95 ] + - Name: .dynsym + Type: SHT_DYNSYM + Flags: [ SHF_ALLOC ] + Address: 0x318 + Link: .dynstr + AddressAlign: 0x8 + - Name: .dynstr + Type: SHT_STRTAB + Flags: [ SHF_ALLOC ] + Address: 0x3C0 + AddressAlign: 0x1 + - Name: .gnu.version + Type: SHT_GNU_versym + Flags: [ SHF_ALLOC ] + Address: 0x436 + Link: .dynsym + AddressAlign: 0x2 + Entries: [ 0, 1, 2, 1, 1, 2, 1 ] + - Name: .gnu.version_r + Type: SHT_GNU_verneed + Flags: [ SHF_ALLOC ] + Address: 0x448 + Link: .dynstr + AddressAlign: 0x8 + Dependencies: + - Version: 1 + File: libc.so.6 + Entries: + - Name: GLIBC_2.2.5 + Hash: 157882997 + Flags: 0 + Other: 2 + - Name: .rela.dyn + Type: SHT_RELA + Flags: [ SHF_ALLOC ] + Address: 0x468 + Link: .dynsym + AddressAlign: 0x8 + Relocations: + - Offset: 0x3E10 + Type: R_X86_64_RELATIVE + Addend: 4368 + - Offset: 0x3E18 + Type: R_X86_64_RELATIVE + Addend: 4304 + - Offset: 0x4020 + Type: R_X86_64_RELATIVE + Addend: 16416 + - Offset: 0x3FE0 + Symbol: _ITM_deregisterTMCloneTable + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FE8 + Symbol: __gmon_start__ + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FF0 + Symbol: _ITM_registerTMCloneTable + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FF8 + Symbol: __cxa_finalize + Type: R_X86_64_GLOB_DAT + - Name: .rela.plt + Type: SHT_RELA + Flags: [ SHF_ALLOC, SHF_INFO_LINK ] + Address: 0x510 + Link: .dynsym + AddressAlign: 0x8 + Info: .got.plt + Relocations: + - Offset: 0x4018 + Symbol: puts + Type: 
R_X86_64_JUMP_SLOT + - Name: .init + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1000 + AddressAlign: 0x4 + Offset: 0x1000 + Content: F30F1EFA4883EC08488B05D92F00004885C07402FFD04883C408C3 + - Name: .plt + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1020 + AddressAlign: 0x10 + EntSize: 0x10 + Content: FF35E22F0000F2FF25E32F00000F1F00F30F1EFA6800000000F2E9E1FFFFFF90 + - Name: .plt.got + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1040 + AddressAlign: 0x10 + EntSize: 0x10 + Content: F30F1EFAF2FF25AD2F00000F1F440000 + - Name: .plt.sec + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1050 + AddressAlign: 0x10 + EntSize: 0x10 + Content: F30F1EFAF2FF25BD2F00000F1F440000 + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1060 + AddressAlign: 0x10 + Content: 488D3DC12F0000488D05BA2F00004839F87415488B05662F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D912F0000488D358A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05352F00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D4D2F000000752B5548833D122F0000004889E5740C488B3D2E2F0000E849FFFFFFE864FFFFFFC605252F0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5488D05D80E00004889C7E820FFFFFF905DC3 + - Name: .fini + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1134 + AddressAlign: 0x4 + Content: F30F1EFA4883EC084883C408C3 + - Name: .rodata + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x2000 + AddressAlign: 0x1 + Offset: 0x2000 + Content: 48656C6C6F2066726F6D204200 + - Name: .eh_frame_hdr + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x2010 + AddressAlign: 0x4 + Content: 011B033B2C0000000400000010F0FFFF4800000030F0FFFF7000000040F0FFFF8800000009F1FFFFA0000000 + - Name: .eh_frame + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x2040 + AddressAlign: 0x8 + Content: 
1400000000000000017A5200017810011B0C070890010000240000001C000000C0EFFFFF20000000000E10460E184A0F0B770880003F1A3A2A332422000000001400000044000000B8EFFFFF100000000000000000000000140000005C000000B0EFFFFF1000000000000000000000001C0000007400000061F0FFFF1A00000000450E108602430D06510C070800000000000000 + - Name: .init_array + Type: SHT_INIT_ARRAY + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3E10 + AddressAlign: 0x8 + EntSize: 0x8 + Offset: 0x2E10 + Content: '1011000000000000' + - Name: .fini_array + Type: SHT_FINI_ARRAY + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3E18 + AddressAlign: 0x8 + EntSize: 0x8 + Content: D010000000000000 + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3E20 + Link: .dynstr + AddressAlign: 0x8 + Entries: + - Tag: DT_NEEDED + Value: 0x5F + - Tag: DT_INIT + Value: 0x1000 + - Tag: DT_FINI + Value: 0x1134 + - Tag: DT_INIT_ARRAY + Value: 0x3E10 + - Tag: DT_INIT_ARRAYSZ + Value: 0x8 + - Tag: DT_FINI_ARRAY + Value: 0x3E18 + - Tag: DT_FINI_ARRAYSZ + Value: 0x8 + - Tag: DT_GNU_HASH + Value: 0x2F0 + - Tag: DT_STRTAB + Value: 0x3C0 + - Tag: DT_SYMTAB + Value: 0x318 + - Tag: DT_STRSZ + Value: 0x75 + - Tag: DT_SYMENT + Value: 0x18 + - Tag: DT_PLTGOT + Value: 0x4000 + - Tag: DT_PLTRELSZ + Value: 0x18 + - Tag: DT_PLTREL + Value: 0x7 + - Tag: DT_JMPREL + Value: 0x510 + - Tag: DT_RELA + Value: 0x468 + - Tag: DT_RELASZ + Value: 0xA8 + - Tag: DT_RELAENT + Value: 0x18 + - Tag: DT_VERNEED + Value: 0x448 + - Tag: DT_VERNEEDNUM + Value: 0x1 + - Tag: DT_VERSYM + Value: 0x436 + - Tag: DT_RELACOUNT + Value: 0x3 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Name: .got + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3FE0 + AddressAlign: 0x8 + EntSize: 0x8 + Content: '0000000000000000000000000000000000000000000000000000000000000000' + - Name: .got.plt + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + 
Address: 0x4000 + AddressAlign: 0x8 + EntSize: 0x8 + Content: '203E000000000000000000000000000000000000000000003010000000000000' + - Name: .data + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4020 + AddressAlign: 0x8 + Content: '2040000000000000' + - Name: .bss + Type: SHT_NOBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4028 + AddressAlign: 0x1 + Size: 0x8 + - Name: .comment + Type: SHT_PROGBITS + Flags: [ SHF_MERGE, SHF_STRINGS ] + AddressAlign: 0x1 + EntSize: 0x1 + Content: 4743433A20285562756E74752031312E342E302D317562756E7475317E32322E30342E32292031312E342E3000 +Symbols: + - Name: crtstuff.c + Type: STT_FILE + Index: SHN_ABS + - Name: deregister_tm_clones + Type: STT_FUNC + Section: .text + Value: 0x1060 + - Name: register_tm_clones + Type: STT_FUNC + Section: .text + Value: 0x1090 + - Name: __do_global_dtors_aux + Type: STT_FUNC + Section: .text + Value: 0x10D0 + - Name: completed.0 + Type: STT_OBJECT + Section: .bss + Value: 0x4028 + Size: 0x1 + - Name: __do_global_dtors_aux_fini_array_entry + Type: STT_OBJECT + Section: .fini_array + Value: 0x3E18 + - Name: frame_dummy + Type: STT_FUNC + Section: .text + Value: 0x1110 + - Name: __frame_dummy_init_array_entry + Type: STT_OBJECT + Section: .init_array + Value: 0x3E10 + - Name: libB.c + Type: STT_FILE + Index: SHN_ABS + - Name: 'crtstuff.c (1)' + Type: STT_FILE + Index: SHN_ABS + - Name: __FRAME_END__ + Type: STT_OBJECT + Section: .eh_frame + Value: 0x20D0 + - Type: STT_FILE + Index: SHN_ABS + - Name: _fini + Type: STT_FUNC + Section: .fini + Value: 0x1134 + - Name: __dso_handle + Type: STT_OBJECT + Section: .data + Value: 0x4020 + - Name: _DYNAMIC + Type: STT_OBJECT + Section: .dynamic + Value: 0x3E20 + - Name: __GNU_EH_FRAME_HDR + Section: .eh_frame_hdr + Value: 0x2010 + - Name: __TMC_END__ + Type: STT_OBJECT + Section: .data + Value: 0x4028 + - Name: _GLOBAL_OFFSET_TABLE_ + Type: STT_OBJECT + Section: .got.plt + Value: 0x4000 + - Name: _init + Type: STT_FUNC + Section: .init + 
Value: 0x1000 + - Name: _ITM_deregisterTMCloneTable + Binding: STB_WEAK + - Name: 'puts@GLIBC_2.2.5' + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: __gmon_start__ + Binding: STB_WEAK + - Name: sayB + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1119 + Size: 0x1A + - Name: _ITM_registerTMCloneTable + Binding: STB_WEAK + - Name: '__cxa_finalize@GLIBC_2.2.5' + Type: STT_FUNC + Binding: STB_WEAK +DynamicSymbols: + - Name: _ITM_deregisterTMCloneTable + Binding: STB_WEAK + - Name: puts + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: __gmon_start__ + Binding: STB_WEAK + - Name: _ITM_registerTMCloneTable + Binding: STB_WEAK + - Name: __cxa_finalize + Type: STT_FUNC + Binding: STB_WEAK + - Name: sayB + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1119 + Size: 0x1A +... diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_macho.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_macho.yaml new file mode 100644 index 0000000000000..3d57c4f9271c6 --- /dev/null +++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_macho.yaml @@ -0,0 +1,723 @@ +--- !fat-mach-o +FatHeader: + magic: 0xCAFEBABE + nfat_arch: 3 +FatArchs: + - cputype: 0x1000007 + cpusubtype: 0x3 + offset: 0x1000 + size: 8376 + align: 12 + - cputype: 0x100000C + cpusubtype: 0x0 + offset: 0x4000 + size: 33376 + align: 14 + - cputype: 0x100000C + cpusubtype: 0x80000002 + offset: 0x10000 + size: 33376 + align: 14 +Slices: + - !mach-o + FileHeader: + magic: 0xFEEDFACF + cputype: 0x1000007 + cpusubtype: 0x3 + filetype: 0x6 + ncmds: 14 + sizeofcmds: 960 + flags: 0x100085 + reserved: 0x0 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 392 + segname: __TEXT + vmaddr: 0 + vmsize: 4096 + fileoff: 0 + filesize: 4096 + maxprot: 5 + initprot: 5 + nsects: 4 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0xF80 + size: 20 + offset: 0xF80 + align: 4 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 
554889E5488D3D0F000000B000E8020000005DC3 + - sectname: __stubs + segname: __TEXT + addr: 0xF94 + size: 6 + offset: 0xF94 + align: 1 + reloff: 0x0 + nreloc: 0 + flags: 0x80000408 + reserved1: 0x0 + reserved2: 0x6 + reserved3: 0x0 + content: FF2566000000 + - sectname: __cstring + segname: __TEXT + addr: 0xF9A + size: 14 + offset: 0xF9A + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 48656C6C6F2066726F6D20420A00 + - sectname: __unwind_info + segname: __TEXT + addr: 0xFA8 + size: 88 + offset: 0xFA8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 010000001C000000000000001C000000000000001C00000002000000800F00004000000040000000940F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000100000000 + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __DATA_CONST + vmaddr: 4096 + vmsize: 4096 + fileoff: 4096 + filesize: 4096 + maxprot: 3 + initprot: 3 + nsects: 1 + flags: 16 + Sections: + - sectname: __got + segname: __DATA_CONST + addr: 0x1000 + size: 8 + offset: 0x1000 + align: 3 + reloff: 0x0 + nreloc: 0 + flags: 0x6 + reserved1: 0x1 + reserved2: 0x0 + reserved3: 0x0 + content: '0000000000000080' + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 8192 + vmsize: 4096 + fileoff: 8192 + filesize: 184 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_ID_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 1 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libB.dylib' + ZeroPadBytes: 7 + - cmd: LC_DYLD_CHAINED_FIXUPS + cmdsize: 16 + dataoff: 8192 + datasize: 96 + - cmd: LC_DYLD_EXPORTS_TRIE + cmdsize: 16 + dataoff: 8288 + datasize: 24 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 8320 + nsyms: 2 + stroff: 8360 + strsize: 16 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 1 + iundefsym: 1 + nundefsym: 1 + tocoff: 0 + ntoc: 0 + modtaboff: 0 
+ nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 8352 + nindirectsyms: 2 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 + - cmd: LC_UUID + cmdsize: 24 + uuid: 88B60B3C-13D3-3D7E-AEED-5F3E991FDF08 + - cmd: LC_BUILD_VERSION + cmdsize: 32 + platform: 1 + minos: 983040 + sdk: 983552 + ntools: 1 + Tools: + - tool: 3 + version: 73074435 + - cmd: LC_SOURCE_VERSION + cmdsize: 16 + version: 0 + - cmd: LC_LOAD_DYLIB + cmdsize: 56 + dylib: + name: 24 + timestamp: 2 + current_version: 88539136 + compatibility_version: 65536 + Content: '/usr/lib/libSystem.B.dylib' + ZeroPadBytes: 6 + - cmd: LC_FUNCTION_STARTS + cmdsize: 16 + dataoff: 8312 + datasize: 8 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 8320 + datasize: 0 + LinkEditData: + ExportTrie: + TerminalSize: 0 + NodeOffset: 0 + Name: '' + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + Children: + - TerminalSize: 3 + NodeOffset: 13 + Name: _sayB + Flags: 0x0 + Address: 0xF80 + Other: 0x0 + ImportName: '' + NameList: + - n_strx: 2 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 3968 + - n_strx: 8 + n_type: 0x1 + n_sect: 0 + n_desc: 256 + n_value: 0 + StringTable: + - ' ' + - _sayB + - _printf + IndirectSymbols: [ 0x1, 0x1 ] + FunctionStarts: [ 0xF80 ] + ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, + 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, + 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0, + 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, + 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0 ] + - !mach-o + FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x6 + ncmds: 15 + sizeofcmds: 976 + flags: 0x100085 + reserved: 0x0 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 
392 + segname: __TEXT + vmaddr: 0 + vmsize: 16384 + fileoff: 0 + filesize: 16384 + maxprot: 5 + initprot: 5 + nsects: 4 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x3F70 + size: 28 + offset: 0x3F70 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8C0035FD6 + - sectname: __stubs + segname: __TEXT + addr: 0x3F8C + size: 12 + offset: 0x3F8C + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000408 + reserved1: 0x0 + reserved2: 0xC + reserved3: 0x0 + content: 100000B0100240F900021FD6 + - sectname: __cstring + segname: __TEXT + addr: 0x3F98 + size: 14 + offset: 0x3F98 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 48656C6C6F2066726F6D20420A00 + - sectname: __unwind_info + segname: __TEXT + addr: 0x3FA8 + size: 88 + offset: 0x3FA8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 010000001C000000000000001C000000000000001C00000002000000703F000040000000400000008C3F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000 + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __DATA_CONST + vmaddr: 16384 + vmsize: 16384 + fileoff: 16384 + filesize: 16384 + maxprot: 3 + initprot: 3 + nsects: 1 + flags: 16 + Sections: + - sectname: __got + segname: __DATA_CONST + addr: 0x4000 + size: 8 + offset: 0x4000 + align: 3 + reloff: 0x0 + nreloc: 0 + flags: 0x6 + reserved1: 0x1 + reserved2: 0x0 + reserved3: 0x0 + content: '0000000000000080' + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 32768 + vmsize: 16384 + fileoff: 32768 + filesize: 608 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_ID_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 1 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libB.dylib' + ZeroPadBytes: 7 + - cmd: 
LC_DYLD_CHAINED_FIXUPS + cmdsize: 16 + dataoff: 32768 + datasize: 96 + - cmd: LC_DYLD_EXPORTS_TRIE + cmdsize: 16 + dataoff: 32864 + datasize: 24 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 32896 + nsyms: 2 + stroff: 32936 + strsize: 16 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 1 + iundefsym: 1 + nundefsym: 1 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 32928 + nindirectsyms: 2 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 + - cmd: LC_UUID + cmdsize: 24 + uuid: 90C3787A-22E1-35AE-9284-97A4842F88AF + - cmd: LC_BUILD_VERSION + cmdsize: 32 + platform: 1 + minos: 983040 + sdk: 983552 + ntools: 1 + Tools: + - tool: 3 + version: 73074435 + - cmd: LC_SOURCE_VERSION + cmdsize: 16 + version: 0 + - cmd: LC_LOAD_DYLIB + cmdsize: 56 + dylib: + name: 24 + timestamp: 2 + current_version: 88539136 + compatibility_version: 65536 + Content: '/usr/lib/libSystem.B.dylib' + ZeroPadBytes: 6 + - cmd: LC_FUNCTION_STARTS + cmdsize: 16 + dataoff: 32888 + datasize: 8 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 32896 + datasize: 0 + - cmd: LC_CODE_SIGNATURE + cmdsize: 16 + dataoff: 32960 + datasize: 416 + LinkEditData: + ExportTrie: + TerminalSize: 0 + NodeOffset: 0 + Name: '' + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + Children: + - TerminalSize: 3 + NodeOffset: 13 + Name: _sayB + Flags: 0x0 + Address: 0x3F70 + Other: 0x0 + ImportName: '' + NameList: + - n_strx: 2 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 16240 + - n_strx: 8 + n_type: 0x1 + n_sect: 0 + n_desc: 256 + n_value: 0 + StringTable: + - ' ' + - _sayB + - _printf + IndirectSymbols: [ 0x1, 0x1 ] + FunctionStarts: [ 0x3F70 ] + ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, + 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, + 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 
0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0x6, 0x0, + 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, + 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0 ] + - !mach-o + FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x80000002 + filetype: 0x6 + ncmds: 15 + sizeofcmds: 976 + flags: 0x100085 + reserved: 0x0 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 392 + segname: __TEXT + vmaddr: 0 + vmsize: 16384 + fileoff: 0 + filesize: 16384 + maxprot: 5 + initprot: 5 + nsects: 4 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x3F68 + size: 32 + offset: 0x3F68 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 7F2303D5FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8FF0F5FD6 + - sectname: __auth_stubs + segname: __TEXT + addr: 0x3F88 + size: 16 + offset: 0x3F88 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000408 + reserved1: 0x0 + reserved2: 0x10 + reserved3: 0x0 + content: 110000B031020091300240F9110A1FD7 + - sectname: __cstring + segname: __TEXT + addr: 0x3F98 + size: 14 + offset: 0x3F98 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 48656C6C6F2066726F6D20420A00 + - sectname: __unwind_info + segname: __TEXT + addr: 0x3FA8 + size: 88 + offset: 0x3FA8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 010000001C000000000000001C000000000000001C00000002000000683F00004000000040000000883F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000 + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __DATA_CONST + vmaddr: 16384 + vmsize: 16384 + fileoff: 16384 + filesize: 16384 + maxprot: 3 + initprot: 3 + nsects: 1 + flags: 16 + Sections: + - sectname: __auth_got + segname: __DATA_CONST + addr: 0x4000 + 
size: 8 + offset: 0x4000 + align: 3 + reloff: 0x0 + nreloc: 0 + flags: 0x6 + reserved1: 0x1 + reserved2: 0x0 + reserved3: 0x0 + content: 00000000000001C0 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 32768 + vmsize: 16384 + fileoff: 32768 + filesize: 608 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_ID_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 1 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libB.dylib' + ZeroPadBytes: 7 + - cmd: LC_DYLD_CHAINED_FIXUPS + cmdsize: 16 + dataoff: 32768 + datasize: 96 + - cmd: LC_DYLD_EXPORTS_TRIE + cmdsize: 16 + dataoff: 32864 + datasize: 24 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 32896 + nsyms: 2 + stroff: 32936 + strsize: 16 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 1 + iundefsym: 1 + nundefsym: 1 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 32928 + nindirectsyms: 2 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 + - cmd: LC_UUID + cmdsize: 24 + uuid: 76B41B3A-00EC-388B-A432-478A96772CC4 + - cmd: LC_BUILD_VERSION + cmdsize: 32 + platform: 1 + minos: 983040 + sdk: 983552 + ntools: 1 + Tools: + - tool: 3 + version: 73074435 + - cmd: LC_SOURCE_VERSION + cmdsize: 16 + version: 0 + - cmd: LC_LOAD_DYLIB + cmdsize: 56 + dylib: + name: 24 + timestamp: 2 + current_version: 88539136 + compatibility_version: 65536 + Content: '/usr/lib/libSystem.B.dylib' + ZeroPadBytes: 6 + - cmd: LC_FUNCTION_STARTS + cmdsize: 16 + dataoff: 32888 + datasize: 8 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 32896 + datasize: 0 + - cmd: LC_CODE_SIGNATURE + cmdsize: 16 + dataoff: 32960 + datasize: 416 + LinkEditData: + ExportTrie: + TerminalSize: 0 + NodeOffset: 0 + Name: '' + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + Children: + - TerminalSize: 3 + NodeOffset: 13 + Name: _sayB + Flags: 0x0 + Address: 0x3F68 + Other: 0x0 + ImportName: '' + NameList: + - 
n_strx: 2 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 16232 + - n_strx: 8 + n_type: 0x1 + n_sect: 0 + n_desc: 256 + n_value: 0 + StringTable: + - ' ' + - _sayB + - _printf + IndirectSymbols: [ 0x1, 0x1 ] + FunctionStarts: [ 0x3F68 ] + ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, + 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, + 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0xC, 0x0, + 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, + 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0 ] +... diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_linux.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_linux.yaml new file mode 100644 index 0000000000000..3fabf9a62e336 --- /dev/null +++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_linux.yaml @@ -0,0 +1,450 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .rela.plt + Align: 0x1000 + Offset: 0x0 + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + FirstSec: .init + LastSec: .fini + VAddr: 0x1000 + Align: 0x1000 + Offset: 0x1000 + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .eh_frame_hdr + LastSec: .eh_frame + VAddr: 0x2000 + Align: 0x1000 + Offset: 0x2000 + - Type: PT_LOAD + Flags: [ PF_W, PF_R ] + FirstSec: .init_array + LastSec: .bss + VAddr: 0x3E10 + Align: 0x1000 + Offset: 0x2E10 + - Type: PT_DYNAMIC + Flags: [ PF_W, PF_R ] + FirstSec: .dynamic + LastSec: .dynamic + VAddr: 0x3E20 + Align: 0x8 + Offset: 0x2E20 + - Type: PT_NOTE + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .note.gnu.property + VAddr: 0x2A8 + Align: 0x8 + Offset: 0x2A8 + - Type: PT_NOTE + Flags: [ PF_R ] + 
FirstSec: .note.gnu.build-id + LastSec: .note.gnu.build-id + VAddr: 0x2C8 + Align: 0x4 + Offset: 0x2C8 + - Type: PT_GNU_PROPERTY + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .note.gnu.property + VAddr: 0x2A8 + Align: 0x8 + Offset: 0x2A8 + - Type: PT_GNU_EH_FRAME + Flags: [ PF_R ] + FirstSec: .eh_frame_hdr + LastSec: .eh_frame_hdr + VAddr: 0x2000 + Align: 0x4 + Offset: 0x2000 + - Type: PT_GNU_STACK + Flags: [ PF_W, PF_R ] + Align: 0x10 + Offset: 0x0 + - Type: PT_GNU_RELRO + Flags: [ PF_R ] + FirstSec: .init_array + LastSec: .got + VAddr: 0x3E10 + Offset: 0x2E10 +Sections: + - Name: .note.gnu.property + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Address: 0x2A8 + AddressAlign: 0x8 + Notes: + - Name: GNU + Desc: 020000C0040000000300000000000000 + Type: NT_GNU_PROPERTY_TYPE_0 + - Name: .note.gnu.build-id + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Address: 0x2C8 + AddressAlign: 0x4 + Notes: + - Name: GNU + Desc: 0318D63E46BF31CEFF90D5C7F0475D9F78676EC8 + Type: NT_PRPSINFO + - Name: .gnu.hash + Type: SHT_GNU_HASH + Flags: [ SHF_ALLOC ] + Address: 0x2F0 + Link: .dynsym + AddressAlign: 0x8 + Header: + SymNdx: 0x8 + Shift2: 0x6 + BloomFilter: [ 0x400000200000 ] + HashBuckets: [ 0x0, 0x8 ] + HashValues: [ 0x7C9DCB95 ] + - Name: .dynsym + Type: SHT_DYNSYM + Flags: [ SHF_ALLOC ] + Address: 0x318 + Link: .dynstr + AddressAlign: 0x8 + - Name: .dynstr + Type: SHT_STRTAB + Flags: [ SHF_ALLOC ] + Address: 0x3F0 + AddressAlign: 0x1 + Content: "6C6962412E736F006C6962422E736F006C69625A2E736F00244F524947494E2F2E2E2F413A244F524947494E2F2E2E2F423A244F524947494E2F2E2E2F5A" + - Name: .rela.dyn + Type: SHT_RELA + Flags: [ SHF_ALLOC ] + Address: 0x498 + Link: .dynsym + AddressAlign: 0x8 + Relocations: + - Offset: 0x3E10 + Type: R_X86_64_RELATIVE + Addend: 4432 + - Offset: 0x3E18 + Type: R_X86_64_RELATIVE + Addend: 4368 + - Offset: 0x4030 + Type: R_X86_64_RELATIVE + Addend: 16432 + - Offset: 0x3FE0 + Symbol: __cxa_finalize + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FE8 + Symbol: 
_ITM_registerTMCloneTable + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FF0 + Symbol: _ITM_deregisterTMCloneTable + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FF8 + Symbol: __gmon_start__ + Type: R_X86_64_GLOB_DAT + - Name: .rela.plt + Type: SHT_RELA + Flags: [ SHF_ALLOC, SHF_INFO_LINK ] + Address: 0x540 + Link: .dynsym + AddressAlign: 0x8 + Info: .got.plt + Relocations: + - Offset: 0x4018 + Symbol: sayA + Type: R_X86_64_JUMP_SLOT + - Offset: 0x4020 + Symbol: sayB + Type: R_X86_64_JUMP_SLOT + - Offset: 0x4028 + Symbol: sayZ + Type: R_X86_64_JUMP_SLOT + - Name: .init + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1000 + AddressAlign: 0x4 + Offset: 0x1000 + Content: F30F1EFA4883EC08488B05E92F00004885C07402FFD04883C408C3 + - Name: .plt + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1020 + AddressAlign: 0x10 + EntSize: 0x10 + Content: FF35E22F0000F2FF25E32F00000F1F00F30F1EFA6800000000F2E9E1FFFFFF90F30F1EFA6801000000F2E9D1FFFFFF90F30F1EFA6802000000F2E9C1FFFFFF90 + - Name: .plt.got + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1060 + AddressAlign: 0x10 + EntSize: 0x10 + Content: F30F1EFAF2FF25752F00000F1F440000 + - Name: .plt.sec + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1070 + AddressAlign: 0x10 + EntSize: 0x10 + Content: F30F1EFAF2FF259D2F00000F1F440000F30F1EFAF2FF25952F00000F1F440000F30F1EFAF2FF258D2F00000F1F440000 + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x10A0 + AddressAlign: 0x10 + Content: 
488D3D912F0000488D058A2F00004839F87415488B05362F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D612F0000488D355A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05ED2E00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D1D2F000000752B5548833DBA2E0000004889E5740C488B3DFE2E0000E829FFFFFFE864FFFFFFC605F52E0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5B800000000E805FFFFFFB800000000E80BFFFFFFB800000000E811FFFFFF905DC3 + - Name: .fini + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1184 + AddressAlign: 0x4 + Content: F30F1EFA4883EC084883C408C3 + - Name: .eh_frame_hdr + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x2000 + AddressAlign: 0x4 + Offset: 0x2000 + Content: 011B033B2C0000000400000020F0FFFF4800000060F0FFFF7000000070F0FFFF8800000059F1FFFFA0000000 + - Name: .eh_frame + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x2030 + AddressAlign: 0x8 + Content: 1400000000000000017A5200017810011B0C070890010000240000001C000000D0EFFFFF40000000000E10460E184A0F0B770880003F1A3A2A332422000000001400000044000000E8EFFFFF100000000000000000000000140000005C000000E0EFFFFF3000000000000000000000001C00000074000000B1F0FFFF2900000000450E108602430D06600C070800000000000000 + - Name: .init_array + Type: SHT_INIT_ARRAY + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3E10 + AddressAlign: 0x8 + EntSize: 0x8 + Offset: 0x2E10 + Content: '5011000000000000' + - Name: .fini_array + Type: SHT_FINI_ARRAY + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3E18 + AddressAlign: 0x8 + EntSize: 0x8 + Content: '1011000000000000' + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3E20 + Link: .dynstr + AddressAlign: 0x8 + Entries: + - Tag: DT_NEEDED + Value: 0x0 + - Tag: DT_NEEDED + Value: 0x8 + - Tag: DT_NEEDED + Value: 0x10 + - Tag: DT_RUNPATH + Value: 0x18 + - Tag: DT_INIT + Value: 0x1000 + - Tag: DT_FINI + Value: 0x1184 + - Tag: DT_INIT_ARRAY + Value: 0x3E10 + - Tag: DT_INIT_ARRAYSZ + Value: 0x8 + - Tag: 
DT_FINI_ARRAY + Value: 0x3E18 + - Tag: DT_FINI_ARRAYSZ + Value: 0x8 + - Tag: DT_GNU_HASH + Value: 0x2F0 + - Tag: DT_STRTAB + Value: 0x3F0 + - Tag: DT_SYMTAB + Value: 0x318 + - Tag: DT_STRSZ + Value: 0xA8 + - Tag: DT_SYMENT + Value: 0x18 + - Tag: DT_PLTGOT + Value: 0x4000 + - Tag: DT_PLTRELSZ + Value: 0x48 + - Tag: DT_PLTREL + Value: 0x7 + - Tag: DT_JMPREL + Value: 0x540 + - Tag: DT_RELA + Value: 0x498 + - Tag: DT_RELASZ + Value: 0xA8 + - Tag: DT_RELAENT + Value: 0x18 + - Tag: DT_RELACOUNT + Value: 0x3 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Name: .got + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3FE0 + AddressAlign: 0x8 + EntSize: 0x8 + Content: '0000000000000000000000000000000000000000000000000000000000000000' + - Name: .got.plt + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4000 + AddressAlign: 0x8 + EntSize: 0x8 + Content: '203E00000000000000000000000000000000000000000000301000000000000040100000000000005010000000000000' + - Name: .data + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4030 + AddressAlign: 0x8 + Content: '3040000000000000' + - Name: .bss + Type: SHT_NOBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4038 + AddressAlign: 0x1 + Size: 0x8 + - Name: .comment + Type: SHT_PROGBITS + Flags: [ SHF_MERGE, SHF_STRINGS ] + AddressAlign: 0x1 + EntSize: 0x1 + Content: 4743433A20285562756E74752031312E342E302D317562756E7475317E32322E30342E32292031312E342E3000 +Symbols: + - Name: crtstuff.c + Type: STT_FILE + Index: SHN_ABS + - Name: deregister_tm_clones + Type: STT_FUNC + Section: .text + Value: 0x10A0 + - Name: register_tm_clones + Type: STT_FUNC + Section: .text + Value: 0x10D0 + - Name: __do_global_dtors_aux + Type: STT_FUNC + Section: .text + Value: 0x1110 + - Name: completed.0 + Type: STT_OBJECT + Section: .bss + Value: 0x4038 + Size: 0x1 + - Name: 
__do_global_dtors_aux_fini_array_entry + Type: STT_OBJECT + Section: .fini_array + Value: 0x3E18 + - Name: frame_dummy + Type: STT_FUNC + Section: .text + Value: 0x1150 + - Name: __frame_dummy_init_array_entry + Type: STT_OBJECT + Section: .init_array + Value: 0x3E10 + - Name: libC.c + Type: STT_FILE + Index: SHN_ABS + - Name: 'crtstuff.c (1)' + Type: STT_FILE + Index: SHN_ABS + - Name: __FRAME_END__ + Type: STT_OBJECT + Section: .eh_frame + Value: 0x20C0 + - Type: STT_FILE + Index: SHN_ABS + - Name: _DYNAMIC + Type: STT_OBJECT + Section: .dynamic + Value: 0x3E20 + - Name: __TMC_END__ + Type: STT_OBJECT + Section: .data + Value: 0x4038 + - Name: __dso_handle + Type: STT_OBJECT + Section: .data + Value: 0x4030 + - Name: _init + Type: STT_FUNC + Section: .init + Value: 0x1000 + - Name: __GNU_EH_FRAME_HDR + Section: .eh_frame_hdr + Value: 0x2000 + - Name: _fini + Type: STT_FUNC + Section: .fini + Value: 0x1184 + - Name: _GLOBAL_OFFSET_TABLE_ + Type: STT_OBJECT + Section: .got.plt + Value: 0x4000 + - Name: __cxa_finalize + Binding: STB_WEAK + - Name: sayC + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1159 + Size: 0x29 + - Name: _ITM_registerTMCloneTable + Binding: STB_WEAK + - Name: _ITM_deregisterTMCloneTable + Binding: STB_WEAK + - Name: sayA + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: sayB + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: sayZ + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: __gmon_start__ + Binding: STB_WEAK +DynamicSymbols: + - Name: __cxa_finalize + Binding: STB_WEAK + - Name: _ITM_registerTMCloneTable + Binding: STB_WEAK + - Name: _ITM_deregisterTMCloneTable + Binding: STB_WEAK + - Name: sayA + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: sayB + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: sayZ + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: __gmon_start__ + Binding: STB_WEAK + - Name: sayC + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1159 + Size: 0x29 +... 
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_macho.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_macho.yaml new file mode 100644 index 0000000000000..ba33483c5122f --- /dev/null +++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_macho.yaml @@ -0,0 +1,870 @@ +--- !fat-mach-o +FatHeader: + magic: 0xCAFEBABE + nfat_arch: 3 +FatArchs: + - cputype: 0x1000007 + cpusubtype: 0x3 + offset: 0x1000 + size: 8456 + align: 12 + - cputype: 0x100000C + cpusubtype: 0x0 + offset: 0x4000 + size: 33456 + align: 14 + - cputype: 0x100000C + cpusubtype: 0x80000002 + offset: 0x10000 + size: 33456 + align: 14 +Slices: + - !mach-o + FileHeader: + magic: 0xFEEDFACF + cputype: 0x1000007 + cpusubtype: 0x3 + filetype: 0x6 + ncmds: 20 + sizeofcmds: 1120 + flags: 0x100085 + reserved: 0x0 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 312 + segname: __TEXT + vmaddr: 0 + vmsize: 4096 + fileoff: 0 + filesize: 4096 + maxprot: 5 + initprot: 5 + nsects: 3 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0xF70 + size: 27 + offset: 0xF70 + align: 4 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 554889E5B000E811000000B000E810000000B000E80F0000005DC3 + - sectname: __stubs + segname: __TEXT + addr: 0xF8C + size: 18 + offset: 0xF8C + align: 1 + reloff: 0x0 + nreloc: 0 + flags: 0x80000408 + reserved1: 0x0 + reserved2: 0x6 + reserved3: 0x0 + content: FF256E000000FF2570000000FF2572000000 + - sectname: __unwind_info + segname: __TEXT + addr: 0xFA0 + size: 88 + offset: 0xFA0 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 010000001C000000000000001C000000000000001C00000002000000700F000040000000400000008B0F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000100000000 + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __DATA_CONST + vmaddr: 4096 + vmsize: 4096 + fileoff: 4096 + filesize: 4096 + maxprot: 3 + initprot: 3 
+ nsects: 1 + flags: 16 + Sections: + - sectname: __got + segname: __DATA_CONST + addr: 0x1000 + size: 24 + offset: 0x1000 + align: 3 + reloff: 0x0 + nreloc: 0 + flags: 0x6 + reserved1: 0x3 + reserved2: 0x0 + reserved3: 0x0 + content: '000000000000108001000000000010800200000000000080' + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 8192 + vmsize: 4096 + fileoff: 8192 + filesize: 264 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_ID_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 1 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libC.dylib' + ZeroPadBytes: 7 + - cmd: LC_DYLD_CHAINED_FIXUPS + cmdsize: 16 + dataoff: 8192 + datasize: 112 + - cmd: LC_DYLD_EXPORTS_TRIE + cmdsize: 16 + dataoff: 8304 + datasize: 24 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 8336 + nsyms: 4 + stroff: 8424 + strsize: 32 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 1 + iundefsym: 1 + nundefsym: 3 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 8400 + nindirectsyms: 6 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 + - cmd: LC_UUID + cmdsize: 24 + uuid: 2AA1F9E9-F250-366F-B382-51A91DE06BED + - cmd: LC_BUILD_VERSION + cmdsize: 32 + platform: 1 + minos: 983040 + sdk: 983552 + ntools: 1 + Tools: + - tool: 3 + version: 73074435 + - cmd: LC_SOURCE_VERSION + cmdsize: 16 + version: 0 + - cmd: LC_LOAD_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 2 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libA.dylib' + ZeroPadBytes: 7 + - cmd: LC_LOAD_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 2 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libB.dylib' + ZeroPadBytes: 7 + - cmd: LC_LOAD_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 2 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libZ.dylib' + ZeroPadBytes: 7 + - cmd: LC_LOAD_DYLIB + cmdsize: 56 + dylib: + 
name: 24 + timestamp: 2 + current_version: 88539136 + compatibility_version: 65536 + Content: '/usr/lib/libSystem.B.dylib' + ZeroPadBytes: 6 + - cmd: LC_RPATH + cmdsize: 32 + path: 12 + Content: '@loader_path/../A' + ZeroPadBytes: 3 + - cmd: LC_RPATH + cmdsize: 32 + path: 12 + Content: '@loader_path/../B' + ZeroPadBytes: 3 + - cmd: LC_RPATH + cmdsize: 32 + path: 12 + Content: '@loader_path/../Z' + ZeroPadBytes: 3 + - cmd: LC_FUNCTION_STARTS + cmdsize: 16 + dataoff: 8328 + datasize: 8 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 8336 + datasize: 0 + LinkEditData: + ExportTrie: + TerminalSize: 0 + NodeOffset: 0 + Name: '' + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + Children: + - TerminalSize: 3 + NodeOffset: 13 + Name: _sayC + Flags: 0x0 + Address: 0xF70 + Other: 0x0 + ImportName: '' + NameList: + - n_strx: 2 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 3952 + - n_strx: 8 + n_type: 0x1 + n_sect: 0 + n_desc: 256 + n_value: 0 + - n_strx: 14 + n_type: 0x1 + n_sect: 0 + n_desc: 512 + n_value: 0 + - n_strx: 20 + n_type: 0x1 + n_sect: 0 + n_desc: 768 + n_value: 0 + StringTable: + - ' ' + - _sayC + - _sayA + - _sayB + - _sayZ + - '' + - '' + - '' + - '' + - '' + - '' + IndirectSymbols: [ 0x1, 0x2, 0x3, 0x1, 0x2, 0x3 ] + FunctionStarts: [ 0xF70 ] + ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, + 0x0, 0x0, 0x0, 0x58, 0x0, 0x0, 0x0, 0x3, 0x0, + 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0, + 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, + 0x0, 0x2, 0xE, 0x0, 0x0, 0x3, 0x1A, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x73, 0x61, 0x79, + 0x41, 0x0, 0x5F, 0x73, 0x61, 0x79, 0x42, 0x0, + 0x5F, 0x73, 0x61, 0x79, 0x5A, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0 ] + - !mach-o + FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + 
filetype: 0x6 + ncmds: 21 + sizeofcmds: 1136 + flags: 0x100085 + reserved: 0x0 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 312 + segname: __TEXT + vmaddr: 0 + vmsize: 16384 + fileoff: 0 + filesize: 16384 + maxprot: 5 + initprot: 5 + nsects: 3 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x3F68 + size: 28 + offset: 0x3F68 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: FD7BBFA9FD030091050000940700009409000094FD7BC1A8C0035FD6 + - sectname: __stubs + segname: __TEXT + addr: 0x3F84 + size: 36 + offset: 0x3F84 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000408 + reserved1: 0x0 + reserved2: 0xC + reserved3: 0x0 + content: 100000B0100240F900021FD6100000B0100640F900021FD6100000B0100A40F900021FD6 + - sectname: __unwind_info + segname: __TEXT + addr: 0x3FA8 + size: 88 + offset: 0x3FA8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 010000001C000000000000001C000000000000001C00000002000000683F00004000000040000000843F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000 + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __DATA_CONST + vmaddr: 16384 + vmsize: 16384 + fileoff: 16384 + filesize: 16384 + maxprot: 3 + initprot: 3 + nsects: 1 + flags: 16 + Sections: + - sectname: __got + segname: __DATA_CONST + addr: 0x4000 + size: 24 + offset: 0x4000 + align: 3 + reloff: 0x0 + nreloc: 0 + flags: 0x6 + reserved1: 0x3 + reserved2: 0x0 + reserved3: 0x0 + content: '000000000000108001000000000010800200000000000080' + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 32768 + vmsize: 16384 + fileoff: 32768 + filesize: 688 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_ID_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 1 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libC.dylib' + ZeroPadBytes: 7 + - cmd: LC_DYLD_CHAINED_FIXUPS + 
cmdsize: 16 + dataoff: 32768 + datasize: 112 + - cmd: LC_DYLD_EXPORTS_TRIE + cmdsize: 16 + dataoff: 32880 + datasize: 24 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 32912 + nsyms: 4 + stroff: 33000 + strsize: 32 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 1 + iundefsym: 1 + nundefsym: 3 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 32976 + nindirectsyms: 6 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 + - cmd: LC_UUID + cmdsize: 24 + uuid: 02B69690-925D-35EE-A8AB-6D99813D2A16 + - cmd: LC_BUILD_VERSION + cmdsize: 32 + platform: 1 + minos: 983040 + sdk: 983552 + ntools: 1 + Tools: + - tool: 3 + version: 73074435 + - cmd: LC_SOURCE_VERSION + cmdsize: 16 + version: 0 + - cmd: LC_LOAD_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 2 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libA.dylib' + ZeroPadBytes: 7 + - cmd: LC_LOAD_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 2 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libB.dylib' + ZeroPadBytes: 7 + - cmd: LC_LOAD_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 2 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libZ.dylib' + ZeroPadBytes: 7 + - cmd: LC_LOAD_DYLIB + cmdsize: 56 + dylib: + name: 24 + timestamp: 2 + current_version: 88539136 + compatibility_version: 65536 + Content: '/usr/lib/libSystem.B.dylib' + ZeroPadBytes: 6 + - cmd: LC_RPATH + cmdsize: 32 + path: 12 + Content: '@loader_path/../A' + ZeroPadBytes: 3 + - cmd: LC_RPATH + cmdsize: 32 + path: 12 + Content: '@loader_path/../B' + ZeroPadBytes: 3 + - cmd: LC_RPATH + cmdsize: 32 + path: 12 + Content: '@loader_path/../Z' + ZeroPadBytes: 3 + - cmd: LC_FUNCTION_STARTS + cmdsize: 16 + dataoff: 32904 + datasize: 8 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 32912 + datasize: 0 + - cmd: LC_CODE_SIGNATURE + cmdsize: 16 + dataoff: 33040 + datasize: 416 + LinkEditData: + 
ExportTrie: + TerminalSize: 0 + NodeOffset: 0 + Name: '' + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + Children: + - TerminalSize: 3 + NodeOffset: 13 + Name: _sayC + Flags: 0x0 + Address: 0x3F68 + Other: 0x0 + ImportName: '' + NameList: + - n_strx: 2 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 16232 + - n_strx: 8 + n_type: 0x1 + n_sect: 0 + n_desc: 256 + n_value: 0 + - n_strx: 14 + n_type: 0x1 + n_sect: 0 + n_desc: 512 + n_value: 0 + - n_strx: 20 + n_type: 0x1 + n_sect: 0 + n_desc: 768 + n_value: 0 + StringTable: + - ' ' + - _sayC + - _sayA + - _sayB + - _sayZ + - '' + - '' + - '' + - '' + - '' + - '' + IndirectSymbols: [ 0x1, 0x2, 0x3, 0x1, 0x2, 0x3 ] + FunctionStarts: [ 0x3F68 ] + ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, + 0x0, 0x0, 0x0, 0x58, 0x0, 0x0, 0x0, 0x3, 0x0, + 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0x6, 0x0, + 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, + 0x0, 0x2, 0xE, 0x0, 0x0, 0x3, 0x1A, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x73, 0x61, 0x79, + 0x41, 0x0, 0x5F, 0x73, 0x61, 0x79, 0x42, 0x0, + 0x5F, 0x73, 0x61, 0x79, 0x5A, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0 ] + - !mach-o + FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x80000002 + filetype: 0x6 + ncmds: 21 + sizeofcmds: 1136 + flags: 0x100085 + reserved: 0x0 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 312 + segname: __TEXT + vmaddr: 0 + vmsize: 16384 + fileoff: 0 + filesize: 16384 + maxprot: 5 + initprot: 5 + nsects: 3 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x3F58 + size: 32 + offset: 0x3F58 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 7F2303D5FD7BBFA9FD03009105000094080000940B000094FD7BC1A8FF0F5FD6 + - sectname: __auth_stubs + 
segname: __TEXT + addr: 0x3F78 + size: 48 + offset: 0x3F78 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000408 + reserved1: 0x0 + reserved2: 0x10 + reserved3: 0x0 + content: 110000B031020091300240F9110A1FD7110000B031220091300240F9110A1FD7110000B031420091300240F9110A1FD7 + - sectname: __unwind_info + segname: __TEXT + addr: 0x3FA8 + size: 88 + offset: 0x3FA8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 010000001C000000000000001C000000000000001C00000002000000583F00004000000040000000783F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000 + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __DATA_CONST + vmaddr: 16384 + vmsize: 16384 + fileoff: 16384 + filesize: 16384 + maxprot: 3 + initprot: 3 + nsects: 1 + flags: 16 + Sections: + - sectname: __auth_got + segname: __DATA_CONST + addr: 0x4000 + size: 24 + offset: 0x4000 + align: 3 + reloff: 0x0 + nreloc: 0 + flags: 0x6 + reserved1: 0x3 + reserved2: 0x0 + reserved3: 0x0 + content: 00000000000009C001000000000009C002000000000001C0 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 32768 + vmsize: 16384 + fileoff: 32768 + filesize: 688 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_ID_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 1 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libC.dylib' + ZeroPadBytes: 7 + - cmd: LC_DYLD_CHAINED_FIXUPS + cmdsize: 16 + dataoff: 32768 + datasize: 112 + - cmd: LC_DYLD_EXPORTS_TRIE + cmdsize: 16 + dataoff: 32880 + datasize: 24 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 32912 + nsyms: 4 + stroff: 33000 + strsize: 32 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 1 + iundefsym: 1 + nundefsym: 3 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 32976 + nindirectsyms: 6 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 + - 
cmd: LC_UUID + cmdsize: 24 + uuid: F54076AA-8888-3DED-8BDF-BC7FB3E6FE8A + - cmd: LC_BUILD_VERSION + cmdsize: 32 + platform: 1 + minos: 983040 + sdk: 983552 + ntools: 1 + Tools: + - tool: 3 + version: 73074435 + - cmd: LC_SOURCE_VERSION + cmdsize: 16 + version: 0 + - cmd: LC_LOAD_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 2 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libA.dylib' + ZeroPadBytes: 7 + - cmd: LC_LOAD_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 2 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libB.dylib' + ZeroPadBytes: 7 + - cmd: LC_LOAD_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 2 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libZ.dylib' + ZeroPadBytes: 7 + - cmd: LC_LOAD_DYLIB + cmdsize: 56 + dylib: + name: 24 + timestamp: 2 + current_version: 88539136 + compatibility_version: 65536 + Content: '/usr/lib/libSystem.B.dylib' + ZeroPadBytes: 6 + - cmd: LC_RPATH + cmdsize: 32 + path: 12 + Content: '@loader_path/../A' + ZeroPadBytes: 3 + - cmd: LC_RPATH + cmdsize: 32 + path: 12 + Content: '@loader_path/../B' + ZeroPadBytes: 3 + - cmd: LC_RPATH + cmdsize: 32 + path: 12 + Content: '@loader_path/../Z' + ZeroPadBytes: 3 + - cmd: LC_FUNCTION_STARTS + cmdsize: 16 + dataoff: 32904 + datasize: 8 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 32912 + datasize: 0 + - cmd: LC_CODE_SIGNATURE + cmdsize: 16 + dataoff: 33040 + datasize: 416 + LinkEditData: + ExportTrie: + TerminalSize: 0 + NodeOffset: 0 + Name: '' + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + Children: + - TerminalSize: 3 + NodeOffset: 13 + Name: _sayC + Flags: 0x0 + Address: 0x3F58 + Other: 0x0 + ImportName: '' + NameList: + - n_strx: 2 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 16216 + - n_strx: 8 + n_type: 0x1 + n_sect: 0 + n_desc: 256 + n_value: 0 + - n_strx: 14 + n_type: 0x1 + n_sect: 0 + n_desc: 512 + n_value: 0 + - n_strx: 20 + n_type: 0x1 + n_sect: 0 + n_desc: 768 + n_value: 0 + 
StringTable: + - ' ' + - _sayC + - _sayA + - _sayB + - _sayZ + - '' + - '' + - '' + - '' + - '' + - '' + IndirectSymbols: [ 0x1, 0x2, 0x3, 0x1, 0x2, 0x3 ] + FunctionStarts: [ 0x3F58 ] + ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, + 0x0, 0x0, 0x0, 0x58, 0x0, 0x0, 0x0, 0x3, 0x0, + 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0xC, 0x0, + 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, + 0x0, 0x2, 0xE, 0x0, 0x0, 0x3, 0x1A, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x73, 0x61, 0x79, + 0x41, 0x0, 0x5F, 0x73, 0x61, 0x79, 0x42, 0x0, + 0x5F, 0x73, 0x61, 0x79, 0x5A, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0 ] +... diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_linux.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_linux.yaml new file mode 100644 index 0000000000000..5561f29a93602 --- /dev/null +++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_linux.yaml @@ -0,0 +1,460 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .rela.plt + Align: 0x1000 + Offset: 0x0 + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + FirstSec: .init + LastSec: .fini + VAddr: 0x1000 + Align: 0x1000 + Offset: 0x1000 + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .rodata + LastSec: .eh_frame + VAddr: 0x2000 + Align: 0x1000 + Offset: 0x2000 + - Type: PT_LOAD + Flags: [ PF_W, PF_R ] + FirstSec: .init_array + LastSec: .bss + VAddr: 0x3E10 + Align: 0x1000 + Offset: 0x2E10 + - Type: PT_DYNAMIC + Flags: [ PF_W, PF_R ] + FirstSec: .dynamic + LastSec: .dynamic + VAddr: 0x3E20 + Align: 0x8 + Offset: 0x2E20 + - Type: PT_NOTE + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .note.gnu.property + VAddr: 0x2A8 + Align: 0x8 + Offset: 0x2A8 + - Type: 
PT_NOTE + Flags: [ PF_R ] + FirstSec: .note.gnu.build-id + LastSec: .note.gnu.build-id + VAddr: 0x2C8 + Align: 0x4 + Offset: 0x2C8 + - Type: PT_GNU_PROPERTY + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .note.gnu.property + VAddr: 0x2A8 + Align: 0x8 + Offset: 0x2A8 + - Type: PT_GNU_EH_FRAME + Flags: [ PF_R ] + FirstSec: .eh_frame_hdr + LastSec: .eh_frame_hdr + VAddr: 0x2010 + Align: 0x4 + Offset: 0x2010 + - Type: PT_GNU_STACK + Flags: [ PF_W, PF_R ] + Align: 0x10 + Offset: 0x0 + - Type: PT_GNU_RELRO + Flags: [ PF_R ] + FirstSec: .init_array + LastSec: .got + VAddr: 0x3E10 + Offset: 0x2E10 +Sections: + - Name: .note.gnu.property + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Address: 0x2A8 + AddressAlign: 0x8 + Notes: + - Name: GNU + Desc: 020000C0040000000300000000000000 + Type: NT_GNU_PROPERTY_TYPE_0 + - Name: .note.gnu.build-id + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Address: 0x2C8 + AddressAlign: 0x4 + Notes: + - Name: GNU + Desc: 640A4A3AC0DF6BA3DAC3B51CCD727245117E0B30 + Type: NT_PRPSINFO + - Name: .gnu.hash + Type: SHT_GNU_HASH + Flags: [ SHF_ALLOC ] + Address: 0x2F0 + Link: .dynsym + AddressAlign: 0x8 + Header: + SymNdx: 0x6 + Shift2: 0x6 + BloomFilter: [ 0x500000000000 ] + HashBuckets: [ 0x6, 0x0 ] + HashValues: [ 0x7C9DCBAD ] + - Name: .dynsym + Type: SHT_DYNSYM + Flags: [ SHF_ALLOC ] + Address: 0x318 + Link: .dynstr + AddressAlign: 0x8 + - Name: .dynstr + Type: SHT_STRTAB + Flags: [ SHF_ALLOC ] + Address: 0x3C0 + AddressAlign: 0x1 + - Name: .gnu.version + Type: SHT_GNU_versym + Flags: [ SHF_ALLOC ] + Address: 0x436 + Link: .dynsym + AddressAlign: 0x2 + Entries: [ 0, 1, 2, 1, 1, 2, 1 ] + - Name: .gnu.version_r + Type: SHT_GNU_verneed + Flags: [ SHF_ALLOC ] + Address: 0x448 + Link: .dynstr + AddressAlign: 0x8 + Dependencies: + - Version: 1 + File: libc.so.6 + Entries: + - Name: GLIBC_2.2.5 + Hash: 157882997 + Flags: 0 + Other: 2 + - Name: .rela.dyn + Type: SHT_RELA + Flags: [ SHF_ALLOC ] + Address: 0x468 + Link: .dynsym + AddressAlign: 0x8 + 
Relocations: + - Offset: 0x3E10 + Type: R_X86_64_RELATIVE + Addend: 4368 + - Offset: 0x3E18 + Type: R_X86_64_RELATIVE + Addend: 4304 + - Offset: 0x4020 + Type: R_X86_64_RELATIVE + Addend: 16416 + - Offset: 0x3FE0 + Symbol: _ITM_deregisterTMCloneTable + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FE8 + Symbol: __gmon_start__ + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FF0 + Symbol: _ITM_registerTMCloneTable + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FF8 + Symbol: __cxa_finalize + Type: R_X86_64_GLOB_DAT + - Name: .rela.plt + Type: SHT_RELA + Flags: [ SHF_ALLOC, SHF_INFO_LINK ] + Address: 0x510 + Link: .dynsym + AddressAlign: 0x8 + Info: .got.plt + Relocations: + - Offset: 0x4018 + Symbol: puts + Type: R_X86_64_JUMP_SLOT + - Name: .init + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1000 + AddressAlign: 0x4 + Offset: 0x1000 + Content: F30F1EFA4883EC08488B05D92F00004885C07402FFD04883C408C3 + - Name: .plt + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1020 + AddressAlign: 0x10 + EntSize: 0x10 + Content: FF35E22F0000F2FF25E32F00000F1F00F30F1EFA6800000000F2E9E1FFFFFF90 + - Name: .plt.got + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1040 + AddressAlign: 0x10 + EntSize: 0x10 + Content: F30F1EFAF2FF25AD2F00000F1F440000 + - Name: .plt.sec + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1050 + AddressAlign: 0x10 + EntSize: 0x10 + Content: F30F1EFAF2FF25BD2F00000F1F440000 + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1060 + AddressAlign: 0x10 + Content: 
488D3DC12F0000488D05BA2F00004839F87415488B05662F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D912F0000488D358A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05352F00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D4D2F000000752B5548833D122F0000004889E5740C488B3D2E2F0000E849FFFFFFE864FFFFFFC605252F0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5488D05D80E00004889C7E820FFFFFF905DC3 + - Name: .fini + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1134 + AddressAlign: 0x4 + Content: F30F1EFA4883EC084883C408C3 + - Name: .rodata + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x2000 + AddressAlign: 0x1 + Offset: 0x2000 + Content: 48656C6C6F2066726F6D205A00 + - Name: .eh_frame_hdr + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x2010 + AddressAlign: 0x4 + Content: 011B033B2C0000000400000010F0FFFF4800000030F0FFFF7000000040F0FFFF8800000009F1FFFFA0000000 + - Name: .eh_frame + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x2040 + AddressAlign: 0x8 + Content: 1400000000000000017A5200017810011B0C070890010000240000001C000000C0EFFFFF20000000000E10460E184A0F0B770880003F1A3A2A332422000000001400000044000000B8EFFFFF100000000000000000000000140000005C000000B0EFFFFF1000000000000000000000001C0000007400000061F0FFFF1A00000000450E108602430D06510C070800000000000000 + - Name: .init_array + Type: SHT_INIT_ARRAY + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3E10 + AddressAlign: 0x8 + EntSize: 0x8 + Offset: 0x2E10 + Content: '1011000000000000' + - Name: .fini_array + Type: SHT_FINI_ARRAY + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3E18 + AddressAlign: 0x8 + EntSize: 0x8 + Content: D010000000000000 + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3E20 + Link: .dynstr + AddressAlign: 0x8 + Entries: + - Tag: DT_NEEDED + Value: 0x5F + - Tag: DT_INIT + Value: 0x1000 + - Tag: DT_FINI + Value: 0x1134 + - Tag: DT_INIT_ARRAY + Value: 0x3E10 + - Tag: DT_INIT_ARRAYSZ + Value: 0x8 + 
- Tag: DT_FINI_ARRAY + Value: 0x3E18 + - Tag: DT_FINI_ARRAYSZ + Value: 0x8 + - Tag: DT_GNU_HASH + Value: 0x2F0 + - Tag: DT_STRTAB + Value: 0x3C0 + - Tag: DT_SYMTAB + Value: 0x318 + - Tag: DT_STRSZ + Value: 0x75 + - Tag: DT_SYMENT + Value: 0x18 + - Tag: DT_PLTGOT + Value: 0x4000 + - Tag: DT_PLTRELSZ + Value: 0x18 + - Tag: DT_PLTREL + Value: 0x7 + - Tag: DT_JMPREL + Value: 0x510 + - Tag: DT_RELA + Value: 0x468 + - Tag: DT_RELASZ + Value: 0xA8 + - Tag: DT_RELAENT + Value: 0x18 + - Tag: DT_VERNEED + Value: 0x448 + - Tag: DT_VERNEEDNUM + Value: 0x1 + - Tag: DT_VERSYM + Value: 0x436 + - Tag: DT_RELACOUNT + Value: 0x3 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Name: .got + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3FE0 + AddressAlign: 0x8 + EntSize: 0x8 + Content: '0000000000000000000000000000000000000000000000000000000000000000' + - Name: .got.plt + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4000 + AddressAlign: 0x8 + EntSize: 0x8 + Content: '203E000000000000000000000000000000000000000000003010000000000000' + - Name: .data + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4020 + AddressAlign: 0x8 + Content: '2040000000000000' + - Name: .bss + Type: SHT_NOBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4028 + AddressAlign: 0x1 + Size: 0x8 + - Name: .comment + Type: SHT_PROGBITS + Flags: [ SHF_MERGE, SHF_STRINGS ] + AddressAlign: 0x1 + EntSize: 0x1 + Content: 4743433A20285562756E74752031312E342E302D317562756E7475317E32322E30342E32292031312E342E3000 +Symbols: + - Name: crtstuff.c + Type: STT_FILE + Index: SHN_ABS + - Name: deregister_tm_clones + Type: STT_FUNC + Section: .text + Value: 0x1060 + - Name: register_tm_clones + Type: STT_FUNC + Section: .text + Value: 0x1090 + - Name: __do_global_dtors_aux + Type: STT_FUNC + Section: .text + Value: 0x10D0 + - Name: completed.0 + Type: STT_OBJECT + 
Section: .bss + Value: 0x4028 + Size: 0x1 + - Name: __do_global_dtors_aux_fini_array_entry + Type: STT_OBJECT + Section: .fini_array + Value: 0x3E18 + - Name: frame_dummy + Type: STT_FUNC + Section: .text + Value: 0x1110 + - Name: __frame_dummy_init_array_entry + Type: STT_OBJECT + Section: .init_array + Value: 0x3E10 + - Name: libZ.c + Type: STT_FILE + Index: SHN_ABS + - Name: 'crtstuff.c (1)' + Type: STT_FILE + Index: SHN_ABS + - Name: __FRAME_END__ + Type: STT_OBJECT + Section: .eh_frame + Value: 0x20D0 + - Type: STT_FILE + Index: SHN_ABS + - Name: _fini + Type: STT_FUNC + Section: .fini + Value: 0x1134 + - Name: __dso_handle + Type: STT_OBJECT + Section: .data + Value: 0x4020 + - Name: _DYNAMIC + Type: STT_OBJECT + Section: .dynamic + Value: 0x3E20 + - Name: __GNU_EH_FRAME_HDR + Section: .eh_frame_hdr + Value: 0x2010 + - Name: __TMC_END__ + Type: STT_OBJECT + Section: .data + Value: 0x4028 + - Name: _GLOBAL_OFFSET_TABLE_ + Type: STT_OBJECT + Section: .got.plt + Value: 0x4000 + - Name: _init + Type: STT_FUNC + Section: .init + Value: 0x1000 + - Name: _ITM_deregisterTMCloneTable + Binding: STB_WEAK + - Name: 'puts@GLIBC_2.2.5' + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: __gmon_start__ + Binding: STB_WEAK + - Name: sayZ + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1119 + Size: 0x1A + - Name: _ITM_registerTMCloneTable + Binding: STB_WEAK + - Name: '__cxa_finalize@GLIBC_2.2.5' + Type: STT_FUNC + Binding: STB_WEAK +DynamicSymbols: + - Name: _ITM_deregisterTMCloneTable + Binding: STB_WEAK + - Name: puts + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: __gmon_start__ + Binding: STB_WEAK + - Name: _ITM_registerTMCloneTable + Binding: STB_WEAK + - Name: __cxa_finalize + Type: STT_FUNC + Binding: STB_WEAK + - Name: sayZ + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1119 + Size: 0x1A +... 
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_macho.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_macho.yaml new file mode 100644 index 0000000000000..c0c18265ab667 --- /dev/null +++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_macho.yaml @@ -0,0 +1,723 @@ +--- !fat-mach-o +FatHeader: + magic: 0xCAFEBABE + nfat_arch: 3 +FatArchs: + - cputype: 0x1000007 + cpusubtype: 0x3 + offset: 0x1000 + size: 8376 + align: 12 + - cputype: 0x100000C + cpusubtype: 0x0 + offset: 0x4000 + size: 33376 + align: 14 + - cputype: 0x100000C + cpusubtype: 0x80000002 + offset: 0x10000 + size: 33376 + align: 14 +Slices: + - !mach-o + FileHeader: + magic: 0xFEEDFACF + cputype: 0x1000007 + cpusubtype: 0x3 + filetype: 0x6 + ncmds: 14 + sizeofcmds: 960 + flags: 0x100085 + reserved: 0x0 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 392 + segname: __TEXT + vmaddr: 0 + vmsize: 4096 + fileoff: 0 + filesize: 4096 + maxprot: 5 + initprot: 5 + nsects: 4 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0xF80 + size: 20 + offset: 0xF80 + align: 4 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 554889E5488D3D0F000000B000E8020000005DC3 + - sectname: __stubs + segname: __TEXT + addr: 0xF94 + size: 6 + offset: 0xF94 + align: 1 + reloff: 0x0 + nreloc: 0 + flags: 0x80000408 + reserved1: 0x0 + reserved2: 0x6 + reserved3: 0x0 + content: FF2566000000 + - sectname: __cstring + segname: __TEXT + addr: 0xF9A + size: 14 + offset: 0xF9A + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 48656C6C6F2066726F6D205A0A00 + - sectname: __unwind_info + segname: __TEXT + addr: 0xFA8 + size: 88 + offset: 0xFA8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 
010000001C000000000000001C000000000000001C00000002000000800F00004000000040000000940F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000100000000 + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __DATA_CONST + vmaddr: 4096 + vmsize: 4096 + fileoff: 4096 + filesize: 4096 + maxprot: 3 + initprot: 3 + nsects: 1 + flags: 16 + Sections: + - sectname: __got + segname: __DATA_CONST + addr: 0x1000 + size: 8 + offset: 0x1000 + align: 3 + reloff: 0x0 + nreloc: 0 + flags: 0x6 + reserved1: 0x1 + reserved2: 0x0 + reserved3: 0x0 + content: '0000000000000080' + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 8192 + vmsize: 4096 + fileoff: 8192 + filesize: 184 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_ID_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 1 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libZ.dylib' + ZeroPadBytes: 7 + - cmd: LC_DYLD_CHAINED_FIXUPS + cmdsize: 16 + dataoff: 8192 + datasize: 96 + - cmd: LC_DYLD_EXPORTS_TRIE + cmdsize: 16 + dataoff: 8288 + datasize: 24 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 8320 + nsyms: 2 + stroff: 8360 + strsize: 16 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 1 + iundefsym: 1 + nundefsym: 1 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 8352 + nindirectsyms: 2 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 + - cmd: LC_UUID + cmdsize: 24 + uuid: 399E203C-FF9A-3B80-872C-85F3A759A78B + - cmd: LC_BUILD_VERSION + cmdsize: 32 + platform: 1 + minos: 983040 + sdk: 983552 + ntools: 1 + Tools: + - tool: 3 + version: 73074435 + - cmd: LC_SOURCE_VERSION + cmdsize: 16 + version: 0 + - cmd: LC_LOAD_DYLIB + cmdsize: 56 + dylib: + name: 24 + timestamp: 2 + current_version: 88539136 + compatibility_version: 65536 + Content: '/usr/lib/libSystem.B.dylib' + ZeroPadBytes: 6 + - cmd: LC_FUNCTION_STARTS + cmdsize: 16 + dataoff: 8312 + datasize: 8 + 
- cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 8320 + datasize: 0 + LinkEditData: + ExportTrie: + TerminalSize: 0 + NodeOffset: 0 + Name: '' + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + Children: + - TerminalSize: 3 + NodeOffset: 13 + Name: _sayZ + Flags: 0x0 + Address: 0xF80 + Other: 0x0 + ImportName: '' + NameList: + - n_strx: 2 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 3968 + - n_strx: 8 + n_type: 0x1 + n_sect: 0 + n_desc: 256 + n_value: 0 + StringTable: + - ' ' + - _sayZ + - _printf + IndirectSymbols: [ 0x1, 0x1 ] + FunctionStarts: [ 0xF80 ] + ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, + 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, + 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0, + 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, + 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0 ] + - !mach-o + FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x6 + ncmds: 15 + sizeofcmds: 976 + flags: 0x100085 + reserved: 0x0 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 392 + segname: __TEXT + vmaddr: 0 + vmsize: 16384 + fileoff: 0 + filesize: 16384 + maxprot: 5 + initprot: 5 + nsects: 4 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x3F70 + size: 28 + offset: 0x3F70 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8C0035FD6 + - sectname: __stubs + segname: __TEXT + addr: 0x3F8C + size: 12 + offset: 0x3F8C + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000408 + reserved1: 0x0 + reserved2: 0xC + reserved3: 0x0 + content: 100000B0100240F900021FD6 + - sectname: __cstring + segname: __TEXT + 
addr: 0x3F98 + size: 14 + offset: 0x3F98 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 48656C6C6F2066726F6D205A0A00 + - sectname: __unwind_info + segname: __TEXT + addr: 0x3FA8 + size: 88 + offset: 0x3FA8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 010000001C000000000000001C000000000000001C00000002000000703F000040000000400000008C3F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000 + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __DATA_CONST + vmaddr: 16384 + vmsize: 16384 + fileoff: 16384 + filesize: 16384 + maxprot: 3 + initprot: 3 + nsects: 1 + flags: 16 + Sections: + - sectname: __got + segname: __DATA_CONST + addr: 0x4000 + size: 8 + offset: 0x4000 + align: 3 + reloff: 0x0 + nreloc: 0 + flags: 0x6 + reserved1: 0x1 + reserved2: 0x0 + reserved3: 0x0 + content: '0000000000000080' + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 32768 + vmsize: 16384 + fileoff: 32768 + filesize: 608 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_ID_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 1 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libZ.dylib' + ZeroPadBytes: 7 + - cmd: LC_DYLD_CHAINED_FIXUPS + cmdsize: 16 + dataoff: 32768 + datasize: 96 + - cmd: LC_DYLD_EXPORTS_TRIE + cmdsize: 16 + dataoff: 32864 + datasize: 24 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 32896 + nsyms: 2 + stroff: 32936 + strsize: 16 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 1 + iundefsym: 1 + nundefsym: 1 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 32928 + nindirectsyms: 2 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 + - cmd: LC_UUID + cmdsize: 24 + uuid: 6E8E78AF-EDB2-3830-BE1E-013390302CC5 + - cmd: LC_BUILD_VERSION + cmdsize: 32 + platform: 1 + 
minos: 983040 + sdk: 983552 + ntools: 1 + Tools: + - tool: 3 + version: 73074435 + - cmd: LC_SOURCE_VERSION + cmdsize: 16 + version: 0 + - cmd: LC_LOAD_DYLIB + cmdsize: 56 + dylib: + name: 24 + timestamp: 2 + current_version: 88539136 + compatibility_version: 65536 + Content: '/usr/lib/libSystem.B.dylib' + ZeroPadBytes: 6 + - cmd: LC_FUNCTION_STARTS + cmdsize: 16 + dataoff: 32888 + datasize: 8 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 32896 + datasize: 0 + - cmd: LC_CODE_SIGNATURE + cmdsize: 16 + dataoff: 32960 + datasize: 416 + LinkEditData: + ExportTrie: + TerminalSize: 0 + NodeOffset: 0 + Name: '' + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + Children: + - TerminalSize: 3 + NodeOffset: 13 + Name: _sayZ + Flags: 0x0 + Address: 0x3F70 + Other: 0x0 + ImportName: '' + NameList: + - n_strx: 2 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 16240 + - n_strx: 8 + n_type: 0x1 + n_sect: 0 + n_desc: 256 + n_value: 0 + StringTable: + - ' ' + - _sayZ + - _printf + IndirectSymbols: [ 0x1, 0x1 ] + FunctionStarts: [ 0x3F70 ] + ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, + 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, + 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0x6, 0x0, + 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, + 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0 ] + - !mach-o + FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x80000002 + filetype: 0x6 + ncmds: 15 + sizeofcmds: 976 + flags: 0x100085 + reserved: 0x0 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 392 + segname: __TEXT + vmaddr: 0 + vmsize: 16384 + fileoff: 0 + filesize: 16384 + maxprot: 5 + initprot: 5 + nsects: 4 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 
0x3F68 + size: 32 + offset: 0x3F68 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 7F2303D5FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8FF0F5FD6 + - sectname: __auth_stubs + segname: __TEXT + addr: 0x3F88 + size: 16 + offset: 0x3F88 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000408 + reserved1: 0x0 + reserved2: 0x10 + reserved3: 0x0 + content: 110000B031020091300240F9110A1FD7 + - sectname: __cstring + segname: __TEXT + addr: 0x3F98 + size: 14 + offset: 0x3F98 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 48656C6C6F2066726F6D205A0A00 + - sectname: __unwind_info + segname: __TEXT + addr: 0x3FA8 + size: 88 + offset: 0x3FA8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 010000001C000000000000001C000000000000001C00000002000000683F00004000000040000000883F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000 + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __DATA_CONST + vmaddr: 16384 + vmsize: 16384 + fileoff: 16384 + filesize: 16384 + maxprot: 3 + initprot: 3 + nsects: 1 + flags: 16 + Sections: + - sectname: __auth_got + segname: __DATA_CONST + addr: 0x4000 + size: 8 + offset: 0x4000 + align: 3 + reloff: 0x0 + nreloc: 0 + flags: 0x6 + reserved1: 0x1 + reserved2: 0x0 + reserved3: 0x0 + content: 00000000000001C0 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 32768 + vmsize: 16384 + fileoff: 32768 + filesize: 608 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_ID_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 1 + current_version: 0 + compatibility_version: 0 + Content: '@rpath/libZ.dylib' + ZeroPadBytes: 7 + - cmd: LC_DYLD_CHAINED_FIXUPS + cmdsize: 16 + dataoff: 32768 + datasize: 96 + - cmd: LC_DYLD_EXPORTS_TRIE + cmdsize: 16 + dataoff: 32864 + datasize: 24 + - cmd: LC_SYMTAB + 
cmdsize: 24 + symoff: 32896 + nsyms: 2 + stroff: 32936 + strsize: 16 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 1 + iundefsym: 1 + nundefsym: 1 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 32928 + nindirectsyms: 2 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 + - cmd: LC_UUID + cmdsize: 24 + uuid: E74F368D-238F-31FA-BF40-FA2964FED986 + - cmd: LC_BUILD_VERSION + cmdsize: 32 + platform: 1 + minos: 983040 + sdk: 983552 + ntools: 1 + Tools: + - tool: 3 + version: 73074435 + - cmd: LC_SOURCE_VERSION + cmdsize: 16 + version: 0 + - cmd: LC_LOAD_DYLIB + cmdsize: 56 + dylib: + name: 24 + timestamp: 2 + current_version: 88539136 + compatibility_version: 65536 + Content: '/usr/lib/libSystem.B.dylib' + ZeroPadBytes: 6 + - cmd: LC_FUNCTION_STARTS + cmdsize: 16 + dataoff: 32888 + datasize: 8 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 32896 + datasize: 0 + - cmd: LC_CODE_SIGNATURE + cmdsize: 16 + dataoff: 32960 + datasize: 416 + LinkEditData: + ExportTrie: + TerminalSize: 0 + NodeOffset: 0 + Name: '' + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' + Children: + - TerminalSize: 3 + NodeOffset: 13 + Name: _sayZ + Flags: 0x0 + Address: 0x3F68 + Other: 0x0 + ImportName: '' + NameList: + - n_strx: 2 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 16232 + - n_strx: 8 + n_type: 0x1 + n_sect: 0 + n_desc: 256 + n_value: 0 + StringTable: + - ' ' + - _sayZ + - _printf + IndirectSymbols: [ 0x1, 0x1 ] + FunctionStarts: [ 0x3F68 ] + ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48, + 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0, + 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0xC, 0x0, + 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0, + 0x0, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x5F, 0x70, 0x72, + 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0 ] +... diff --git a/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp b/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp new file mode 100644 index 0000000000000..31f0cce86fe8e --- /dev/null +++ b/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp @@ -0,0 +1,762 @@ +//===- LibraryResolverTest.cpp - Unit tests for LibraryResolver -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h" +#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h" +#include "llvm/ObjectYAML/MachOYAML.h" +#include "llvm/ObjectYAML/yaml2obj.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/YAMLParser.h" +#include "llvm/Support/YAMLTraits.h" +#include "llvm/Support/raw_ostream.h" + +#include "llvm/Testing/Support/SupportHelpers.h" + +#include "gtest/gtest.h" + +#include +#include +#include +#include + +using namespace llvm; +using namespace llvm::orc; + +#if defined(__APPLE__) || defined(__linux__) +// TODO: Add COFF (Windows) support for these tests. +// this facility also works correctly on Windows (COFF), +// so we should eventually enable and run these tests for that platform as well. 
+namespace { + +#if defined(__APPLE__) +constexpr const char *ext = ".dylib"; +#elif defined(_WIN32) +constexpr const char *ext = ".dll"; +#else +constexpr const char *ext = ".so"; +#endif + +bool EnvReady = false; + +Triple getTargetTriple() { + auto JTMB = JITTargetMachineBuilder::detectHost(); + if (!JTMB) { + consumeError(JTMB.takeError()); + return Triple(); + } + return JTMB->getTargetTriple(); +} + +static bool CheckHostSupport() { + auto Triple = getTargetTriple(); + // TODO: Extend support to COFF (Windows) once test setup and YAML conversion + // are verified. + if (!Triple.isOSBinFormatMachO() && + !(Triple.isOSBinFormatELF() && Triple.getArch() == Triple::x86_64)) + return false; + + return true; +} + +std::string getYamlFilePlatformExt() { + auto Triple = getTargetTriple(); + if (Triple.isOSBinFormatMachO()) + return "_macho"; + else if (Triple.isOSBinFormatELF()) + return "_linux"; + + return ""; +} + +unsigned getYamlDocNum() { + // auto Triple = getTargetTriple(); + // if (Triple.isOSBinFormatELF()) + // return 1; + + return 1; +} + +class LibraryTestEnvironment : public ::testing::Environment { + std::vector CreatedDylibsDir; + std::vector CreatedDylibs; + SmallVector DirPath; + +public: + void SetUp() override { + if (!CheckHostSupport()) { + EnvReady = false; + return; + } + + StringRef ThisFile = __FILE__; + SmallVector InputDirPath(ThisFile.begin(), ThisFile.end()); + sys::path::remove_filename(InputDirPath); + sys::path::append(InputDirPath, "Inputs"); + if (!sys::fs::exists(InputDirPath)) + return; + + SmallString<128> UniqueDir; + sys::path::append(UniqueDir, InputDirPath); + std::error_code EC = sys::fs::createUniqueDirectory(UniqueDir, DirPath); + + if (EC) + return; + + // given yamlPath + DylibPath, validate + convert + auto processYamlToDylib = [&](const SmallVector &YamlPath, + const SmallVector &DylibPath, + unsigned DocNum) -> bool { + if (!sys::fs::exists(YamlPath)) { + errs() << "YAML file missing: " + << StringRef(YamlPath.data(), 
YamlPath.size()) << "\n"; + EnvReady = false; + return false; + } + + auto BufOrErr = MemoryBuffer::getFile(YamlPath); + if (!BufOrErr) { + errs() << "Failed to read " + << StringRef(YamlPath.data(), YamlPath.size()) << ": " + << BufOrErr.getError().message() << "\n"; + EnvReady = false; + return false; + } + + yaml::Input yin(BufOrErr->get()->getBuffer()); + std::error_code EC; + raw_fd_ostream outFile(StringRef(DylibPath.data(), DylibPath.size()), EC, + sys::fs::OF_None); + + if (EC) { + errs() << "Failed to open " + << StringRef(DylibPath.data(), DylibPath.size()) + << " for writing: " << EC.message() << "\n"; + EnvReady = false; + return false; + } + + if (!yaml::convertYAML( + yin, outFile, + [](const Twine &M) { + // Handle or ignore errors here + errs() << "Yaml Error :" << M << "\n"; + }, + DocNum)) { + errs() << "Failed to convert " + << StringRef(YamlPath.data(), YamlPath.size()) << " to " + << StringRef(DylibPath.data(), DylibPath.size()) << "\n"; + EnvReady = false; + return false; + } + + CreatedDylibsDir.push_back(std::string(sys::path::parent_path( + StringRef(DylibPath.data(), DylibPath.size())))); + CreatedDylibs.push_back(std::string(DylibPath.begin(), DylibPath.end())); + return true; + }; + + std::vector LibDirs = {"Z", "A", "B", "C"}; + + unsigned DocNum = getYamlDocNum(); + std::string YamlPltExt = getYamlFilePlatformExt(); + for (const auto &LibdirName : LibDirs) { + // YAML path + SmallVector YamlPath(InputDirPath.begin(), InputDirPath.end()); + SmallVector YamlFileName; + YamlFileName.append(LibdirName, LibdirName + strlen(LibdirName)); + YamlFileName.append(YamlPltExt.begin(), YamlPltExt.end()); + sys::path::append(YamlPath, LibdirName, YamlFileName); + sys::path::replace_extension(YamlPath, ".yaml"); + + // dylib path + SmallVector DylibPath(DirPath.begin(), DirPath.end()); + SmallVector DylibFileName; + StringRef prefix("lib"); + DylibFileName.append(prefix.begin(), prefix.end()); + DylibFileName.append(LibdirName, LibdirName + 
strlen(LibdirName)); + + sys::path::append(DylibPath, LibdirName); + if (!sys::fs::exists(DylibPath)) { + auto EC = sys::fs::create_directory(DylibPath); + if (EC) + return; + } + sys::path::append(DylibPath, DylibFileName); + sys::path::replace_extension(DylibPath, ext); + if (!processYamlToDylib(YamlPath, DylibPath, DocNum)) + return; + } + + EnvReady = true; + } + + void TearDown() override { sys::fs::remove_directories(DirPath); } + + std::string getBaseDir() const { + return std::string(DirPath.begin(), DirPath.end()); + } + + std::vector getDylibPaths() const { return CreatedDylibs; } +}; + +static LibraryTestEnvironment *GlobalEnv = + static_cast( + ::testing::AddGlobalTestEnvironment(new LibraryTestEnvironment())); + +inline std::string libPath(const std::string &BaseDir, + const std::string &name) { +#if defined(__APPLE__) + return BaseDir + "/" + name + ".dylib"; +#elif defined(_WIN32) + return BaseDir + "/" + name + ".dll"; +#else + return BaseDir + "/" + name + ".so"; +#endif +} + +inline std::string withext(const std::string &lib) { + SmallString<128> P(lib); + sys::path::replace_extension(P, ext); + return P.str().str(); +} + +inline std::string platformSymbolName(const std::string &name) { +#if defined(__APPLE__) + return "_" + name; // macOS prepends underscore +#else + return name; +#endif +} + +struct TestLibrary { + std::string path; + std::vector Syms; +}; + +class LibraryResolverIT : public ::testing::Test { +protected: + std::string BaseDir; + std::unordered_map libs; + + void addLib(const std::string &name) { + SmallString<512> path; + std::error_code EC = + sys::fs::real_path(libPath(BaseDir, name + "/lib" + name), path); + if (EC || path.empty() || !sys::fs::exists(path)) + GTEST_SKIP(); + libs[name] = {path.str().str(), {platformSymbolName("say" + name)}}; + } + + void SetUp() override { + if (!EnvReady || GlobalEnv == nullptr) + GTEST_SKIP() << "Skipping test: environment setup failed."; + + { + SmallString<512> path; + std::error_code EC 
= sys::fs::real_path(GlobalEnv->getBaseDir(), path); + if (path.empty() || EC) + GTEST_SKIP() << "Base directory resolution failed: " << EC.message(); + BaseDir = path.str().str(); + } + + for (const auto &P : GlobalEnv->getDylibPaths()) { + if (!sys::fs::exists(P)) + GTEST_SKIP() << "Missing dylib path: " << P; + } + + const std::vector libNames = {"A", "B", "C", "Z"}; + for (const auto &name : libNames) + addLib(name); + + if (!EnvReady) + GTEST_SKIP() << "Skipping test: environment setup failed."; + } + + const std::vector &sym(const std::string &key) { + return libs[key].Syms; + } + const std::string &lib(const std::string &key) { return libs[key].path; } + const std::string libdir(const std::string &key) { + SmallString<512> P(libs[key].path); + sys::path::remove_filename(P); + return P.str().str(); + } + const std::string libname(const std::string &key) { + return sys::path::filename(libs[key].path).str(); + } +}; + +// Helper: allow either "sayA" or "_sayA" depending on how your +// SymbolEnumerator reports. 
+static bool matchesEitherUnderscore(const std::string &got, + const std::string &bare) { + return got == bare || got == ("_" + bare); +} + +// Helper: normalize path ending check (we only care that it resolved to the +// right dylib) +static bool endsWith(const std::string &s, const std::string &suffix) { + if (s.size() < suffix.size()) + return false; + return std::equal(suffix.rbegin(), suffix.rend(), s.rbegin()); +} + +TEST_F(LibraryResolverIT, EnumerateSymbols_ExportsOnly_DefaultFlags) { + const std::string libC = lib("C"); + SymbolEnumeratorOptions Opts = SymbolEnumeratorOptions::defaultOptions(); + + std::vector seen; + auto onEach = [&](llvm::StringRef sym) -> EnumerateResult { + seen.emplace_back(sym.str()); + return EnumerateResult::Continue; + }; + + ASSERT_TRUE(SymbolEnumerator::enumerateSymbols(libC, onEach, Opts)); + + // sayC is exported, others are undefined → only sayC expected + EXPECT_TRUE(any_of(seen, [&](const std::string &s) { + return matchesEitherUnderscore(s, "sayC"); + })); + EXPECT_FALSE(any_of(seen, [&](const std::string &s) { + return matchesEitherUnderscore(s, "sayA"); + })); + EXPECT_FALSE(any_of(seen, [&](const std::string &s) { + return matchesEitherUnderscore(s, "sayB"); + })); +} + +TEST_F(LibraryResolverIT, EnumerateSymbols_IncludesUndefineds) { + const std::string libC = lib("C"); + + SymbolEnumeratorOptions Opts; + Opts.FilterFlags = + SymbolEnumeratorOptions::IgnoreWeak | + SymbolEnumeratorOptions::IgnoreIndirect; // no IgnoreUndefined + + std::vector seen; + auto onEach = [&](llvm::StringRef sym) -> EnumerateResult { + seen.emplace_back(sym.str()); + return EnumerateResult::Continue; + }; + + ASSERT_TRUE(SymbolEnumerator::enumerateSymbols(libC, onEach, Opts)); + + // Now we should see both sayC (export) and the undefined refs sayA, sayB, + // sayZ + EXPECT_TRUE(any_of(seen, [&](const std::string &s) { + return matchesEitherUnderscore(s, "sayC"); + })); + EXPECT_TRUE(any_of(seen, [&](const std::string &s) { + return 
matchesEitherUnderscore(s, "sayA"); + })); + EXPECT_TRUE(any_of(seen, [&](const std::string &s) { + return matchesEitherUnderscore(s, "sayB"); + })); +} + +// Full resolution via LibraryResolutionDriver/LibraryResolver --- +TEST_F(LibraryResolverIT, DriverResolvesSymbolsToCorrectLibraries) { + // Create the resolver from real base paths (our fixtures dir) + auto Stup = LibraryResolver::Setup::create({BaseDir}); + + // Full system behavior: no mocks + auto Driver = LibraryResolutionDriver::create(Stup); + ASSERT_NE(Driver, nullptr); + + // Tell the Driver about the scan path kinds (User/System) as your + // production code expects. + Driver->addScanPath(libdir("A"), PathType::User); + Driver->addScanPath(libdir("B"), PathType::User); + Driver->addScanPath(libdir("Z"), PathType::User); + + // Symbols to resolve (bare names; class handles underscore differences + // internally) + std::vector Syms = {platformSymbolName("sayA"), + platformSymbolName("sayB"), + platformSymbolName("sayZ")}; + + bool CallbackRan = false; + Driver->resolveSymbols(Syms, [&](SymbolQuery &Q) { + CallbackRan = true; + + // sayA should resolve to A.dylib + { + auto lib = Q.getResolvedLib(platformSymbolName("sayA")); + ASSERT_TRUE(lib.has_value()) << "sayA should be resolved"; + EXPECT_TRUE(endsWith(lib->str(), libname("A"))) + << "sayA resolved to: " << lib->str(); + } + + // sayB should resolve to B.dylib + { + auto lib = Q.getResolvedLib(platformSymbolName("sayB")); + ASSERT_TRUE(lib.has_value()) << "sayB should be resolved"; + EXPECT_TRUE(endsWith(lib->str(), libname("B"))) + << "sayB resolved to: " << lib->str(); + } + + // sayZ should resolve to B.dylib + { + auto lib = Q.getResolvedLib(platformSymbolName("sayZ")); + ASSERT_TRUE(lib.has_value()) << "sayZ should be resolved"; + EXPECT_TRUE(endsWith(lib->str(), libname("Z"))) + << "sayZ resolved to: " << lib->str(); + } + + EXPECT_TRUE(Q.allResolved()); + }); + + EXPECT_TRUE(CallbackRan); +} + +// stress SymbolQuery with the real resolve flow 
+// And resolve libC dependency libA, libB, libZ --- +TEST_F(LibraryResolverIT, ResolveManySymbols) { + auto Stup = LibraryResolver::Setup::create({BaseDir}); + auto Driver = LibraryResolutionDriver::create(Stup); + ASSERT_NE(Driver, nullptr); + Driver->addScanPath(libdir("C"), PathType::User); + + // Many duplicates to provoke concurrent updates inside SymbolQuery + std::vector Syms = { + platformSymbolName("sayA"), platformSymbolName("sayB"), + platformSymbolName("sayA"), platformSymbolName("sayB"), + platformSymbolName("sayZ"), platformSymbolName("sayZ"), + platformSymbolName("sayZ"), platformSymbolName("sayZ"), + platformSymbolName("sayA"), platformSymbolName("sayB"), + platformSymbolName("sayA"), platformSymbolName("sayB")}; + + bool CallbackRan = false; + Driver->resolveSymbols(Syms, [&](SymbolQuery &Q) { + CallbackRan = true; + EXPECT_TRUE(Q.isResolved(platformSymbolName("sayA"))); + EXPECT_TRUE(Q.isResolved(platformSymbolName("sayB"))); + EXPECT_TRUE(Q.isResolved(platformSymbolName("sayZ"))); + + auto A = Q.getResolvedLib(platformSymbolName("sayA")); + auto B = Q.getResolvedLib(platformSymbolName("sayB")); + auto Z = Q.getResolvedLib(platformSymbolName("sayZ")); + ASSERT_TRUE(A.has_value()); + ASSERT_TRUE(B.has_value()); + ASSERT_TRUE(Z.has_value()); + EXPECT_TRUE(endsWith(A->str(), libname("A"))); + EXPECT_TRUE(endsWith(B->str(), libname("B"))); + EXPECT_TRUE(endsWith(Z->str(), libname("Z"))); + EXPECT_TRUE(Q.allResolved()); + }); + + EXPECT_TRUE(CallbackRan); +} + +TEST_F(LibraryResolverIT, ScanAndResolveDependencyGraph) { + auto LibPathCache = std::make_shared(); + auto PResolver = std::make_shared(LibPathCache); + LibraryScanHelper ScanH({}, LibPathCache, PResolver); + + ScanH.addBasePath(libdir("C"), PathType::User); + + LibraryManager LibMgr; + LibraryScanner Scanner(ScanH, LibMgr); + + Scanner.scanNext(PathType::User, 0); + + size_t numLibs = 0; + LibMgr.forEachLibrary([&](const LibraryInfo &L) { + numLibs++; + return true; + }); + + 
EXPECT_GT(numLibs, 0u) << "Expected at least one library scanned"; + + // Validate that each scanned library path is resolvable + std::error_code EC; + LibMgr.forEachLibrary([&](const LibraryInfo &L) { + auto R = PResolver->resolve(L.getFullPath(), EC); + EXPECT_TRUE(R.has_value()); + EXPECT_FALSE(EC); + return true; + }); +} + +TEST_F(LibraryResolverIT, ScanEmptyPath) { + auto LibPathCache = std::make_shared(); + auto PResolver = std::make_shared(LibPathCache); + LibraryScanHelper ScanH({}, LibPathCache, PResolver); + + ScanH.addBasePath("/tmp/empty", PathType::User); + + LibraryManager LibMgr; + LibraryScanner Scanner(ScanH, LibMgr); + + Scanner.scanNext(PathType::User, 0); + + size_t count = 0; + LibMgr.forEachLibrary([&](const LibraryInfo &) { + count++; + return true; + }); + EXPECT_EQ(count, 0u); +} + +TEST_F(LibraryResolverIT, PathResolverResolvesKnownPaths) { + auto LibPathCache = std::make_shared(); + auto PResolver = std::make_shared(LibPathCache); + + std::error_code EC; + auto Missing = PResolver->resolve("temp/foo/bar", EC); + EXPECT_FALSE(Missing.has_value()) << "Unexpectedly resolved a bogus path"; + EXPECT_TRUE(EC) << "Expected error resolving path"; + + auto DirPath = PResolver->resolve(BaseDir, EC); + ASSERT_TRUE(DirPath.has_value()); + EXPECT_FALSE(EC) << "Expected no error resolving path"; + EXPECT_EQ(*DirPath, BaseDir); + + auto DylibPath = PResolver->resolve(lib("C"), EC); + ASSERT_TRUE(DylibPath.has_value()); + EXPECT_FALSE(EC) << "Expected no error resolving path"; + EXPECT_EQ(*DylibPath, lib("C")); +} + +TEST_F(LibraryResolverIT, PathResolverNormalizesDotAndDotDot) { + auto LibPathCache = std::make_shared(); + auto PResolver = std::make_shared(LibPathCache); + + std::error_code EC; + + // e.g. 
BaseDir + "/./C/../C/C.dylib" → BaseDir + "/C.dylib" + std::string Messy = BaseDir + "/C/./../C/./libC" + ext; + auto Resolved = PResolver->resolve(Messy, EC); + ASSERT_TRUE(Resolved.has_value()); + EXPECT_FALSE(EC); + EXPECT_EQ(*Resolved, lib("C")) << "Expected realpath to collapse . and .."; +} + +#if !defined(_WIN32) +TEST_F(LibraryResolverIT, PathResolverFollowsSymlinks) { + auto LibPathCache = std::make_shared(); + auto PResolver = std::make_shared(LibPathCache); + + std::error_code EC; + + // Create a symlink temp -> BaseDir (only if filesystem allows it) + std::string linkName = BaseDir + withext("/link_to_C"); + std::string target = lib("C"); + if (::symlink(target.c_str(), linkName.c_str()) != 0) + GTEST_SKIP() << "Failed to create symlink: " << strerror(errno); + + auto resolved = PResolver->resolve(linkName, EC); + ASSERT_TRUE(resolved.has_value()); + EXPECT_FALSE(EC); + EXPECT_EQ(*resolved, target); + + (void)::unlink(linkName.c_str()); // cleanup +} + +TEST_F(LibraryResolverIT, PathResolverCachesResults) { + auto LibPathCache = std::make_shared(); + auto PResolver = std::make_shared(LibPathCache); + + SmallString<128> TmpDylib; + std::error_code EC; + EC = sys::fs::createUniqueFile(withext("A-copy"), TmpDylib); + if (EC) + GTEST_SKIP() << "Failed to create temp dylib" << EC.message(); + + EC = sys::fs::copy_file(lib("A"), TmpDylib); + if (EC) + GTEST_SKIP() << "Failed to copy libA: " << EC.message(); + EC.clear(); + + // First resolve -> should populate LibPathCache + auto first = PResolver->resolve(TmpDylib, EC); + ASSERT_TRUE(first.has_value()); + + // Forcefully remove the file from disk + (void)::unlink(TmpDylib.c_str()); + + // Second resolve -> should still succeed from LibPathCache + auto second = PResolver->resolve(TmpDylib, EC); + EXPECT_TRUE(second.has_value()); + EXPECT_EQ(*second, *first); +} +#endif + +TEST_F(LibraryResolverIT, LoaderPathSubstitutionAndResolve) { + auto LibPathCache = std::make_shared(); + auto PResolver = 
std::make_shared(LibPathCache); + + DylibSubstitutor substitutor; + substitutor.configure(libdir("C")); +#if defined(__APPLE__) + // Substitute @loader_path with BaseDir + std::string substituted = + substitutor.substitute(withext("@loader_path/libC")); +#elif defined(__linux__) + // Substitute $origin with BaseDir + std::string substituted = substitutor.substitute(withext("$ORIGIN/libC")); +#endif + ASSERT_FALSE(substituted.empty()); + EXPECT_EQ(substituted, lib("C")); + + // Now try resolving the substituted path + std::error_code EC; + auto resolved = PResolver->resolve(substituted, EC); + ASSERT_TRUE(resolved.has_value()) << "Expected to resolve substituted dylib"; + EXPECT_EQ(*resolved, lib("C")); + EXPECT_FALSE(EC) << "Expected no error resolving substituted dylib"; +} + +TEST_F(LibraryResolverIT, ResolveFromUsrOrSystemPaths) { + auto LibPathCache = std::make_shared(); + auto PResolver = std::make_shared(LibPathCache); + + DylibPathValidator validator(*PResolver); + + std::vector Paths = {"/foo/bar/", "temp/foo", libdir("C"), + libdir("A"), libdir("B"), libdir("Z")}; + + SmallVector P(Paths.begin(), Paths.end()); + + DylibResolver Resolver(validator); + Resolver.configure("", {{P, SearchPathType::UsrOrSys}}); + + // Check "C" + auto ValOptC = Resolver.resolve("libC", true); + EXPECT_TRUE(ValOptC.has_value()); + EXPECT_EQ(*ValOptC, lib("C")); + + auto ValOptCdylib = Resolver.resolve(withext("libC")); + EXPECT_TRUE(ValOptCdylib.has_value()); + EXPECT_EQ(*ValOptCdylib, lib("C")); + + // Check "A" + auto ValOptA = Resolver.resolve("libA", true); + EXPECT_TRUE(ValOptA.has_value()); + EXPECT_EQ(*ValOptA, lib("A")); + + auto ValOptAdylib = Resolver.resolve(withext("libA")); + EXPECT_TRUE(ValOptAdylib.has_value()); + EXPECT_EQ(*ValOptAdylib, lib("A")); + + // Check "B" + auto ValOptB = Resolver.resolve("libB", true); + EXPECT_TRUE(ValOptB.has_value()); + EXPECT_EQ(*ValOptB, lib("B")); + + auto ValOptBdylib = Resolver.resolve(withext("libB")); + 
EXPECT_TRUE(ValOptBdylib.has_value()); + EXPECT_EQ(*ValOptBdylib, lib("B")); + + // Check "Z" + auto ValOptZ = Resolver.resolve("libZ", true); + EXPECT_TRUE(ValOptZ.has_value()); + EXPECT_EQ(*ValOptZ, lib("Z")); + + auto ValOptZdylib = Resolver.resolve(withext("libZ")); + EXPECT_TRUE(ValOptZdylib.has_value()); + EXPECT_EQ(*ValOptZdylib, lib("Z")); +} + +#if defined(__APPLE__) +TEST_F(LibraryResolverIT, ResolveViaLoaderPathAndRPathSubstitution) { + auto LibPathCache = std::make_shared(); + auto PResolver = std::make_shared(LibPathCache); + + DylibPathValidator validator(*PResolver); + + std::vector Paths = {"@loader_path/../A", "@loader_path/../B", + "@loader_path/../C", "@loader_path/../Z"}; + + SmallVector P(Paths.begin(), Paths.end()); + + DylibResolver Resolver(validator); + + // Use only RPath config + Resolver.configure(lib("C"), {{P, SearchPathType::RPath}}); + + // --- Check A --- + auto ValOptA = Resolver.resolve("@rpath/libA", true); + EXPECT_TRUE(ValOptA.has_value()); + EXPECT_EQ(*ValOptA, lib("A")); + + auto ValOptAdylib = Resolver.resolve(withext("@rpath/libA")); + EXPECT_TRUE(ValOptAdylib.has_value()); + EXPECT_EQ(*ValOptAdylib, lib("A")); + + // --- Check B --- + auto ValOptB = Resolver.resolve("@rpath/libB", true); + EXPECT_TRUE(ValOptB.has_value()); + EXPECT_EQ(*ValOptB, lib("B")); + + auto ValOptBdylib = Resolver.resolve(withext("@rpath/libB")); + EXPECT_TRUE(ValOptBdylib.has_value()); + EXPECT_EQ(*ValOptBdylib, lib("B")); + + // --- Check Z --- + auto ValOptZ = Resolver.resolve("@rpath/libZ", true); + EXPECT_TRUE(ValOptZ.has_value()); + EXPECT_EQ(*ValOptZ, lib("Z")); + + auto ValOptZdylib = Resolver.resolve(withext("@rpath/libZ")); + EXPECT_TRUE(ValOptZdylib.has_value()); + EXPECT_EQ(*ValOptZdylib, lib("Z")); +} +#endif + +#if defined(__linux__) +TEST_F(LibraryResolverIT, ResolveViaOriginAndRPathSubstitution) { + auto LibPathCache = std::make_shared(); + auto PResolver = std::make_shared(LibPathCache); + + DylibPathValidator validator(*PResolver); 
+ + // On Linux, $ORIGIN works like @loader_path + std::vector Paths = {"$ORIGIN/../A", "$ORIGIN/../B", + "$ORIGIN/../C", "$ORIGIN/../Z"}; + + SmallVector P(Paths.begin(), Paths.end()); + + DylibResolver Resolver(validator); + + // Use only RPath config + Resolver.configure(lib("C"), {{P, SearchPathType::RunPath}}); + + // --- Check A --- + auto ValOptA = Resolver.resolve("libA", true); + EXPECT_TRUE(ValOptA.has_value()); + EXPECT_EQ(*ValOptA, lib("A")); + + auto valOptASO = Resolver.resolve(withext("libA")); + EXPECT_TRUE(valOptASO.has_value()); + EXPECT_EQ(*valOptASO, lib("A")); + + // --- Check B --- + auto ValOptB = Resolver.resolve("libB", true); + EXPECT_TRUE(ValOptB.has_value()); + EXPECT_EQ(*ValOptB, lib("B")); + + auto valOptBSO = Resolver.resolve(withext("libB")); + EXPECT_TRUE(valOptBSO.has_value()); + EXPECT_EQ(*valOptBSO, lib("B")); + + // --- Check Z --- + auto ValOptZ = Resolver.resolve("libZ", true); + EXPECT_TRUE(ValOptZ.has_value()); + EXPECT_EQ(*ValOptZ, lib("Z")); + + auto valOptZSO = Resolver.resolve(withext("libZ")); + EXPECT_TRUE(valOptZSO.has_value()); + EXPECT_EQ(*valOptZSO, lib("Z")); +} +#endif +} // namespace +#endif // defined(__APPLE__) From 9671dae9aeeb47dd31183a25bc5ebed0fdea9a01 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 31 Oct 2025 23:30:23 -0700 Subject: [PATCH 420/539] [LegalizeTypes] Use UpdateNodeOperands in SoftPromoteHalfOp_STACKMAP/PATCHPOINT. 
(#165927) --- .../SelectionDAG/LegalizeFloatTypes.cpp | 22 ++++--------------- 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index bf1abfe50327e..1178b6fba6f0b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -3957,28 +3957,14 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfOp_ATOMIC_STORE(SDNode *N, SDValue DAGTypeLegalizer::SoftPromoteHalfOp_STACKMAP(SDNode *N, unsigned OpNo) { assert(OpNo > 1); // Because the first two arguments are guaranteed legal. SmallVector NewOps(N->ops()); - SDValue Op = N->getOperand(OpNo); - NewOps[OpNo] = GetSoftPromotedHalf(Op); - SDValue NewNode = - DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), NewOps); - - for (unsigned ResNum = 0; ResNum < N->getNumValues(); ResNum++) - ReplaceValueWith(SDValue(N, ResNum), NewNode.getValue(ResNum)); - - return SDValue(); // Signal that we replaced the node ourselves. + NewOps[OpNo] = GetSoftPromotedHalf(NewOps[OpNo]); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } SDValue DAGTypeLegalizer::SoftPromoteHalfOp_PATCHPOINT(SDNode *N, unsigned OpNo) { assert(OpNo >= 7); SmallVector NewOps(N->ops()); - SDValue Op = N->getOperand(OpNo); - NewOps[OpNo] = GetSoftPromotedHalf(Op); - SDValue NewNode = - DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), NewOps); - - for (unsigned ResNum = 0; ResNum < N->getNumValues(); ResNum++) - ReplaceValueWith(SDValue(N, ResNum), NewNode.getValue(ResNum)); - - return SDValue(); // Signal that we replaced the node ourselves. + NewOps[OpNo] = GetSoftPromotedHalf(NewOps[OpNo]); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } From 8e5292a52106e52da264c610938fdfcf32cec4d0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 31 Oct 2025 23:31:10 -0700 Subject: [PATCH 421/539] [SelectionDAG][RISCV] Support STACK/PATCHPOINT in SoftenFloatOperand. 
(#165922) Test float/double/half/bfloat on RISC-V without F extension. --- .../SelectionDAG/LegalizeFloatTypes.cpp | 20 ++++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 + llvm/test/CodeGen/RISCV/rv64-stackmap.ll | 108 +++++++++++++++++- 3 files changed, 127 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 1178b6fba6f0b..780c9b31fa9c7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -1172,6 +1172,12 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { case ISD::FAKE_USE: Res = SoftenFloatOp_FAKE_USE(N); break; + case ISD::STACKMAP: + Res = SoftenFloatOp_STACKMAP(N, OpNo); + break; + case ISD::PATCHPOINT: + Res = SoftenFloatOp_PATCHPOINT(N, OpNo); + break; } // If the result is null, the sub-method took care of registering results etc. @@ -1512,6 +1518,20 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FAKE_USE(SDNode *N) { N->getOperand(0), Op1); } +SDValue DAGTypeLegalizer::SoftenFloatOp_STACKMAP(SDNode *N, unsigned OpNo) { + assert(OpNo > 1); // Because the first two arguments are guaranteed legal. 
+ SmallVector NewOps(N->ops()); + NewOps[OpNo] = GetSoftenedFloat(NewOps[OpNo]); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + +SDValue DAGTypeLegalizer::SoftenFloatOp_PATCHPOINT(SDNode *N, unsigned OpNo) { + assert(OpNo >= 7); + SmallVector NewOps(N->ops()); + NewOps[OpNo] = GetSoftenedFloat(NewOps[OpNo]); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + //===----------------------------------------------------------------------===// // Float Result Expansion //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 9656a30321efa..ede522eff6df3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -658,6 +658,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftenFloatOp_ATOMIC_STORE(SDNode *N, unsigned OpNo); SDValue SoftenFloatOp_FCOPYSIGN(SDNode *N); SDValue SoftenFloatOp_FAKE_USE(SDNode *N); + SDValue SoftenFloatOp_STACKMAP(SDNode *N, unsigned OpNo); + SDValue SoftenFloatOp_PATCHPOINT(SDNode *N, unsigned OpNo); //===--------------------------------------------------------------------===// // Float Expansion Support: LegalizeFloatTypes.cpp diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll index 9aefa90684dd3..c50a0fb3ffe91 100644 --- a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll +++ b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll @@ -7,11 +7,11 @@ ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 0 ; Num Functions -; CHECK-NEXT: .word 12 +; CHECK-NEXT: .word 13 ; Num LargeConstants -; CHECK-NEXT: .word 2 +; CHECK-NEXT: .word 3 ; Num Callsites -; CHECK-NEXT: .word 16 +; CHECK-NEXT: .word 17 ; Functions and stack size ; CHECK-NEXT: .quad constantargs @@ -50,10 +50,14 @@ ; CHECK-NEXT: .quad needsStackRealignment ; CHECK-NEXT: .quad -1 ; CHECK-NEXT: .quad 1 +; CHECK-NEXT: .quad floats +; CHECK-NEXT: 
.quad 32 +; CHECK-NEXT: .quad 1 ; Num LargeConstants ; CHECK-NEXT: .quad 4294967295 ; CHECK-NEXT: .quad 4294967296 +; CHECK-NEXT: .quad 4609434218613702656 ; Constant arguments ; @@ -379,6 +383,104 @@ define void @needsStackRealignment() { } declare void @escape_values(...) +; CHECK-LABEL: .word .L{{.*}}-floats +; CHECK-NEXT: .half 0 +; Num Locations +; CHECK-NEXT: .half 12 +; Loc 0: constant float as constant integer +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 1: constant double as large constant integer +; CHECK-NEXT: .byte 5 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 2: constant half as constant integer +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 3: constant bfloat as constant integer +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 4: float value in X register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 10 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 5: double value in X register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 11 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 6: half value in X register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 12 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 7: bfloat value in X register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 13 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 8: float on stack +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 2 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 9: double on stack +; 
CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 2 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 10: half on stack +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 2 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 11: bfloat on stack +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 2 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +define void @floats(float %f, double %g, half %h, bfloat %i) { + %ff = alloca float + %gg = alloca double + %hh = alloca half + %ii = alloca bfloat + call void (i64, i32, ...) @llvm.experimental.stackmap(i64 888, i32 0, float 1.25, + double 1.5, half 1.5, bfloat 1.5, float %f, double %g, half %h, bfloat %i, ptr %ff, ptr %gg, ptr %hh, ptr %ii) + ret void +} + declare void @llvm.experimental.stackmap(i64, i32, ...) declare void @llvm.experimental.patchpoint.void(i64, i32, ptr, i32, ...) declare i64 @llvm.experimental.patchpoint.i64(i64, i32, ptr, i32, ...) From cdc02c958fd3d8099a2a2f1f71c2d17b9d35c93d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 31 Oct 2025 23:38:04 -0700 Subject: [PATCH 422/539] Revert "[LegalizeTypes] Use UpdateNodeOperands in SoftPromoteHalfOp_STACKMAP/PATCHPOINT. (#165927)" This reverts commit 4357fcbbd5012369dbbbe50f99941147895d6611. Causes a crash when combined with #165922. 
--- .../SelectionDAG/LegalizeFloatTypes.cpp | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 780c9b31fa9c7..58983cb57d7f6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -3977,14 +3977,28 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfOp_ATOMIC_STORE(SDNode *N, SDValue DAGTypeLegalizer::SoftPromoteHalfOp_STACKMAP(SDNode *N, unsigned OpNo) { assert(OpNo > 1); // Because the first two arguments are guaranteed legal. SmallVector NewOps(N->ops()); - NewOps[OpNo] = GetSoftPromotedHalf(NewOps[OpNo]); - return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); + SDValue Op = N->getOperand(OpNo); + NewOps[OpNo] = GetSoftPromotedHalf(Op); + SDValue NewNode = + DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), NewOps); + + for (unsigned ResNum = 0; ResNum < N->getNumValues(); ResNum++) + ReplaceValueWith(SDValue(N, ResNum), NewNode.getValue(ResNum)); + + return SDValue(); // Signal that we replaced the node ourselves. } SDValue DAGTypeLegalizer::SoftPromoteHalfOp_PATCHPOINT(SDNode *N, unsigned OpNo) { assert(OpNo >= 7); SmallVector NewOps(N->ops()); - NewOps[OpNo] = GetSoftPromotedHalf(NewOps[OpNo]); - return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); + SDValue Op = N->getOperand(OpNo); + NewOps[OpNo] = GetSoftPromotedHalf(Op); + SDValue NewNode = + DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), NewOps); + + for (unsigned ResNum = 0; ResNum < N->getNumValues(); ResNum++) + ReplaceValueWith(SDValue(N, ResNum), NewNode.getValue(ResNum)); + + return SDValue(); // Signal that we replaced the node ourselves. 
} From 36c459f17661f6f9d452e3e1f7c0c8dcc61db7cf Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Sat, 1 Nov 2025 14:50:27 +0800 Subject: [PATCH 423/539] [LoongArch][NFC] Pre-commit tests for sink-and-fold (#163928) --- llvm/test/CodeGen/LoongArch/sink-fold-addi.ll | 758 ++++++++++++++++++ 1 file changed, 758 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/sink-fold-addi.ll diff --git a/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll new file mode 100644 index 0000000000000..9a806a12f7de6 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll @@ -0,0 +1,758 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx --verify-machineinstrs < %s \ +; RUN: | FileCheck --check-prefix=LA32 %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx --verify-machineinstrs < %s \ +; RUN: | FileCheck --check-prefix=LA64 %s + +%struct.S = type { i64, i64, i8 } +%struct.F = type { float, double, float } +%struct.V = type { <4 x i32>, <4 x i32>, <16 x i16> } + +define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill +; LA32-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a1, $a0, 4 +; LA32-NEXT: alsl.w $a0, $a0, $a1, 3 +; LA32-NEXT: add.w $a0, $a4, $a0 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: 
masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: addi.w $s2, $a0, 8 +; LA32-NEXT: bnez $a1, .LBB0_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: move $s5, $zero +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s6, $zero +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB0_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: ld.w $a0, $s2, 4 +; LA32-NEXT: ld.w $a1, $s2, 0 +; LA32-NEXT: add.w $a0, $a0, $s6 +; LA32-NEXT: add.w $s3, $a1, $s3 +; LA32-NEXT: sltu $a1, $s3, $a1 +; LA32-NEXT: addi.w $s4, $s4, 1 +; LA32-NEXT: sltui $a2, $s4, 1 +; LA32-NEXT: add.w $s5, $s5, $a2 +; LA32-NEXT: xor $a2, $s4, $s1 +; LA32-NEXT: xor $a3, $s5, $s0 +; LA32-NEXT: or $a2, $a2, $a3 +; LA32-NEXT: add.w $s6, $a0, $a1 +; LA32-NEXT: bnez $a2, .LBB0_2 +; LA32-NEXT: b .LBB0_4 +; LA32-NEXT: .LBB0_3: +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s6, $zero +; LA32-NEXT: .LBB0_4: # %for.cond.cleanup +; LA32-NEXT: st.w $s3, $s2, 0 +; LA32-NEXT: st.w $s6, $s2, 4 +; LA32-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d 
$s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: slli.d $a1, $a0, 4 +; LA64-NEXT: alsl.d $a0, $a0, $a1, 3 +; LA64-NEXT: add.d $a0, $a2, $a0 +; LA64-NEXT: addi.d $s1, $a0, 8 +; LA64-NEXT: blez $s0, .LBB0_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: move $s2, $zero +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB0_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $a0, $s1, 0 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: add.d $s2, $a0, $s2 +; LA64-NEXT: bnez $s0, .LBB0_2 +; LA64-NEXT: b .LBB0_4 +; LA64-NEXT: .LBB0_3: +; LA64-NEXT: move $s2, $zero +; LA64-NEXT: .LBB0_4: # %for.cond.cleanup +; LA64-NEXT: st.d $s2, $s1, 0 +; LA64-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 1 + %cmp4 = icmp sgt i64 %n, 0 + br i1 %cmp4, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %s.05 = phi i64 [ 0, %entry ], [ %add, %for.body ] + call void @f(ptr %a) + %0 = load i64, ptr %y + %add = add nsw i64 %0, %s.05 + %inc = add nuw nsw i64 %i.06, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %s.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ] + store i64 %s.0.lcssa, ptr %y + ret void +} + +define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_f32: +; LA32: # %bb.0: # 
%entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a1, $a0, 4 +; LA32-NEXT: alsl.w $a0, $a0, $a1, 3 +; LA32-NEXT: add.w $a0, $a4, $a0 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: addi.w $s2, $a0, 16 +; LA32-NEXT: bnez $a1, .LBB1_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: movgr2fr.w $fs0, $zero +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB1_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: fld.s $fa0, $s2, 0 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: fadd.s $fs0, $fa0, $fs0 +; LA32-NEXT: bnez $a0, .LBB1_2 +; LA32-NEXT: b .LBB1_4 +; LA32-NEXT: .LBB1_3: +; LA32-NEXT: movgr2fr.w $fs0, $zero +; LA32-NEXT: .LBB1_4: # %for.cond.cleanup +; LA32-NEXT: fst.s $fs0, $s2, 0 +; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 36 # 
4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_f32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: slli.d $a1, $a0, 4 +; LA64-NEXT: alsl.d $a0, $a0, $a1, 3 +; LA64-NEXT: add.d $a0, $a2, $a0 +; LA64-NEXT: addi.d $s1, $a0, 16 +; LA64-NEXT: blez $s0, .LBB1_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: movgr2fr.w $fs0, $zero +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB1_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: fld.s $fa0, $s1, 0 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: fadd.s $fs0, $fa0, $fs0 +; LA64-NEXT: bnez $s0, .LBB1_2 +; LA64-NEXT: b .LBB1_4 +; LA64-NEXT: .LBB1_3: +; LA64-NEXT: movgr2fr.w $fs0, $zero +; LA64-NEXT: .LBB1_4: # %for.cond.cleanup +; LA64-NEXT: fst.s $fs0, $s1, 0 +; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 2 + %cmp4 = icmp sgt i64 %n, 0 + br i1 %cmp4, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %s.05 = phi float [ 0.0, %entry ], [ %add, 
%for.body ] + call void @f(ptr %a) + %0 = load float, ptr %y + %add = fadd float %0, %s.05 + %inc = add nuw nsw i64 %i.06, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %s.0.lcssa = phi float [ 0.0, %entry ], [ %add, %for.body ] + store float %s.0.lcssa, ptr %y + ret void +} + +define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_v4i32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a0, $a0, 6 +; LA32-NEXT: add.w $a0, $a4, $a0 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: addi.w $s2, $a0, 16 +; LA32-NEXT: bnez $a1, .LBB2_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: vrepli.b $vr0, 0 +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB2_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: vld $vr0, $s2, 0 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload +; LA32-NEXT: vadd.w $vr1, 
$vr0, $vr1 +; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill +; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA32-NEXT: bnez $a0, .LBB2_2 +; LA32-NEXT: b .LBB2_4 +; LA32-NEXT: .LBB2_3: +; LA32-NEXT: vrepli.b $vr0, 0 +; LA32-NEXT: .LBB2_4: # %for.cond.cleanup +; LA32-NEXT: vst $vr0, $s2, 0 +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_v4i32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: slli.d $a0, $a0, 6 +; LA64-NEXT: add.d $a0, $a2, $a0 +; LA64-NEXT: addi.d $s1, $a0, 16 +; LA64-NEXT: blez $a1, .LBB2_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: vrepli.b $vr0, 0 +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB2_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: vld $vr0, $s1, 0 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vadd.w $vr1, $vr0, $vr1 +; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: bnez $s0, .LBB2_2 +; LA64-NEXT: b .LBB2_4 +; LA64-NEXT: .LBB2_3: +; LA64-NEXT: vrepli.b $vr0, 0 +; LA64-NEXT: .LBB2_4: # 
%for.cond.cleanup +; LA64-NEXT: vst $vr0, $s1, 0 +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 1 + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %sum.0 = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ] + call void @f(ptr %a) + %v = load <4 x i32>, ptr %y + %addv = add <4 x i32> %v, %sum.0 + %inc = add nuw nsw i64 %i.0, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.lcssa = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ] + store <4 x i32> %sum.lcssa, ptr %y + ret void +} + +define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_v16i16: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -80 +; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a0, $a0, 6 +; LA32-NEXT: add.w $a0, $a4, $a0 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: addi.w $s2, $a0, 32 +; LA32-NEXT: bnez $a1, .LBB3_3 +; LA32-NEXT: # %bb.1: # 
%for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB3_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: xvld $xr0, $s2, 0 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: xvadd.h $xr1, $xr0, $xr1 +; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: bnez $a0, .LBB3_2 +; LA32-NEXT: b .LBB3_4 +; LA32-NEXT: .LBB3_3: +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: .LBB3_4: # %for.cond.cleanup +; LA32-NEXT: xvst $xr0, $s2, 0 +; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 80 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_v16i16: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -80 +; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill +; LA64-NEXT: slli.d $a0, $a0, 6 +; LA64-NEXT: add.d $a0, $a2, $a0 +; LA64-NEXT: addi.d $s1, $a0, 32 +; LA64-NEXT: blez $a1, .LBB3_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: xvrepli.b $xr0, 0 +; 
LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB3_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: xvld $xr0, $s1, 0 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: xvadd.h $xr1, $xr0, $xr1 +; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: bnez $s0, .LBB3_2 +; LA64-NEXT: b .LBB3_4 +; LA64-NEXT: .LBB3_3: +; LA64-NEXT: xvrepli.b $xr0, 0 +; LA64-NEXT: .LBB3_4: # %for.cond.cleanup +; LA64-NEXT: xvst $xr0, $s1, 0 +; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 80 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 2 + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %sum.0 = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ] + call void @f(ptr %a) + %v = load <16 x i16>, ptr %y + %addv = add <16 x i16> %v, %sum.0 + %inc = add nuw nsw i64 %i.0, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.lcssa = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ] + store <16 x i16> %sum.lcssa, ptr %y + ret void +} + +define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_extracti8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill 
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a1, $a0, 4 +; LA32-NEXT: alsl.w $a0, $a0, $a1, 3 +; LA32-NEXT: add.w $a0, $a4, $a0 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: addi.w $s2, $a0, 16 +; LA32-NEXT: bnez $a1, .LBB4_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: vrepli.b $vr0, 0 +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB4_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: vldrepl.b $vr0, $s2, 0 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload +; LA32-NEXT: vadd.b $vr1, $vr0, $vr1 +; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill +; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA32-NEXT: bnez $a0, .LBB4_2 +; LA32-NEXT: b .LBB4_4 +; LA32-NEXT: .LBB4_3: +; LA32-NEXT: vrepli.b $vr0, 0 +; LA32-NEXT: .LBB4_4: # %for.cond.cleanup +; LA32-NEXT: vstelm.b $vr0, $s2, 0, 1 +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, 
$sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_extracti8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: slli.d $a1, $a0, 4 +; LA64-NEXT: alsl.d $a0, $a0, $a1, 3 +; LA64-NEXT: add.d $a0, $a2, $a0 +; LA64-NEXT: addi.d $s1, $a0, 16 +; LA64-NEXT: blez $s0, .LBB4_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: vrepli.b $vr0, 0 +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB4_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: vldrepl.b $vr0, $s1, 0 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vadd.b $vr1, $vr0, $vr1 +; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: bnez $s0, .LBB4_2 +; LA64-NEXT: b .LBB4_4 +; LA64-NEXT: .LBB4_3: +; LA64-NEXT: vrepli.b $vr0, 0 +; LA64-NEXT: .LBB4_4: # %for.cond.cleanup +; LA64-NEXT: vstelm.b $vr0, $s1, 0, 1 +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 2 + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.0 = phi i64 [ 0, %entry ], [ %inc, 
%for.body ] + %sum.0 = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ] + call void @f(ptr %a) + %e = load i8, ptr %y + %ins0 = insertelement <16 x i8> poison, i8 %e, i32 0 + %v = shufflevector <16 x i8> %ins0, <16 x i8> poison, <16 x i32> zeroinitializer + %addv = add <16 x i8> %v, %sum.0 + %inc = add nuw nsw i64 %i.0, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.lcssa = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ] + %res = extractelement <16 x i8> %sum.lcssa, i32 1 + store i8 %res, ptr %y + ret void +} + +define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_extractf64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -80 +; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a1, $a0, 4 +; LA32-NEXT: alsl.w $a0, $a0, $a1, 3 +; LA32-NEXT: add.w $a0, $a4, $a0 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: addi.w $s2, $a0, 8 +; LA32-NEXT: bnez $a1, .LBB5_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB5_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: move $a0, $fp +; 
LA32-NEXT: bl f +; LA32-NEXT: xvldrepl.d $xr0, $s2, 0 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: xvfadd.d $xr1, $xr0, $xr1 +; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: bnez $a0, .LBB5_2 +; LA32-NEXT: b .LBB5_4 +; LA32-NEXT: .LBB5_3: +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: .LBB5_4: # %for.cond.cleanup +; LA32-NEXT: xvstelm.d $xr0, $s2, 0, 1 +; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 80 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_extractf64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -80 +; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: slli.d $a1, $a0, 4 +; LA64-NEXT: alsl.d $a0, $a0, $a1, 3 +; LA64-NEXT: add.d $a0, $a2, $a0 +; LA64-NEXT: addi.d $s1, $a0, 8 +; LA64-NEXT: blez $s0, .LBB5_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: xvrepli.b $xr0, 0 +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB5_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: 
xvldrepl.d $xr0, $s1, 0 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: xvfadd.d $xr1, $xr0, $xr1 +; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: bnez $s0, .LBB5_2 +; LA64-NEXT: b .LBB5_4 +; LA64-NEXT: .LBB5_3: +; LA64-NEXT: xvrepli.b $xr0, 0 +; LA64-NEXT: .LBB5_4: # %for.cond.cleanup +; LA64-NEXT: xvstelm.d $xr0, $s1, 0, 1 +; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 80 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 1 + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %sum.0 = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ] + call void @f(ptr %a) + %e = load double, ptr %y + %ins0 = insertelement <4 x double> poison, double %e, i32 0 + %v = shufflevector <4 x double> %ins0, <4 x double> poison, <4 x i32> zeroinitializer + %addv = fadd <4 x double> %v, %sum.0 + %inc = add nuw nsw i64 %i.0, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.lcssa = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ] + %res = extractelement <4 x double> %sum.lcssa, i32 1 + store double %res, ptr %y + ret void +} + +declare void @f(ptr) From 02e5497f469b742a16458774c5b8d337befabc3c Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 1 Nov 2025 00:22:31 -0700 Subject: [PATCH 424/539] [CSKY] Use non-deprecated CasesLower --- llvm/lib/Target/CSKY/CSKYISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp index e5b4f6eeb7b73..ab4ee55bae75e 100644 --- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp @@ -889,7 +889,7 @@ CSKYTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, .Cases("{t9}", "{bsp}", CSKY::R25) .Case("{r26}", CSKY::R26) .Case("{r27}", CSKY::R27) - .Cases("{gb}", "{rgb}", "{rdb}", CSKY::R28) + .Cases({"{gb}", "{rgb}", "{rdb}"}, CSKY::R28) .Cases("{tb}", "{rtb}", CSKY::R29) .Case("{svbr}", CSKY::R30) .Case("{tls}", CSKY::R31) From f12d20a6d7f38331fdb74df01b053a2643575213 Mon Sep 17 00:00:00 2001 From: azwolski Date: Sat, 1 Nov 2025 08:25:50 +0100 Subject: [PATCH 425/539] [InstCombine] Baseline test exposing vector fp-to-int conversion becoming scalar (#165800) Baseline test for https://github.com/llvm/llvm-project/issues/165793 exposing the change from a vector fp-to-int conversion into a scalar one inside the loop. Additionally, removed `llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll`, which was a duplicate of `llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll` differing only by the use of poison in the insert element. The poison variant has been merged into `llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll`. Also improved test naming and updated test checks. 
--- .../vec_extract_var_elt-inseltpoison.ll | 26 ------- .../InstCombine/vec_extract_var_elt.ll | 73 ++++++++++++++++--- 2 files changed, 64 insertions(+), 35 deletions(-) delete mode 100644 llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll deleted file mode 100644 index 9fcac802378f6..0000000000000 --- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: opt < %s -passes=instcombine -S | FileCheck %s - -define void @test (float %b, ptr %p) { -; CHECK: extractelement -; CHECK: fptosi - %1 = load <8 x float> , ptr %p - %2 = bitcast <8 x float> %1 to <8 x i32> - %3 = bitcast <8 x i32> %2 to <8 x float> - %a = fptosi <8 x float> %3 to <8 x i32> - %4 = fptosi float %b to i32 - %5 = add i32 %4, -2 - %6 = extractelement <8 x i32> %a, i32 %5 - %7 = insertelement <8 x i32> poison, i32 %6, i32 7 - %8 = sitofp <8 x i32> %7 to <8 x float> - store <8 x float> %8, ptr %p - ret void -} - -; PR18600 -define i32 @test2(i32 %i) { - %e = extractelement <4 x i32> bitcast (<2 x i64> to <4 x i32>), i32 %i - ret i32 %e - -; CHECK-LABEL: @test2 -; CHECK: extractelement -} diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll index 32bf4da12c497..205b4b88c473a 100644 --- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll +++ b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll @@ -1,26 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s -define void @test (float %b, ptr %p) { -; CHECK: extractelement -; CHECK: fptosi - %1 = load <8 x float> , ptr %p +define void @test_poison(float %b, ptr %p) { +; CHECK-LABEL: define void @test_poison( +; CHECK-SAME: float [[B:%.*]], ptr 
[[P:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[P]], align 32 +; CHECK-NEXT: [[TMP2:%.*]] = fptosi float [[B]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -2 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = fptosi float [[TMP4]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 7 +; CHECK-NEXT: [[TMP7:%.*]] = sitofp <8 x i32> [[TMP6]] to <8 x float> +; CHECK-NEXT: store <8 x float> [[TMP7]], ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %1 = load <8 x float>, ptr %p %2 = bitcast <8 x float> %1 to <8 x i32> %3 = bitcast <8 x i32> %2 to <8 x float> %a = fptosi <8 x float> %3 to <8 x i32> %4 = fptosi float %b to i32 %5 = add i32 %4, -2 %6 = extractelement <8 x i32> %a, i32 %5 - %7 = insertelement <8 x i32> undef, i32 %6, i32 7 + %7 = insertelement <8 x i32> poison, i32 %6, i32 7 %8 = sitofp <8 x i32> %7 to <8 x float> store <8 x float> %8, ptr %p - ret void + ret void } ; PR18600 -define i32 @test2(i32 %i) { +define i32 @test_bitcast(i32 %i) { +; CHECK-LABEL: define i32 @test_bitcast( +; CHECK-SAME: i32 [[I:%.*]]) { +; CHECK-NEXT: [[E:%.*]] = extractelement <4 x i32> , i32 [[I]] +; CHECK-NEXT: ret i32 [[E]] +; %e = extractelement <4 x i32> bitcast (<2 x i64> to <4 x i32>), i32 %i ret i32 %e +} + +declare void @use(i32) -; CHECK-LABEL: @test2 -; CHECK: extractelement +define void @test_loop(<4 x float> %in) { +; CHECK-LABEL: define void @test_loop( +; CHECK-SAME: <4 x float> [[IN:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[R:%.*]] = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> [[IN]], i32 9) +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[LATCH:.*]] ] +; CHECK-NEXT: [[COND:%.*]] = icmp samesign ult i32 [[I]], 4 +; CHECK-NEXT: br i1 [[COND]], label %[[BODY:.*]], label %[[DONE:.*]] +; CHECK: [[BODY]]: +; CHECK-NEXT: [[TMP0:%.*]] = 
extractelement <4 x float> [[R]], i32 [[I]] +; CHECK-NEXT: [[ELEM:%.*]] = fptosi float [[TMP0]] to i32 +; CHECK-NEXT: call void @use(i32 [[ELEM]]) +; CHECK-NEXT: br label %[[LATCH]] +; CHECK: [[LATCH]]: +; CHECK-NEXT: [[NEXT]] = add nuw nsw i32 [[I]], 1 +; CHECK-NEXT: br label %[[LOOP]] +; CHECK: [[DONE]]: +; CHECK-NEXT: ret void +; +entry: + %r = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %in, i32 9) + %vi = fptosi <4 x float> %r to <4 x i32> + br label %loop +loop: + %i = phi i32 [ 0, %entry ], [ %next, %latch ] + %cond = icmp ult i32 %i, 4 + br i1 %cond, label %body, label %done +body: + %elem = extractelement <4 x i32> %vi, i32 %i + call void @use(i32 %elem) + br label %latch +latch: + %next = add i32 %i, 1 + br label %loop +done: + ret void } From d3a61ef7336b6c95f2ad7e9be08c2c1ce15a4c05 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Sat, 1 Nov 2025 01:34:58 -0700 Subject: [PATCH 426/539] Revert "[X86] narrowBitOpRMW - add handling for single bit insertion patterns (#165742)" (#165978) This reverts commit 2108c623e618265c4146c405f196953a9c157e73. #165742 blocks revert of #165540. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 43 +- llvm/test/CodeGen/X86/bittest-big-integer.ll | 937 +++++++++++++++++-- 2 files changed, 866 insertions(+), 114 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 007074c3ffc82..6f75a2eb7075a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53345,8 +53345,7 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, } // Look for a RMW operation that only touches one bit of a larger than legal -// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single -// i32 sub value. +// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value. 
static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -53372,20 +53371,14 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, // BTR: X & ~(1 << ShAmt) // BTS: X | (1 << ShAmt) // BTC: X ^ (1 << ShAmt) - // - // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) - SDValue InsertBit, ShAmt; + SDValue ShAmt; if (!StoredVal.hasOneUse() || !(sd_match(StoredVal, m_And(m_Specific(LoadVal), m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || sd_match(StoredVal, m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || sd_match(StoredVal, - m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || - sd_match(StoredVal, - m_Or(m_And(m_Specific(LoadVal), - m_Not(m_Shl(m_One(), m_Value(ShAmt)))), - m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) + m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))))) return SDValue(); // Ensure the shift amount is in bounds. @@ -53393,13 +53386,6 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) return SDValue(); - // If we're inserting a bit then it must be the LSB. - if (InsertBit) { - KnownBits KnownInsert = DAG.computeKnownBits(InsertBit); - if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1)) - return SDValue(); - } - // Split the shift into an alignment shift that moves the active i32 block to // the bottom bits for truncation and a modulo shift that can act on the i32. EVT AmtVT = ShAmt.getValueType(); @@ -53407,7 +53393,6 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, DAG.getSignedConstant(-32LL, DL, AmtVT)); SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); - ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8); // Compute the byte offset for the i32 block that is changed by the RMW. // combineTruncate will adjust the load for us in a similar way. 
@@ -53422,23 +53407,13 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); - SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32, - DAG.getConstant(1, DL, MVT::i32), ModuloAmt); - - SDValue Res; - if (InsertBit) { - SDValue BitMask = - DAG.getNode(ISD::SHL, DL, MVT::i32, - DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt); - Res = - DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32)); - Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask); - } else { - if (StoredVal.getOpcode() == ISD::AND) - Mask = DAG.getNOT(DL, Mask, MVT::i32); - Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); - } + SDValue Mask = + DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), + DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); + if (StoredVal.getOpcode() == ISD::AND) + Mask = DAG.getNOT(DL, Mask, MVT::i32); + SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), Align(), St->getMemOperand()->getFlags()); } diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index dffe9005094ab..06e7d4773c58d 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -356,20 +356,41 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl $32, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%edx,%esi), %edi -; X86-NEXT: btl %ecx, %edi +; X86-NEXT: movl $1, %edx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl 
%edi, %edi +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: testb $32, %cl +; X86-NEXT: je .LBB9_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl $0, %edx +; X86-NEXT: .LBB9_2: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: notl %esi +; X86-NEXT: notl %edx +; X86-NEXT: je .LBB9_4 +; X86-NEXT: # %bb.3: +; X86-NEXT: movl %eax, %edi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: .LBB9_4: +; X86-NEXT: andl 4(%ebx), %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: andl (%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $32, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl (%ebx,%eax), %eax +; X86-NEXT: btl %ecx, %eax ; X86-NEXT: setae %al -; X86-NEXT: btrl %ecx, %edi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, (%edx,%esi) +; X86-NEXT: movl %esi, 4(%ebx) +; X86-NEXT: movl %edx, (%ebx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -579,55 +600,201 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl $96, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%edx,%esi), %edi -; X86-NEXT: btl %ecx, %edi -; X86-NEXT: setae %al -; X86-NEXT: btrl %ecx, %edi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $96, %esp +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movzbl 16(%ebp), %ebx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %edi +; X86-NEXT: movl 72(%esp,%edi), %edx +; X86-NEXT: movl 76(%esp,%edi), %esi +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 64(%esp,%edi), %ebx +; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: movl 68(%esp,%edi), %ebx +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %ebx ; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, (%edx,%esi) +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: notl %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl 40(%esp,%eax), %edi +; X86-NEXT: movl 44(%esp,%eax), %esi +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 12(%ecx), %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: notl 
%eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl 36(%esp,%esi), %esi +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: andl 8(%edx), %eax +; X86-NEXT: orl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: notl %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl 32(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl 8(%ebp), %edi +; X86-NEXT: andl 4(%edi), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %eax +; X86-NEXT: andl (%edi), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $96, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl (%edi,%eax), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 12(%edi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 8(%edi) +; X86-NEXT: movl %ebx, 4(%edi) +; X86-NEXT: movl %edx, (%edi) +; X86-NEXT: setae %al +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $96, %esi -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: movl (%rdi,%rsi), %r8d -; SSE-NEXT: btl %ecx, %r8d +; SSE-NEXT: movl $1, %esi +; SSE-NEXT: xorl %r8d, %r8d +; SSE-NEXT: shldq %cl, %rsi, %r8 +; SSE-NEXT: shlq %cl, %rsi +; SSE-NEXT: movl %edx, %eax +; SSE-NEXT: xorl %edx, %edx +; SSE-NEXT: shldq %cl, %rax, %rdx +; SSE-NEXT: shlq %cl, %rax +; SSE-NEXT: xorl %r9d, %r9d +; SSE-NEXT: testb $64, %cl +; SSE-NEXT: cmovneq %rsi, %r8 +; SSE-NEXT: cmovneq %r9, %rsi +; SSE-NEXT: notq %r8 +; SSE-NEXT: cmovneq %rax, %rdx 
+; SSE-NEXT: cmovneq %r9, %rax +; SSE-NEXT: notq %rsi +; SSE-NEXT: andq 8(%rdi), %r8 +; SSE-NEXT: orq %rdx, %r8 +; SSE-NEXT: andq (%rdi), %rsi +; SSE-NEXT: orq %rax, %rsi +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: andl $96, %eax +; SSE-NEXT: shrl $3, %eax +; SSE-NEXT: movl (%rdi,%rax), %eax +; SSE-NEXT: btl %ecx, %eax ; SSE-NEXT: setae %al -; SSE-NEXT: shll %cl, %edx -; SSE-NEXT: btrl %ecx, %r8d -; SSE-NEXT: orl %r8d, %edx -; SSE-NEXT: movl %edx, (%rdi,%rsi) +; SSE-NEXT: movq %r8, 8(%rdi) +; SSE-NEXT: movq %rsi, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: init_eq_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: andl $96, %ecx -; AVX-NEXT: shrl $3, %ecx -; AVX-NEXT: movl (%rdi,%rcx), %r8d -; AVX-NEXT: btl %esi, %r8d -; AVX-NEXT: setae %al -; AVX-NEXT: btrl %esi, %r8d -; AVX-NEXT: shlxl %esi, %edx, %edx -; AVX-NEXT: orl %r8d, %edx -; AVX-NEXT: movl %edx, (%rdi,%rcx) -; AVX-NEXT: retq +; AVX2-LABEL: init_eq_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: movl $1, %eax +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shldq %cl, %rax, %rsi +; AVX2-NEXT: movl %edx, %edx +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: shldq %cl, %rdx, %r8 +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: shlxq %rcx, %rax, %rax +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %rax, %rsi +; AVX2-NEXT: cmovneq %r9, %rax +; AVX2-NEXT: shlxq %rcx, %rdx, %rdx +; AVX2-NEXT: cmovneq %rdx, %r8 +; AVX2-NEXT: cmovneq %r9, %rdx +; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi +; AVX2-NEXT: orq %r8, %rsi +; AVX2-NEXT: andnq (%rdi), %rax, %r8 +; AVX2-NEXT: orq %rdx, %r8 +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $96, %eax +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: movl (%rdi,%rax), %eax +; AVX2-NEXT: btl %ecx, %eax +; AVX2-NEXT: setae %al +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: movq %r8, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: init_eq_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: movl $1, %eax +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: shldq 
%cl, %rax, %rsi +; AVX512-NEXT: xorl %r8d, %r8d +; AVX512-NEXT: shlxq %rcx, %rax, %rax +; AVX512-NEXT: movl %edx, %edx +; AVX512-NEXT: xorl %r9d, %r9d +; AVX512-NEXT: shldq %cl, %rdx, %r9 +; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %rax, %rsi +; AVX512-NEXT: cmovneq %r8, %rax +; AVX512-NEXT: shlxq %rcx, %rdx, %rdx +; AVX512-NEXT: cmovneq %rdx, %r9 +; AVX512-NEXT: cmovneq %r8, %rdx +; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi +; AVX512-NEXT: orq %r9, %rsi +; AVX512-NEXT: andnq (%rdi), %rax, %r8 +; AVX512-NEXT: orq %rdx, %r8 +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: andl $96, %eax +; AVX512-NEXT: shrl $3, %eax +; AVX512-NEXT: movl (%rdi,%rax), %eax +; AVX512-NEXT: btl %ecx, %eax +; AVX512-NEXT: setae %al +; AVX512-NEXT: movq %rsi, 8(%rdi) +; AVX512-NEXT: movq %r8, (%rdi) +; AVX512-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -803,55 +970,665 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i512: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: andl $60, %esi -; X86-NEXT: movl (%edx,%esi), %edi -; X86-NEXT: btl %ecx, %edi -; X86-NEXT: setae %al -; X86-NEXT: btrl %ecx, %edi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %ebx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $352, %esp # imm = 0x160 +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $60, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl %edx, %eax +; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 56(%eax), %esi +; X86-NEXT: movl 60(%eax), %ebx +; X86-NEXT: movl 52(%eax), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%eax), %edi +; X86-NEXT: movl 44(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; X86-NEXT: movl 20(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 4(%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl 16(%ebp), %eax +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: andl $31, %ecx +; X86-NEXT: shldl %cl, %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %eax 
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 56(%eax), %esi +; X86-NEXT: movl 60(%eax), %edi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: andl 60(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 52(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 56(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 48(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 52(%edx), %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, (%edx,%esi) +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 44(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 48(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 40(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 44(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 36(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 40(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 32(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 36(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 28(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 32(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 24(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 28(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 20(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 24(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 16(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 20(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 12(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 16(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 
4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 8(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 12(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 4(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 8(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: notl %esi +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: andl 4(%edx), %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: notl %esi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: andl (%edx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl (%edx,%eax), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 60(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 56(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 52(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 48(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 44(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 40(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 36(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 32(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 28(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 24(%edx) +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 16(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 12(%edx) +; X86-NEXT: movl %ebx, 8(%edx) +; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %esi, (%edx) +; X86-NEXT: setae %al +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i512: ; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $184, %rsp +; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $60, %esi -; SSE-NEXT: movl (%rdi,%rsi), %r8d -; SSE-NEXT: btl %ecx, %r8d +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: movl %esi, %eax +; SSE-NEXT: shrl $3, %eax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: andl $56, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: movslq %eax, %r12 +; SSE-NEXT: movq 160(%rsp,%r12), %rax +; SSE-NEXT: movq 168(%rsp,%r12), %r10 +; SSE-NEXT: shldq %cl, %rax, %r10 +; SSE-NEXT: movq 152(%rsp,%r12), %rsi +; SSE-NEXT: shldq %cl, %rsi, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 144(%rsp,%r12), %r11 +; SSE-NEXT: shldq %cl, %r11, %rsi +; 
SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 136(%rsp,%r12), %rbx +; SSE-NEXT: shldq %cl, %rbx, %r11 +; SSE-NEXT: movq 128(%rsp,%r12), %r14 +; SSE-NEXT: shldq %cl, %r14, %rbx +; SSE-NEXT: movq 120(%rsp,%r12), %r15 +; SSE-NEXT: shldq %cl, %r15, %r14 +; SSE-NEXT: movq 112(%rsp,%r12), %r13 +; SSE-NEXT: shldq %cl, %r13, %r15 +; SSE-NEXT: shlq %cl, %r13 +; SSE-NEXT: movl %edx, %eax +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq 32(%rsp,%r12), %rax +; SSE-NEXT: movq 40(%rsp,%r12), %rdx +; SSE-NEXT: shldq %cl, %rax, %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 24(%rsp,%r12), %rdx +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq 16(%rsp,%r12), %rsi +; SSE-NEXT: shldq %cl, %rsi, %rdx +; SSE-NEXT: movq 8(%rsp,%r12), %r8 +; SSE-NEXT: shldq %cl, %r8, %rsi +; SSE-NEXT: movq (%rsp,%r12), %rbp +; SSE-NEXT: shldq %cl, %rbp, %r8 +; SSE-NEXT: movq -8(%rsp,%r12), %r9 +; SSE-NEXT: shldq %cl, %r9, %rbp +; SSE-NEXT: notq %r10 +; SSE-NEXT: andq 56(%rdi), %r10 +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload +; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE-NEXT: notq %r10 +; SSE-NEXT: andq 48(%rdi), %r10 +; SSE-NEXT: orq %rax, %r10 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: notq %rax +; SSE-NEXT: andq 40(%rdi), %rax +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: notq %r11 +; SSE-NEXT: andq 32(%rdi), %r11 +; SSE-NEXT: orq %rsi, %r11 +; SSE-NEXT: notq %rbx +; SSE-NEXT: andq 24(%rdi), %rbx 
+; SSE-NEXT: orq %r8, %rbx +; SSE-NEXT: notq %r14 +; SSE-NEXT: andq 16(%rdi), %r14 +; SSE-NEXT: orq %rbp, %r14 +; SSE-NEXT: notq %r15 +; SSE-NEXT: movq -16(%rsp,%r12), %rax +; SSE-NEXT: shldq %cl, %rax, %r9 +; SSE-NEXT: andq 8(%rdi), %r15 +; SSE-NEXT: orq %r9, %r15 +; SSE-NEXT: notq %r13 +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shlq %cl, %rax +; SSE-NEXT: andq (%rdi), %r13 +; SSE-NEXT: orq %rax, %r13 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andl $60, %eax +; SSE-NEXT: movl (%rdi,%rax), %eax +; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; SSE-NEXT: btl %ecx, %eax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: movq %rax, 56(%rdi) +; SSE-NEXT: movq %r10, 48(%rdi) +; SSE-NEXT: movq %rdx, 40(%rdi) +; SSE-NEXT: movq %r11, 32(%rdi) +; SSE-NEXT: movq %rbx, 24(%rdi) +; SSE-NEXT: movq %r14, 16(%rdi) +; SSE-NEXT: movq %r15, 8(%rdi) +; SSE-NEXT: movq %r13, (%rdi) ; SSE-NEXT: setae %al -; SSE-NEXT: shll %cl, %edx -; SSE-NEXT: btrl %ecx, %r8d -; SSE-NEXT: orl %r8d, %edx -; SSE-NEXT: movl %edx, (%rdi,%rsi) +; SSE-NEXT: addq $184, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; -; AVX-LABEL: init_eq_i512: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: shrl $3, %ecx -; AVX-NEXT: andl $60, %ecx -; AVX-NEXT: movl (%rdi,%rcx), %r8d -; AVX-NEXT: btl %esi, %r8d -; AVX-NEXT: setae %al -; AVX-NEXT: btrl %esi, %r8d -; AVX-NEXT: shlxl %esi, %edx, %edx -; AVX-NEXT: orl %r8d, %edx -; AVX-NEXT: movl %edx, (%rdi,%rcx) -; AVX-NEXT: retq +; AVX2-LABEL: init_eq_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $168, %rsp +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: 
vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: movl %esi, %r11d +; AVX2-NEXT: shrl $3, %r11d +; AVX2-NEXT: movl %r11d, %eax +; AVX2-NEXT: andl $56, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: movslq %eax, %r10 +; AVX2-NEXT: movq 104(%rsp,%r10), %r15 +; AVX2-NEXT: movq 112(%rsp,%r10), %rax +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shldq %cl, %r15, %rsi +; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 120(%rsp,%r10), %rsi +; AVX2-NEXT: movq %rsi, %r8 +; AVX2-NEXT: shldq %cl, %rax, %r8 +; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 128(%rsp,%r10), %rax +; AVX2-NEXT: movq %rax, %rbx +; AVX2-NEXT: shldq %cl, %rsi, %rbx +; AVX2-NEXT: movq 136(%rsp,%r10), %rsi +; AVX2-NEXT: movq %rsi, %r14 +; AVX2-NEXT: shldq %cl, %rax, %r14 +; AVX2-NEXT: movq 144(%rsp,%r10), %rax +; AVX2-NEXT: movq %rax, %r12 +; AVX2-NEXT: shldq %cl, %rsi, %r12 +; AVX2-NEXT: movq 96(%rsp,%r10), %rsi +; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 152(%rsp,%r10), %r13 +; AVX2-NEXT: shldq %cl, %rax, %r13 +; AVX2-NEXT: shldq %cl, %rsi, %r15 +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movq 16(%rsp,%r10), %rbp +; AVX2-NEXT: movq 24(%rsp,%r10), %r9 +; AVX2-NEXT: shldq %cl, %rbp, %r9 +; AVX2-NEXT: movq 8(%rsp,%r10), %rdx +; AVX2-NEXT: shldq %cl, %rdx, %rbp +; AVX2-NEXT: movq (%rsp,%r10), %rax +; AVX2-NEXT: shldq %cl, %rax, %rdx +; AVX2-NEXT: 
movq -8(%rsp,%r10), %r8 +; AVX2-NEXT: shldq %cl, %r8, %rax +; AVX2-NEXT: movq -16(%rsp,%r10), %rsi +; AVX2-NEXT: shldq %cl, %rsi, %r8 +; AVX2-NEXT: andnq 56(%rdi), %r13, %r13 +; AVX2-NEXT: orq %r9, %r13 +; AVX2-NEXT: movq -24(%rsp,%r10), %r9 +; AVX2-NEXT: shldq %cl, %r9, %rsi +; AVX2-NEXT: andnq 48(%rdi), %r12, %r12 +; AVX2-NEXT: andnq 40(%rdi), %r14, %r14 +; AVX2-NEXT: orq %rbp, %r12 +; AVX2-NEXT: orq %rdx, %r14 +; AVX2-NEXT: andnq 32(%rdi), %rbx, %rdx +; AVX2-NEXT: orq %rax, %rdx +; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX2-NEXT: movq -32(%rsp,%r10), %r10 +; AVX2-NEXT: shlxq %rcx, %r10, %rbx +; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX2-NEXT: shldq %cl, %r10, %r9 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: andnq 24(%rdi), %rcx, %rcx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: andnq 16(%rdi), %r10, %r10 +; AVX2-NEXT: orq %r8, %rcx +; AVX2-NEXT: orq %rsi, %r10 +; AVX2-NEXT: andnq 8(%rdi), %r15, %rsi +; AVX2-NEXT: orq %r9, %rsi +; AVX2-NEXT: andnq (%rdi), %rax, %rax +; AVX2-NEXT: orq %rbx, %rax +; AVX2-NEXT: andl $60, %r11d +; AVX2-NEXT: movl (%rdi,%r11), %r8d +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload +; AVX2-NEXT: btl %r9d, %r8d +; AVX2-NEXT: movq %r13, 56(%rdi) +; AVX2-NEXT: movq %r12, 48(%rdi) +; AVX2-NEXT: movq %r14, 40(%rdi) +; AVX2-NEXT: movq %rdx, 32(%rdi) +; AVX2-NEXT: movq %rcx, 24(%rdi) +; AVX2-NEXT: movq %r10, 16(%rdi) +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: movq %rax, (%rdi) +; AVX2-NEXT: setae %al +; AVX2-NEXT: addq $168, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: init_eq_i512: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq 
%r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $152, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0] +; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: andl $63, %ecx +; AVX512-NEXT: movl %esi, %r8d +; AVX512-NEXT: shrl $3, %r8d +; AVX512-NEXT: movl %r8d, %eax +; AVX512-NEXT: andl $56, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: movslq %eax, %r9 +; AVX512-NEXT: movq 88(%rsp,%r9), %r10 +; AVX512-NEXT: movq 96(%rsp,%r9), %rax +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: shldq %cl, %r10, %rsi +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 104(%rsp,%r9), %rsi +; AVX512-NEXT: movq %rsi, %r11 +; AVX512-NEXT: shldq %cl, %rax, %r11 +; AVX512-NEXT: movq 112(%rsp,%r9), %rax +; AVX512-NEXT: movq %rax, %rbx +; AVX512-NEXT: shldq %cl, %rsi, %rbx +; AVX512-NEXT: movq 120(%rsp,%r9), %rsi +; AVX512-NEXT: movq %rsi, %r14 +; AVX512-NEXT: shldq %cl, %rax, %r14 +; AVX512-NEXT: movq 128(%rsp,%r9), %rax +; AVX512-NEXT: movq %rax, %r12 +; AVX512-NEXT: shldq %cl, %rsi, %r12 +; AVX512-NEXT: movq 136(%rsp,%r9), %r13 +; AVX512-NEXT: shldq %cl, %rax, %r13 +; AVX512-NEXT: movq 80(%rsp,%r9), %r15 +; AVX512-NEXT: shldq %cl, %r15, %r10 +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movq (%rsp,%r9), %rbp +; AVX512-NEXT: movq 8(%rsp,%r9), %rsi +; AVX512-NEXT: shldq %cl, %rbp, %rsi +; AVX512-NEXT: movq -8(%rsp,%r9), %rdx +; 
AVX512-NEXT: shldq %cl, %rdx, %rbp +; AVX512-NEXT: movq -16(%rsp,%r9), %rax +; AVX512-NEXT: shldq %cl, %rax, %rdx +; AVX512-NEXT: andnq 56(%rdi), %r13, %r13 +; AVX512-NEXT: andnq 48(%rdi), %r12, %r12 +; AVX512-NEXT: orq %rsi, %r13 +; AVX512-NEXT: orq %rbp, %r12 +; AVX512-NEXT: andnq 40(%rdi), %r14, %r14 +; AVX512-NEXT: orq %rdx, %r14 +; AVX512-NEXT: movq -24(%rsp,%r9), %rsi +; AVX512-NEXT: shldq %cl, %rsi, %rax +; AVX512-NEXT: andnq 32(%rdi), %rbx, %rdx +; AVX512-NEXT: orq %rax, %rdx +; AVX512-NEXT: movq -32(%rsp,%r9), %rax +; AVX512-NEXT: shldq %cl, %rax, %rsi +; AVX512-NEXT: shlxq %rcx, %r15, %rbx +; AVX512-NEXT: andnq 24(%rdi), %r11, %r11 +; AVX512-NEXT: orq %rsi, %r11 +; AVX512-NEXT: movq -48(%rsp,%r9), %rsi +; AVX512-NEXT: movq -40(%rsp,%r9), %r9 +; AVX512-NEXT: shldq %cl, %r9, %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: andnq 16(%rdi), %r15, %r15 +; AVX512-NEXT: orq %rax, %r15 +; AVX512-NEXT: shlxq %rcx, %rsi, %rax +; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX512-NEXT: shldq %cl, %rsi, %r9 +; AVX512-NEXT: andnq 8(%rdi), %r10, %rcx +; AVX512-NEXT: orq %r9, %rcx +; AVX512-NEXT: andnq (%rdi), %rbx, %rsi +; AVX512-NEXT: orq %rax, %rsi +; AVX512-NEXT: andl $60, %r8d +; AVX512-NEXT: movl (%rdi,%r8), %eax +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload +; AVX512-NEXT: btl %r8d, %eax +; AVX512-NEXT: movq %r13, 56(%rdi) +; AVX512-NEXT: movq %r12, 48(%rdi) +; AVX512-NEXT: movq %r14, 40(%rdi) +; AVX512-NEXT: movq %rdx, 32(%rdi) +; AVX512-NEXT: movq %r11, 24(%rdi) +; AVX512-NEXT: movq %r15, 16(%rdi) +; AVX512-NEXT: movq %rcx, 8(%rdi) +; AVX512-NEXT: movq %rsi, (%rdi) +; AVX512-NEXT: setae %al +; AVX512-NEXT: addq $152, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit 
= shl nuw i512 1, %ofs From be64cdea182ccae7ce2b093c9536a626858fcc40 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Sat, 1 Nov 2025 01:38:00 -0700 Subject: [PATCH 427/539] Revert "[X86] Narrow BT/BTC/BTR/BTS compare + RMW patterns on very large integers (#165540)" (#165979) This reverts commit a55a7207c7e4d98dad32e8d53dd5964ee833edd9. See breaks i386 on bot and Rust, see #165540. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 114 +- llvm/test/CodeGen/X86/bittest-big-integer.ll | 7325 +++++++++++++++--- 2 files changed, 6333 insertions(+), 1106 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6f75a2eb7075a..c5fb5535d0057 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53344,80 +53344,6 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, return SDValue(); } -// Look for a RMW operation that only touches one bit of a larger than legal -// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value. -static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, - SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - using namespace SDPatternMatch; - - // Only handle normal stores and its chain was a matching normal load. - auto *Ld = dyn_cast(St->getChain()); - if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld || - !ISD::isNormalLoad(Ld) || !Ld->isSimple() || - Ld->getBasePtr() != St->getBasePtr() || - Ld->getOffset() != St->getOffset()) - return SDValue(); - - SDValue LoadVal(Ld, 0); - SDValue StoredVal = St->getValue(); - EVT VT = StoredVal.getValueType(); - - // Only narrow larger than legal scalar integers. - if (!VT.isScalarInteger() || - VT.getSizeInBits() <= (Subtarget.is64Bit() ? 
64 : 32)) - return SDValue(); - - // BTR: X & ~(1 << ShAmt) - // BTS: X | (1 << ShAmt) - // BTC: X ^ (1 << ShAmt) - SDValue ShAmt; - if (!StoredVal.hasOneUse() || - !(sd_match(StoredVal, m_And(m_Specific(LoadVal), - m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || - sd_match(StoredVal, - m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || - sd_match(StoredVal, - m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))))) - return SDValue(); - - // Ensure the shift amount is in bounds. - KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); - if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) - return SDValue(); - - // Split the shift into an alignment shift that moves the active i32 block to - // the bottom bits for truncation and a modulo shift that can act on the i32. - EVT AmtVT = ShAmt.getValueType(); - SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, - DAG.getSignedConstant(-32LL, DL, AmtVT)); - SDValue ModuloAmt = - DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); - - // Compute the byte offset for the i32 block that is changed by the RMW. - // combineTruncate will adjust the load for us in a similar way. - EVT PtrVT = St->getBasePtr().getValueType(); - SDValue PtrBitOfs = DAG.getZExtOrTrunc(AlignAmt, DL, PtrVT); - SDValue PtrByteOfs = DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs, - DAG.getShiftAmountConstant(3, PtrVT, DL)); - SDValue NewPtr = DAG.getMemBasePlusOffset(St->getBasePtr(), PtrByteOfs, DL, - SDNodeFlags::NoUnsignedWrap); - - // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store. 
- SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); - X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); - - SDValue Mask = - DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), - DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); - if (StoredVal.getOpcode() == ISD::AND) - Mask = DAG.getNOT(DL, Mask, MVT::i32); - - SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); - return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), - Align(), St->getMemOperand()->getFlags()); -} - static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -53644,9 +53570,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } } - if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget)) - return R; - // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC) // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC) if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && @@ -54579,9 +54502,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, // truncation, see if we can convert the shift into a pointer offset instead. // Limit this to normal (non-ext) scalar integer loads. 
if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL && - Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) && - (Src.getOperand(0).hasOneUse() || - !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, SrcVT))) { + Src.hasOneUse() && Src.getOperand(0).hasOneUse() && + ISD::isNormalLoad(Src.getOperand(0).getNode())) { auto *Ld = cast(Src.getOperand(0)); if (Ld->isSimple() && VT.isByteSized() && isPowerOf2_64(VT.getSizeInBits())) { @@ -56381,7 +56303,6 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - using namespace SDPatternMatch; const ISD::CondCode CC = cast(N->getOperand(2))->get(); const SDValue LHS = N->getOperand(0); const SDValue RHS = N->getOperand(1); @@ -56440,37 +56361,6 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, if (SDValue AndN = MatchAndCmpEq(RHS, LHS)) return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC); - // If we're performing a bit test on a larger than legal type, attempt - // to (aligned) shift down the value to the bottom 32-bits and then - // perform the bittest on the i32 value. - // ICMP_ZERO(AND(X,SHL(1,IDX))) - // --> ICMP_ZERO(AND(TRUNC(SRL(X,AND(IDX,-32))),SHL(1,AND(IDX,31)))) - if (isNullConstant(RHS) && - OpVT.getScalarSizeInBits() > (Subtarget.is64Bit() ? 64 : 32)) { - SDValue X, ShAmt; - if (sd_match(LHS, m_OneUse(m_And(m_Value(X), - m_Shl(m_One(), m_Value(ShAmt)))))) { - // Only attempt this if the shift amount is known to be in bounds. 
- KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); - if (KnownAmt.getMaxValue().ult(OpVT.getScalarSizeInBits())) { - EVT AmtVT = ShAmt.getValueType(); - SDValue AlignAmt = - DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, - DAG.getSignedConstant(-32LL, DL, AmtVT)); - SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, - DAG.getConstant(31, DL, AmtVT)); - SDValue Mask = DAG.getNode( - ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), - DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); - X = DAG.getNode(ISD::SRL, DL, OpVT, X, AlignAmt); - X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); - X = DAG.getNode(ISD::AND, DL, MVT::i32, X, Mask); - return DAG.getSetCC(DL, VT, X, DAG.getConstant(0, DL, MVT::i32), - CC); - } - } - } - // cmpeq(trunc(x),C) --> cmpeq(x,C) // cmpne(trunc(x),C) --> cmpne(x,C) // iff x upper bits are zero. diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 06e7d4773c58d..8007d9dcf13bc 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -203,14 +203,24 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i64: ; X86: # %bb.0: +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: andl $32, %edx -; X86-NEXT: shrl $3, %edx -; X86-NEXT: movl (%eax,%edx), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: setb %al +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %edx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: shll %cl, %edx +; X86-NEXT: testb $32, %cl +; X86-NEXT: je .LBB5_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %edx, %esi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: .LBB5_2: +; X86-NEXT: andl 4(%eax), %esi +; X86-NEXT: andl (%eax), %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: setne %al +; 
X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: test_ne_i64: @@ -232,20 +242,38 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i64: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $32, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setb %al -; X86-NEXT: btcl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: testb $32, %cl +; X86-NEXT: je .LBB6_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: .LBB6_2: +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: movl 4(%edx), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: andl %esi, %ebx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: andl %eax, %ebp +; X86-NEXT: xorl %esi, %edi +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: setne %al +; X86-NEXT: movl %ecx, (%edx) +; X86-NEXT: movl %edi, 4(%edx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: complement_ne_i64: @@ -272,20 +300,40 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i64: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $32, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; 
X86-NEXT: setae %al -; X86-NEXT: btrl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %esi +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: shll %cl, %esi +; X86-NEXT: testb $32, %cl +; X86-NEXT: je .LBB7_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %esi, %edi +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: .LBB7_2: +; X86-NEXT: movl (%edx), %eax +; X86-NEXT: movl 4(%edx), %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: andl %edi, %ebx +; X86-NEXT: notl %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: andl %esi, %ebp +; X86-NEXT: notl %esi +; X86-NEXT: andl %ecx, %edi +; X86-NEXT: andl %eax, %esi +; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: sete %al +; X86-NEXT: movl %esi, (%edx) +; X86-NEXT: movl %edi, 4(%edx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: reset_eq_i64: @@ -313,20 +361,38 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i64: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $32, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setb %al -; X86-NEXT: btsl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: testb $32, %cl +; X86-NEXT: je .LBB8_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: .LBB8_2: +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: movl 4(%edx), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: andl %esi, %ebx +; X86-NEXT: movl %ecx, %ebp +; 
X86-NEXT: andl %eax, %ebp +; X86-NEXT: orl %esi, %edi +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: setne %al +; X86-NEXT: movl %ecx, (%edx) +; X86-NEXT: movl %edi, 4(%edx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: set_ne_i64: @@ -353,47 +419,52 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i64: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: shldl %cl, %eax, %edx ; X86-NEXT: shll %cl, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: shll %cl, %esi ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB9_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl $0, %edx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl $0, %eax ; X86-NEXT: .LBB9_2: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: notl %esi -; X86-NEXT: notl %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: notl %ebp ; X86-NEXT: je .LBB9_4 ; X86-NEXT: # %bb.3: -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl %esi, %edi +; X86-NEXT: xorl %esi, %esi ; X86-NEXT: .LBB9_4: -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: andl (%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl $32, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl (%ebx,%eax), 
%eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: setae %al -; X86-NEXT: movl %esi, 4(%ebx) -; X86-NEXT: movl %edx, (%ebx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 4(%ecx), %ecx +; X86-NEXT: andl %ecx, %edx +; X86-NEXT: andl %ecx, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl (%edi), %ecx +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: andl %ecx, %ebp +; X86-NEXT: orl %esi, %ebp +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %ebp, (%edi) +; X86-NEXT: movl %ebx, 4(%edi) +; X86-NEXT: sete %al ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i64: @@ -445,25 +516,101 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i128: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: andl $96, %edx -; X86-NEXT: shrl $3, %edx -; X86-NEXT: movl (%eax,%edx), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: setb %al +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $48, %esp +; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, (%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %esi +; X86-NEXT: movl 24(%esp,%esi), %edi +; X86-NEXT: movl 28(%esp,%esi), %eax +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl 16(%esp,%esi), %edx +; X86-NEXT: movl 20(%esp,%esi), %esi +; X86-NEXT: shldl 
%cl, %esi, %edi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: andl 8(%ebx), %edi +; X86-NEXT: andl (%ebx), %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: andl 12(%ebx), %eax +; X86-NEXT: andl 4(%ebx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: setne %al +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; X64-LABEL: test_ne_i128: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: andl $96, %eax -; X64-NEXT: shrl $3, %eax -; X64-NEXT: movl (%rdi,%rax), %eax -; X64-NEXT: btl %esi, %eax -; X64-NEXT: setb %al -; X64-NEXT: retq +; SSE-LABEL: test_ne_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %eax +; SSE-NEXT: xorl %edx, %edx +; SSE-NEXT: shldq %cl, %rax, %rdx +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: shlq %cl, %rax +; SSE-NEXT: testb $64, %cl +; SSE-NEXT: cmovneq %rax, %rdx +; SSE-NEXT: cmovneq %rsi, %rax +; SSE-NEXT: andq 8(%rdi), %rdx +; SSE-NEXT: andq (%rdi), %rax +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: setne %al +; SSE-NEXT: retq +; +; AVX2-LABEL: test_ne_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: movl $1, %edx +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shldq %cl, %rdx, %rsi +; AVX2-NEXT: shlxq %rcx, %rdx, %rdx +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %rdx, %rsi +; AVX2-NEXT: cmovneq %rax, %rdx +; AVX2-NEXT: andq 8(%rdi), %rsi +; AVX2-NEXT: andq (%rdi), %rdx +; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: setne %al +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ne_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: movl $1, %eax +; AVX512-NEXT: xorl %edx, %edx +; AVX512-NEXT: shldq %cl, %rax, %rdx +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: shlxq %rcx, %rax, %rax +; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %rax, %rdx +; AVX512-NEXT: cmovneq %rsi, 
%rax +; AVX512-NEXT: andq 8(%rdi), %rdx +; AVX512-NEXT: andq (%rdi), %rax +; AVX512-NEXT: orq %rdx, %rax +; AVX512-NEXT: setne %al +; AVX512-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -476,33 +623,124 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $96, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setb %al -; X86-NEXT: btcl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $80, %esp +; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 56(%esp,%eax), %esi +; X86-NEXT: movl 60(%esp,%eax), %edx +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esp,%eax), %edi +; X86-NEXT: movl 52(%esp,%eax), %ebx +; X86-NEXT: shldl %cl, %ebx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl 8(%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
X86-NEXT: andl %esi, %eax +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: andl %edi, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl 12(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 4(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: setne %al +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; X64-LABEL: complement_ne_i128: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: andl $96, %ecx -; X64-NEXT: shrl $3, %ecx -; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setb %al -; X64-NEXT: btcl %esi, %edx -; X64-NEXT: movl %edx, (%rdi,%rcx) -; X64-NEXT: retq +; SSE-LABEL: complement_ne_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %edx +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: shldq %cl, %rdx, %rsi +; SSE-NEXT: shlq %cl, %rdx +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: testb $64, %cl +; SSE-NEXT: cmovneq %rdx, %rsi +; SSE-NEXT: cmovneq %rax, %rdx +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: 
movq 8(%rdi), %rcx +; SSE-NEXT: movq %rcx, %r8 +; SSE-NEXT: andq %rsi, %r8 +; SSE-NEXT: movq %rax, %r9 +; SSE-NEXT: andq %rdx, %r9 +; SSE-NEXT: xorq %rcx, %rsi +; SSE-NEXT: xorq %rax, %rdx +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: setne %al +; SSE-NEXT: movq %rdx, (%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: retq +; +; AVX-LABEL: complement_ne_i128: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movl $1, %edx +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: shldq %cl, %rdx, %rsi +; AVX-NEXT: shlxq %rcx, %rdx, %rdx +; AVX-NEXT: testb $64, %cl +; AVX-NEXT: cmovneq %rdx, %rsi +; AVX-NEXT: cmovneq %rax, %rdx +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: movq 8(%rdi), %rcx +; AVX-NEXT: movq %rcx, %r8 +; AVX-NEXT: andq %rsi, %r8 +; AVX-NEXT: movq %rax, %r9 +; AVX-NEXT: andq %rdx, %r9 +; AVX-NEXT: xorq %rcx, %rsi +; AVX-NEXT: xorq %rax, %rdx +; AVX-NEXT: orq %r8, %r9 +; AVX-NEXT: setne %al +; AVX-NEXT: movq %rdx, (%rdi) +; AVX-NEXT: movq %rsi, 8(%rdi) +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -517,33 +755,124 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $96, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setae %al -; X86-NEXT: btrl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $80, %esp +; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; 
X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 56(%esp,%eax), %edx +; X86-NEXT: movl 60(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esp,%eax), %esi +; X86-NEXT: movl 52(%esp,%eax), %edi +; X86-NEXT: shldl %cl, %edi, %edx +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: shll %cl, %esi +; X86-NEXT: movl 8(%ebx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %eax +; X86-NEXT: movl (%ebx), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%ebx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl 4(%ebx), %ebx +; X86-NEXT: andl %ebx, %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: notl %ecx +; X86-NEXT: andl %ebx, %ecx +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl 8(%ebp), %edi +; X86-NEXT: movl %edx, 8(%edi) +; X86-NEXT: movl %eax, 12(%edi) +; X86-NEXT: movl %esi, (%edi) +; X86-NEXT: movl %ecx, 4(%edi) +; X86-NEXT: sete %al +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: 
popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; X64-LABEL: reset_eq_i128: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: andl $96, %ecx -; X64-NEXT: shrl $3, %ecx -; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setae %al -; X64-NEXT: btrl %esi, %edx -; X64-NEXT: movl %edx, (%rdi,%rcx) -; X64-NEXT: retq +; SSE-LABEL: reset_eq_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %edx +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: shldq %cl, %rdx, %rsi +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: shlq %cl, %rdx +; SSE-NEXT: testb $64, %cl +; SSE-NEXT: cmovneq %rdx, %rsi +; SSE-NEXT: cmovneq %rax, %rdx +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: movq 8(%rdi), %rcx +; SSE-NEXT: movq %rcx, %r8 +; SSE-NEXT: andq %rsi, %r8 +; SSE-NEXT: notq %rsi +; SSE-NEXT: movq %rax, %r9 +; SSE-NEXT: andq %rdx, %r9 +; SSE-NEXT: notq %rdx +; SSE-NEXT: andq %rcx, %rsi +; SSE-NEXT: andq %rax, %rdx +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: sete %al +; SSE-NEXT: movq %rdx, (%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: retq +; +; AVX-LABEL: reset_eq_i128: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movl $1, %edx +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: shldq %cl, %rdx, %rsi +; AVX-NEXT: shlxq %rcx, %rdx, %rdx +; AVX-NEXT: testb $64, %cl +; AVX-NEXT: cmovneq %rdx, %rsi +; AVX-NEXT: cmovneq %rax, %rdx +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: movq 8(%rdi), %rcx +; AVX-NEXT: andnq %rcx, %rsi, %r8 +; AVX-NEXT: andq %rsi, %rcx +; AVX-NEXT: andnq %rax, %rdx, %rsi +; AVX-NEXT: andq %rdx, %rax +; AVX-NEXT: orq %rcx, %rax +; AVX-NEXT: sete %al +; AVX-NEXT: movq %rsi, (%rdi) +; AVX-NEXT: movq %r8, 8(%rdi) +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -559,33 +888,124 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { define i1 @set_ne_i128(ptr %word, i32 %position) 
nounwind { ; X86-LABEL: set_ne_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $96, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setb %al -; X86-NEXT: btsl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $80, %esp +; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 56(%esp,%eax), %esi +; X86-NEXT: movl 60(%esp,%eax), %edx +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esp,%eax), %edi +; X86-NEXT: movl 52(%esp,%eax), %ebx +; X86-NEXT: shldl %cl, %ebx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl 8(%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %eax +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: andl %edi, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl 12(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte 
Folded Reload +; X86-NEXT: movl 4(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: setne %al +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; X64-LABEL: set_ne_i128: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: andl $96, %ecx -; X64-NEXT: shrl $3, %ecx -; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setb %al -; X64-NEXT: btsl %esi, %edx -; X64-NEXT: movl %edx, (%rdi,%rcx) -; X64-NEXT: retq +; SSE-LABEL: set_ne_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %edx +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: shldq %cl, %rdx, %rsi +; SSE-NEXT: shlq %cl, %rdx +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: testb $64, %cl +; SSE-NEXT: cmovneq %rdx, %rsi +; SSE-NEXT: cmovneq %rax, %rdx +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: movq 8(%rdi), %rcx +; SSE-NEXT: movq %rcx, %r8 +; SSE-NEXT: andq %rsi, %r8 +; SSE-NEXT: movq %rax, %r9 +; SSE-NEXT: andq %rdx, %r9 +; SSE-NEXT: orq %rcx, %rsi +; SSE-NEXT: orq %rax, %rdx +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: setne %al +; SSE-NEXT: movq %rdx, (%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: retq +; +; AVX-LABEL: set_ne_i128: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: xorl 
%eax, %eax +; AVX-NEXT: movl $1, %edx +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: shldq %cl, %rdx, %rsi +; AVX-NEXT: shlxq %rcx, %rdx, %rdx +; AVX-NEXT: testb $64, %cl +; AVX-NEXT: cmovneq %rdx, %rsi +; AVX-NEXT: cmovneq %rax, %rdx +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: movq 8(%rdi), %rcx +; AVX-NEXT: movq %rcx, %r8 +; AVX-NEXT: andq %rsi, %r8 +; AVX-NEXT: movq %rax, %r9 +; AVX-NEXT: andq %rdx, %r9 +; AVX-NEXT: orq %rcx, %rsi +; AVX-NEXT: orq %rax, %rdx +; AVX-NEXT: orq %r8, %r9 +; AVX-NEXT: setne %al +; AVX-NEXT: movq %rdx, (%rdi) +; AVX-NEXT: movq %rsi, 8(%rdi) +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -606,9 +1026,9 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $96, %esp -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movzbl 16(%ebp), %ebx +; X86-NEXT: subl $128, %esp +; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: movzbl 16(%ebp), %eax ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -617,29 +1037,25 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %edi -; X86-NEXT: movl 72(%esp,%edi), %edx -; X86-NEXT: movl 76(%esp,%edi), %esi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 64(%esp,%edi), %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: movl 68(%esp,%edi), %ebx -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrb $3, %dl +; X86-NEXT: andb $12, 
%dl +; X86-NEXT: negb %dl +; X86-NEXT: movsbl %dl, %esi +; X86-NEXT: movl 64(%esp,%esi), %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: notl %esi +; X86-NEXT: movl 68(%esp,%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 72(%esp,%esi), %ebx ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl 76(%esp,%esi), %edi +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: shldl %cl, %ebx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shll %cl, %edx ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -647,53 +1063,72 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%esi), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %eax +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esi), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl 40(%esp,%eax), %edi -; X86-NEXT: movl 44(%esp,%eax), %esi -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %edi, %esi ; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 12(%ecx), %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%ecx), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl 4(%ecx), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ecx, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: notl %ecx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl 100(%esp,%ecx), %edi +; X86-NEXT: movl 104(%esp,%ecx), %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: shldl %cl, %edi, %ebx +; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl 108(%esp,%ebx), %ebx +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: orl %ebx, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: notl %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl 36(%esp,%esi), %esi -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: andl 8(%edx), %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: notl 
%ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl 32(%esp,%eax), %eax -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl 8(%ebp), %edi -; X86-NEXT: andl 4(%edi), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl 96(%esp,%ebx), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %ebx, %eax ; X86-NEXT: notl %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: andl (%edi), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl $96, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl (%edi,%eax), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 8(%edi) -; X86-NEXT: movl %ebx, 4(%edi) -; X86-NEXT: movl %edx, (%edi) -; X86-NEXT: setae %al +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %edi +; X86-NEXT: orl %edi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: movl %edx, 4(%ecx) +; X86-NEXT: sete %al ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -716,84 +1151,86 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; SSE-NEXT: testb $64, %cl ; 
SSE-NEXT: cmovneq %rsi, %r8 ; SSE-NEXT: cmovneq %r9, %rsi -; SSE-NEXT: notq %r8 ; SSE-NEXT: cmovneq %rax, %rdx ; SSE-NEXT: cmovneq %r9, %rax +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: movq 8(%rdi), %r9 +; SSE-NEXT: movq %r9, %r10 +; SSE-NEXT: andq %r8, %r10 +; SSE-NEXT: notq %r8 +; SSE-NEXT: movq %rcx, %r11 +; SSE-NEXT: andq %rsi, %r11 ; SSE-NEXT: notq %rsi -; SSE-NEXT: andq 8(%rdi), %r8 +; SSE-NEXT: andq %r9, %r8 ; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq (%rdi), %rsi +; SSE-NEXT: andq %rcx, %rsi ; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: movl %ecx, %eax -; SSE-NEXT: andl $96, %eax -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: movl (%rdi,%rax), %eax -; SSE-NEXT: btl %ecx, %eax -; SSE-NEXT: setae %al -; SSE-NEXT: movq %r8, 8(%rdi) +; SSE-NEXT: orq %r10, %r11 +; SSE-NEXT: sete %al ; SSE-NEXT: movq %rsi, (%rdi) +; SSE-NEXT: movq %r8, 8(%rdi) ; SSE-NEXT: retq ; ; AVX2-LABEL: init_eq_i128: ; AVX2: # %bb.0: ; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %eax -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %rax, %rsi -; AVX2-NEXT: movl %edx, %edx +; AVX2-NEXT: movl $1, %esi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: shldq %cl, %rsi, %rax ; AVX2-NEXT: xorl %r8d, %r8d -; AVX2-NEXT: shldq %cl, %rdx, %r8 +; AVX2-NEXT: movl %edx, %edx ; AVX2-NEXT: xorl %r9d, %r9d -; AVX2-NEXT: shlxq %rcx, %rax, %rax +; AVX2-NEXT: shldq %cl, %rdx, %r9 +; AVX2-NEXT: shlxq %rcx, %rsi, %rsi ; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rax, %rsi -; AVX2-NEXT: cmovneq %r9, %rax -; AVX2-NEXT: shlxq %rcx, %rdx, %rdx -; AVX2-NEXT: cmovneq %rdx, %r8 -; AVX2-NEXT: cmovneq %r9, %rdx -; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi -; AVX2-NEXT: orq %r8, %rsi -; AVX2-NEXT: andnq (%rdi), %rax, %r8 -; AVX2-NEXT: orq %rdx, %r8 -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $96, %eax -; AVX2-NEXT: shrl $3, %eax -; AVX2-NEXT: movl (%rdi,%rax), %eax -; AVX2-NEXT: btl %ecx, %eax -; AVX2-NEXT: setae %al -; AVX2-NEXT: movq %rsi, 8(%rdi) -; AVX2-NEXT: movq %r8, (%rdi) +; AVX2-NEXT: cmovneq %rsi, %rax +; 
AVX2-NEXT: cmovneq %r8, %rsi +; AVX2-NEXT: shlxq %rcx, %rdx, %rcx +; AVX2-NEXT: cmovneq %rcx, %r9 +; AVX2-NEXT: cmovneq %r8, %rcx +; AVX2-NEXT: movq (%rdi), %rdx +; AVX2-NEXT: movq 8(%rdi), %r8 +; AVX2-NEXT: andnq %r8, %rax, %r10 +; AVX2-NEXT: andq %rax, %r8 +; AVX2-NEXT: andnq %rdx, %rsi, %r11 +; AVX2-NEXT: andq %rsi, %rdx +; AVX2-NEXT: orq %r9, %r10 +; AVX2-NEXT: orq %rcx, %r11 +; AVX2-NEXT: orq %r8, %rdx +; AVX2-NEXT: sete %al +; AVX2-NEXT: movq %r11, (%rdi) +; AVX2-NEXT: movq %r10, 8(%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: init_eq_i128: ; AVX512: # %bb.0: ; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %eax -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %rax, %rsi +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movl $1, %esi ; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: shlxq %rcx, %rax, %rax +; AVX512-NEXT: shldq %cl, %rsi, %r8 +; AVX512-NEXT: shlxq %rcx, %rsi, %rsi ; AVX512-NEXT: movl %edx, %edx ; AVX512-NEXT: xorl %r9d, %r9d ; AVX512-NEXT: shldq %cl, %rdx, %r9 ; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %rsi, %r8 ; AVX512-NEXT: cmovneq %rax, %rsi -; AVX512-NEXT: cmovneq %r8, %rax -; AVX512-NEXT: shlxq %rcx, %rdx, %rdx -; AVX512-NEXT: cmovneq %rdx, %r9 -; AVX512-NEXT: cmovneq %r8, %rdx -; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi -; AVX512-NEXT: orq %r9, %rsi -; AVX512-NEXT: andnq (%rdi), %rax, %r8 -; AVX512-NEXT: orq %rdx, %r8 -; AVX512-NEXT: movl %ecx, %eax -; AVX512-NEXT: andl $96, %eax -; AVX512-NEXT: shrl $3, %eax -; AVX512-NEXT: movl (%rdi,%rax), %eax -; AVX512-NEXT: btl %ecx, %eax -; AVX512-NEXT: setae %al -; AVX512-NEXT: movq %rsi, 8(%rdi) +; AVX512-NEXT: shlxq %rcx, %rdx, %rcx +; AVX512-NEXT: cmovneq %rcx, %r9 +; AVX512-NEXT: cmovneq %rax, %rcx +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: movq 8(%rdi), %rdx +; AVX512-NEXT: andnq %rdx, %r8, %r10 +; AVX512-NEXT: andq %r8, %rdx +; AVX512-NEXT: andnq %rax, %rsi, %r8 +; AVX512-NEXT: andq %rsi, %rax +; AVX512-NEXT: orq %r9, %r10 +; AVX512-NEXT: orq %rcx, %r8 +; 
AVX512-NEXT: orq %rdx, %rax +; AVX512-NEXT: sete %al ; AVX512-NEXT: movq %r8, (%rdi) +; AVX512-NEXT: movq %r10, 8(%rdi) ; AVX512-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 @@ -815,175 +1252,20 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i512: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrl $3, %edx -; X86-NEXT: andl $60, %edx -; X86-NEXT: movl (%eax,%edx), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: setb %al -; X86-NEXT: retl -; -; X64-LABEL: test_ne_i512: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: shrl $3, %eax -; X64-NEXT: andl $60, %eax -; X64-NEXT: movl (%rdi,%rax), %eax -; X64-NEXT: btl %esi, %eax -; X64-NEXT: setb %al -; X64-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %cmp = icmp ne i512 %test, 0 - ret i1 %cmp -} - -define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: complement_ne_i512: -; X86: # %bb.0: -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: andl $60, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setb %al -; X86-NEXT: btcl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: retl -; -; X64-LABEL: complement_ne_i512: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: shrl $3, %ecx -; X64-NEXT: andl $60, %ecx -; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setb %al -; X64-NEXT: btcl %esi, %edx -; X64-NEXT: movl %edx, (%rdi,%rcx) -; X64-NEXT: retq - %rem = and i32 
%position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res = xor i512 %ld, %bit - %cmp = icmp ne i512 %test, 0 - store i512 %res, ptr %word - ret i1 %cmp -} - -define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: reset_eq_i512: -; X86: # %bb.0: -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: andl $60, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setae %al -; X86-NEXT: btrl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: retl -; -; X64-LABEL: reset_eq_i512: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: shrl $3, %ecx -; X64-NEXT: andl $60, %ecx -; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setae %al -; X64-NEXT: btrl %esi, %edx -; X64-NEXT: movl %edx, (%rdi,%rcx) -; X64-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %mask = xor i512 %bit, -1 - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res = and i512 %ld, %mask - %cmp = icmp eq i512 %test, 0 - store i512 %res, ptr %word - ret i1 %cmp -} - -define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: set_ne_i512: -; X86: # %bb.0: -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: andl $60, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setb %al -; X86-NEXT: btsl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: retl -; -; X64-LABEL: set_ne_i512: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; 
X64-NEXT: shrl $3, %ecx -; X64-NEXT: andl $60, %ecx -; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setb %al -; X64-NEXT: btsl %esi, %edx -; X64-NEXT: movl %edx, (%rdi,%rcx) -; X64-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res = or i512 %ld, %bit - %cmp = icmp ne i512 %test, 0 - store i512 %res, ptr %word - ret i1 %cmp -} - -define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { -; X86-LABEL: init_eq_i512: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $352, %esp # imm = 0x160 +; X86-NEXT: subl $224, %esp ; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrl $3, %edx -; X86-NEXT: andl $60, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl %edx, %eax +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: andl $60, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %eax, %edx ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1016,88 +1298,325 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%eax), %esi -; X86-NEXT: movl 60(%eax), %ebx -; X86-NEXT: movl 52(%eax), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%eax), %edi -; X86-NEXT: movl 44(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; X86-NEXT: movl 40(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%eax), %eax +; X86-NEXT: movl 24(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%edx), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl 16(%ebp), %eax -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: andl $31, %ecx +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 56(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 60(%edx), %eax +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%edx), %edi +; X86-NEXT: 
movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%edx), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%edx), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shldl %cl, %esi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 52(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 4(%edx), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %edi, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edi +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: andl 40(%ebx), %eax +; X86-NEXT: andl 8(%ebx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 56(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 24(%ebx), %edi +; 
X86-NEXT: orl %eax, %edi +; X86-NEXT: orl %esi, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: andl 44(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 12(%ebx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 60(%edi), %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: andl 28(%edi), %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: orl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: negl %edx +; X86-NEXT: movl 192(%esp,%edx), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: andl 32(%ebx), %ecx +; X86-NEXT: andl (%ebx), %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: andl 16(%ebx), %edi +; X86-NEXT: andl 48(%ebx), %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 36(%ebx), %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: andl 4(%ebx), %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 20(%ebx), %ecx +; X86-NEXT: andl 52(%ebx), %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: orl %edx, %eax +; X86-NEXT: setne %al +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: test_ne_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, 
-{{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $56, %esi +; SSE-NEXT: negl %esi +; SSE-NEXT: movslq %esi, %rbx +; SSE-NEXT: movq -48(%rsp,%rbx), %rdx +; SSE-NEXT: movq -40(%rsp,%rbx), %r14 +; SSE-NEXT: movq %r14, %rax +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq -16(%rsp,%rbx), %r11 +; SSE-NEXT: movq -8(%rsp,%rbx), %r10 +; SSE-NEXT: shldq %cl, %r11, %r10 +; SSE-NEXT: movq -32(%rsp,%rbx), %r9 +; SSE-NEXT: movq -24(%rsp,%rbx), %r15 +; SSE-NEXT: movq %r15, %r8 +; SSE-NEXT: shldq %cl, %r9, %r8 +; SSE-NEXT: movq -56(%rsp,%rbx), %rsi +; SSE-NEXT: shldq %cl, %rsi, %rdx +; SSE-NEXT: shldq %cl, %r15, %r11 +; SSE-NEXT: shldq %cl, %r14, %r9 +; SSE-NEXT: movq -64(%rsp,%rbx), %rbx +; SSE-NEXT: shldq %cl, %rbx, %rsi +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shlq %cl, %rbx +; SSE-NEXT: andq 32(%rdi), %r9 +; SSE-NEXT: andq 48(%rdi), %r11 +; SSE-NEXT: andq 16(%rdi), %rdx +; SSE-NEXT: orq %r11, %rdx +; SSE-NEXT: andq 40(%rdi), %r8 +; SSE-NEXT: andq 56(%rdi), %r10 +; SSE-NEXT: andq 24(%rdi), %rax +; SSE-NEXT: orq %r10, %rax +; SSE-NEXT: andq (%rdi), %rbx +; SSE-NEXT: orq %r9, %rbx +; SSE-NEXT: orq %rdx, %rbx +; SSE-NEXT: andq 8(%rdi), %rsi +; SSE-NEXT: orq %r8, %rsi +; SSE-NEXT: orq %rax, %rsi +; SSE-NEXT: orq %rbx, %rsi +; SSE-NEXT: setne %al +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: test_ne_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, 
-{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %esi +; AVX2-NEXT: andl $56, %esi +; AVX2-NEXT: negl %esi +; AVX2-NEXT: movslq %esi, %rsi +; AVX2-NEXT: movq -48(%rsp,%rsi), %rdx +; AVX2-NEXT: movq -40(%rsp,%rsi), %rbx +; AVX2-NEXT: movq %rbx, %rax +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq -16(%rsp,%rsi), %r11 +; AVX2-NEXT: movq -8(%rsp,%rsi), %r10 +; AVX2-NEXT: shldq %cl, %r11, %r10 +; AVX2-NEXT: movq -32(%rsp,%rsi), %r9 +; AVX2-NEXT: movq -24(%rsp,%rsi), %r14 +; AVX2-NEXT: movq %r14, %r8 +; AVX2-NEXT: shldq %cl, %r9, %r8 +; AVX2-NEXT: movq -64(%rsp,%rsi), %r15 +; AVX2-NEXT: movq -56(%rsp,%rsi), %rsi +; AVX2-NEXT: shldq %cl, %rsi, %rdx +; AVX2-NEXT: shldq %cl, %r14, %r11 +; AVX2-NEXT: shldq %cl, %rbx, %r9 +; AVX2-NEXT: shldq %cl, %r15, %rsi +; AVX2-NEXT: shlxq %rcx, %r15, %rcx +; AVX2-NEXT: andq 32(%rdi), %r9 +; AVX2-NEXT: andq 48(%rdi), %r11 +; AVX2-NEXT: andq 16(%rdi), %rdx +; AVX2-NEXT: andq 40(%rdi), %r8 +; AVX2-NEXT: andq 56(%rdi), %r10 +; AVX2-NEXT: andq 24(%rdi), %rax +; AVX2-NEXT: orq %r11, %rdx +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: andq (%rdi), %rcx +; AVX2-NEXT: orq %r9, %rcx +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: andq 8(%rdi), %rsi +; AVX2-NEXT: orq %r8, %rsi +; AVX2-NEXT: orq %rax, %rsi +; AVX2-NEXT: orq %rcx, %rsi +; AVX2-NEXT: setne %al +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ne_i512: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] +; AVX512-NEXT: 
vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: andl $63, %ecx +; AVX512-NEXT: shrl $3, %esi +; AVX512-NEXT: andl $56, %esi +; AVX512-NEXT: negl %esi +; AVX512-NEXT: movslq %esi, %rbx +; AVX512-NEXT: movq -48(%rsp,%rbx), %rdx +; AVX512-NEXT: movq -40(%rsp,%rbx), %r14 +; AVX512-NEXT: movq %r14, %rax +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq -16(%rsp,%rbx), %r11 +; AVX512-NEXT: movq -8(%rsp,%rbx), %r10 +; AVX512-NEXT: shldq %cl, %r11, %r10 +; AVX512-NEXT: movq -32(%rsp,%rbx), %r9 +; AVX512-NEXT: movq -24(%rsp,%rbx), %r15 +; AVX512-NEXT: movq %r15, %r8 +; AVX512-NEXT: shldq %cl, %r9, %r8 +; AVX512-NEXT: movq -56(%rsp,%rbx), %rsi +; AVX512-NEXT: shldq %cl, %rsi, %rdx +; AVX512-NEXT: shldq %cl, %r15, %r11 +; AVX512-NEXT: shldq %cl, %r14, %r9 +; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx +; AVX512-NEXT: shldq %cl, %rbx, %rsi +; AVX512-NEXT: shlxq %rcx, %rbx, %rcx +; AVX512-NEXT: andq 32(%rdi), %r9 +; AVX512-NEXT: andq 48(%rdi), %r11 +; AVX512-NEXT: andq 16(%rdi), %rdx +; AVX512-NEXT: andq 40(%rdi), %r8 +; AVX512-NEXT: andq 56(%rdi), %r10 +; AVX512-NEXT: andq 24(%rdi), %rax +; AVX512-NEXT: orq %r11, %rdx +; AVX512-NEXT: orq %r10, %rax +; AVX512-NEXT: andq (%rdi), %rcx +; AVX512-NEXT: orq %r9, %rcx +; AVX512-NEXT: orq %rdx, %rcx +; AVX512-NEXT: andq 8(%rdi), %rsi +; AVX512-NEXT: orq %r8, %rsi +; AVX512-NEXT: orq %rax, %rsi +; AVX512-NEXT: orq %rcx, %rsi +; AVX512-NEXT: setne %al +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %cmp = icmp ne i512 %test, 0 + ret i1 %cmp +} + +define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: complement_ne_i512: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; 
X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $272, %esp # imm = 0x110 +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: andl $60, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1113,6 +1632,7 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1129,148 +1649,3833 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%eax), %esi -; X86-NEXT: movl 60(%eax), %edi +; X86-NEXT: movl 24(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $31, %ecx +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 56(%edx), %eax +; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 60(%edx), %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%edx), %ebx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%edx), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 52(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: andl 60(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 4(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: 
movl 52(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 56(%edx), %ebx -; X86-NEXT: orl %esi, %ebx +; X86-NEXT: shldl %cl, %eax, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 48(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 52(%edx), %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: movl 40(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %eax +; X86-NEXT: movl 8(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl 56(%edx), %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 44(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 48(%edx), %ebx -; X86-NEXT: orl %esi, %ebx +; X86-NEXT: andl %edi, %ebx +; X86-NEXT: movl 24(%edx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl 12(%eax), %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: orl %esi, %ebx +; 
X86-NEXT: movl 60(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %edx, %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl 28(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl (%eax), %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 40(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 44(%edx), %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: negl %eax +; X86-NEXT: movl 240(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: movl 32(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edi, %eax +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl 16(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %eax +; X86-NEXT: movl 48(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 36(%esi), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl 20(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl %esi, %edi +; X86-NEXT: movl 52(%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: xorl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 36(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 40(%edx), %ebx -; X86-NEXT: orl %esi, %ebx +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: movl %ebx, 60(%edx) +; X86-NEXT: movl %edi, 56(%edx) +; X86-NEXT: movl %ecx, 52(%edx) +; X86-NEXT: movl %esi, 44(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl 
%eax, 40(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 36(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 32(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 28(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 24(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 16(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 12(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 8(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 4(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, (%edx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 48(%edx) +; X86-NEXT: setne %al +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: complement_ne_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $56, %rsp +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $56, %esi 
+; SSE-NEXT: negl %esi +; SSE-NEXT: movslq %esi, %rbx +; SSE-NEXT: movq (%rsp,%rbx), %rsi +; SSE-NEXT: movq 8(%rsp,%rbx), %r14 +; SSE-NEXT: movq %r14, %rax +; SSE-NEXT: shldq %cl, %rsi, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 32(%rsp,%rbx), %r8 +; SSE-NEXT: movq 40(%rsp,%rbx), %rbp +; SSE-NEXT: shldq %cl, %r8, %rbp +; SSE-NEXT: movq 16(%rsp,%rbx), %r9 +; SSE-NEXT: movq 24(%rsp,%rbx), %r15 +; SSE-NEXT: movq %r15, %r10 +; SSE-NEXT: shldq %cl, %r9, %r10 +; SSE-NEXT: movq -8(%rsp,%rbx), %r11 +; SSE-NEXT: shldq %cl, %r11, %rsi +; SSE-NEXT: shldq %cl, %r15, %r8 +; SSE-NEXT: shldq %cl, %r14, %r9 +; SSE-NEXT: movq -16(%rsp,%rbx), %rbx +; SSE-NEXT: shldq %cl, %rbx, %r11 +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shlq %cl, %rbx +; SSE-NEXT: movq 24(%rdi), %r15 +; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 56(%rdi), %rcx +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 16(%rdi), %r12 +; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 48(%rdi), %r13 +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %r8, %r13 +; SSE-NEXT: andq %rsi, %r12 +; SSE-NEXT: orq %r13, %r12 +; SSE-NEXT: movq %rcx, %r13 +; SSE-NEXT: andq %rbp, %r13 +; SSE-NEXT: andq %rax, %r15 +; SSE-NEXT: orq %r13, %r15 +; SSE-NEXT: movq 32(%rdi), %r14 +; SSE-NEXT: movq %r14, %rcx +; SSE-NEXT: andq %r9, %rcx +; SSE-NEXT: movq (%rdi), %r13 +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rbx, %r13 +; SSE-NEXT: orq %rcx, %r13 +; SSE-NEXT: orq %r12, %r13 +; SSE-NEXT: movq 40(%rdi), %rcx +; SSE-NEXT: movq %rcx, %r12 +; SSE-NEXT: andq %r10, %r12 +; SSE-NEXT: movq 8(%rdi), %rdx +; SSE-NEXT: movq %rdx, %rax +; SSE-NEXT: andq %r11, %rax +; SSE-NEXT: orq %r12, %rax +; SSE-NEXT: orq %r15, %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: xorq 
{{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; SSE-NEXT: xorq %rcx, %r10 +; SSE-NEXT: xorq %r14, %r9 +; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; SSE-NEXT: xorq %rdx, %r11 +; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; SSE-NEXT: orq %r13, %rax +; SSE-NEXT: movq %r8, 48(%rdi) +; SSE-NEXT: movq %rbp, 56(%rdi) +; SSE-NEXT: movq %r9, 32(%rdi) +; SSE-NEXT: movq %r10, 40(%rdi) +; SSE-NEXT: movq %rsi, 16(%rdi) +; SSE-NEXT: movq %r15, 24(%rdi) +; SSE-NEXT: movq %rbx, (%rdi) +; SSE-NEXT: movq %r11, 8(%rdi) +; SSE-NEXT: setne %al +; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: complement_ne_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $72, %rsp +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm0, (%rsp) +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %esi +; AVX2-NEXT: andl $56, %esi +; AVX2-NEXT: negl %esi +; AVX2-NEXT: movslq %esi, %rbx +; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi +; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp +; AVX2-NEXT: movq %rbp, %rax +; AVX2-NEXT: shldq %cl, %rsi, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 48(%rsp,%rbx), %r8 +; AVX2-NEXT: movq 56(%rsp,%rbx), %r13 +; AVX2-NEXT: shldq %cl, %r8, %r13 +; AVX2-NEXT: movq 32(%rsp,%rbx), %r9 +; AVX2-NEXT: movq 40(%rsp,%rbx), %r14 +; 
AVX2-NEXT: movq %r14, %r10 +; AVX2-NEXT: shldq %cl, %r9, %r10 +; AVX2-NEXT: movq 8(%rsp,%rbx), %r11 +; AVX2-NEXT: shldq %cl, %r11, %rsi +; AVX2-NEXT: shldq %cl, %r14, %r8 +; AVX2-NEXT: movq 16(%rdi), %r12 +; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 48(%rdi), %r14 +; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r8, %r14 +; AVX2-NEXT: andq %rsi, %r12 +; AVX2-NEXT: orq %r14, %r12 +; AVX2-NEXT: movq 56(%rdi), %r15 +; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r13, %r15 +; AVX2-NEXT: movq 24(%rdi), %r14 +; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %rax, %r14 +; AVX2-NEXT: orq %r15, %r14 +; AVX2-NEXT: shldq %cl, %rbp, %r9 +; AVX2-NEXT: movq (%rsp,%rbx), %rdx +; AVX2-NEXT: movq 32(%rdi), %r15 +; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r9, %r15 +; AVX2-NEXT: shlxq %rcx, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq (%rdi), %rbx +; AVX2-NEXT: movq %rbx, %rbp +; AVX2-NEXT: andq %rax, %rbp +; AVX2-NEXT: orq %r15, %rbp +; AVX2-NEXT: orq %r12, %rbp +; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX2-NEXT: shldq %cl, %rdx, %r11 +; AVX2-NEXT: movq 40(%rdi), %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: andq %r10, %rcx +; AVX2-NEXT: movq 8(%rdi), %r15 +; AVX2-NEXT: movq %r15, %r12 +; AVX2-NEXT: andq %r11, %r12 +; AVX2-NEXT: orq %rcx, %r12 +; AVX2-NEXT: orq %r14, %r12 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; AVX2-NEXT: xorq %rax, %r10 +; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; 
AVX2-NEXT: xorq %r15, %r11 +; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; AVX2-NEXT: orq %rbp, %r12 +; AVX2-NEXT: movq %r8, 48(%rdi) +; AVX2-NEXT: movq %r13, 56(%rdi) +; AVX2-NEXT: movq %r9, 32(%rdi) +; AVX2-NEXT: movq %r10, 40(%rdi) +; AVX2-NEXT: movq %rsi, 16(%rdi) +; AVX2-NEXT: movq %rcx, 24(%rdi) +; AVX2-NEXT: movq %rbx, (%rdi) +; AVX2-NEXT: movq %r11, 8(%rdi) +; AVX2-NEXT: setne %al +; AVX2-NEXT: addq $72, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: complement_ne_i512: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $72, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] +; AVX512-NEXT: vmovups %ymm0, (%rsp) +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: andl $63, %ecx +; AVX512-NEXT: shrl $3, %esi +; AVX512-NEXT: andl $56, %esi +; AVX512-NEXT: negl %esi +; AVX512-NEXT: movslq %esi, %rbx +; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi +; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp +; AVX512-NEXT: movq %rbp, %rax +; AVX512-NEXT: shldq %cl, %rsi, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 48(%rsp,%rbx), %r8 +; AVX512-NEXT: movq 56(%rsp,%rbx), %r13 +; AVX512-NEXT: shldq %cl, %r8, %r13 +; AVX512-NEXT: movq 32(%rsp,%rbx), %r9 +; AVX512-NEXT: movq 40(%rsp,%rbx), %r14 +; AVX512-NEXT: movq %r14, %r10 +; AVX512-NEXT: shldq %cl, %r9, %r10 +; AVX512-NEXT: movq 8(%rsp,%rbx), %r11 +; AVX512-NEXT: shldq %cl, %r11, %rsi +; AVX512-NEXT: shldq %cl, %r14, %r8 +; AVX512-NEXT: movq 16(%rdi), %r12 +; 
AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 48(%rdi), %r14 +; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r8, %r14 +; AVX512-NEXT: andq %rsi, %r12 +; AVX512-NEXT: orq %r14, %r12 +; AVX512-NEXT: movq 56(%rdi), %r15 +; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r13, %r15 +; AVX512-NEXT: movq 24(%rdi), %r14 +; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %rax, %r14 +; AVX512-NEXT: orq %r15, %r14 +; AVX512-NEXT: shldq %cl, %rbp, %r9 +; AVX512-NEXT: movq (%rsp,%rbx), %rdx +; AVX512-NEXT: movq 32(%rdi), %r15 +; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r9, %r15 +; AVX512-NEXT: shlxq %rcx, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq (%rdi), %rbx +; AVX512-NEXT: movq %rbx, %rbp +; AVX512-NEXT: andq %rax, %rbp +; AVX512-NEXT: orq %r15, %rbp +; AVX512-NEXT: orq %r12, %rbp +; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX512-NEXT: shldq %cl, %rdx, %r11 +; AVX512-NEXT: movq 40(%rdi), %rax +; AVX512-NEXT: movq %rax, %rcx +; AVX512-NEXT: andq %r10, %rcx +; AVX512-NEXT: movq 8(%rdi), %r15 +; AVX512-NEXT: movq %r15, %r12 +; AVX512-NEXT: andq %r11, %r12 +; AVX512-NEXT: orq %rcx, %r12 +; AVX512-NEXT: orq %r14, %r12 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; AVX512-NEXT: xorq %rax, %r10 +; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; AVX512-NEXT: xorq %r15, %r11 +; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; AVX512-NEXT: orq %rbp, 
%r12 +; AVX512-NEXT: movq %r8, 48(%rdi) +; AVX512-NEXT: movq %r13, 56(%rdi) +; AVX512-NEXT: movq %r9, 32(%rdi) +; AVX512-NEXT: movq %r10, 40(%rdi) +; AVX512-NEXT: movq %rsi, 16(%rdi) +; AVX512-NEXT: movq %rcx, 24(%rdi) +; AVX512-NEXT: movq %rbx, (%rdi) +; AVX512-NEXT: movq %r11, 8(%rdi) +; AVX512-NEXT: setne %al +; AVX512-NEXT: addq $72, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res = xor i512 %ld, %bit + %cmp = icmp ne i512 %test, 0 + store i512 %res, ptr %word + ret i1 %cmp +} + +define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: reset_eq_i512: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $288, %esp # imm = 0x120 +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: andl $60, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; 
X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 4(%edi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%edi), %eax +; X86-NEXT: andl $31, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: shldl %cl, %edx, %ebx +; X86-NEXT: movl 12(%edi), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%edi), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%edi), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%edi), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%edi), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%edi), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%edi), %esi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%edi), 
%edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %eax, %edx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 32(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 36(%edx), %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl 8(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %eax, %ebx +; X86-NEXT: orl %edx, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 28(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 32(%edx), %ebx -; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl 44(%edi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl 52(%edi), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 56(%edi), %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 24(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 28(%edx), %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shldl %cl, %esi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: movl 56(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
X86-NEXT: andl %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl 44(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 60(%edi), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 60(%ebx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%ebx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%edi), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: negl %eax 
+; X86-NEXT: movl 256(%esp,%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %esi, %edi +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl 32(%ebx), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ecx, %edx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%ebx), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ecx, %edi +; X86-NEXT: orl %edx, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 20(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 24(%edx), %ebx -; X86-NEXT: orl %esi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 16(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 20(%edx), %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl 16(%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ecx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%esi), %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 12(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 16(%edx), %ebx -; X86-NEXT: orl %esi, %ebx +; X86-NEXT: andl %ebx, %edx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 4(%esi), %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 8(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 12(%edx), %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: andl %ebx, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %edi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%esi), %edi +; X86-NEXT: andl %edi, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: movl 52(%ebx), %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: notl %ebx -; X86-NEXT: movl 4(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 8(%edx), %ebx -; X86-NEXT: orl %esi, %ebx +; X86-NEXT: andl %edi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte 
Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: notl %edi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: notl %edi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: notl %edi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: notl %ecx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, 60(%eax) +; X86-NEXT: movl %esi, 56(%eax) +; X86-NEXT: movl %ecx, 52(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 44(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 40(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 36(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 32(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 28(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 24(%eax) +; X86-NEXT: movl %ebx, 20(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; 
X86-NEXT: movl %ecx, 16(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 48(%eax) +; X86-NEXT: sete %al +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: reset_eq_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $56, %rsp +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $56, %esi +; SSE-NEXT: negl %esi +; SSE-NEXT: movslq %esi, %rdx +; SSE-NEXT: movq (%rsp,%rdx), %r9 +; SSE-NEXT: movq 8(%rsp,%rdx), %r8 +; SSE-NEXT: movq %r8, %rsi +; SSE-NEXT: shldq %cl, %r9, %rsi +; SSE-NEXT: movq -8(%rsp,%rdx), %rax +; SSE-NEXT: shldq %cl, %rax, %r9 +; SSE-NEXT: movq 16(%rsp,%rdx), %r14 +; SSE-NEXT: movq 24(%rsp,%rdx), %r10 +; SSE-NEXT: movq %r10, %rbx +; SSE-NEXT: shldq %cl, %r14, %rbx +; SSE-NEXT: shldq %cl, %r8, %r14 +; SSE-NEXT: movq 32(%rsp,%rdx), %r13 +; SSE-NEXT: movq 40(%rsp,%rdx), %r12 +; SSE-NEXT: shldq %cl, %r13, %r12 +; SSE-NEXT: shldq %cl, %r10, %r13 +; SSE-NEXT: movq -16(%rsp,%rdx), %rdx +; 
SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shlq %cl, %rdx +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq %r12, %rbp +; SSE-NEXT: movq %r9, %r15 +; SSE-NEXT: movq %rsi, %r11 +; SSE-NEXT: movq 16(%rdi), %r8 +; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 48(%rdi), %rcx +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rcx, %r13 +; SSE-NEXT: andq %r8, %r9 +; SSE-NEXT: orq %r13, %r9 +; SSE-NEXT: movq 56(%rdi), %rcx +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rcx, %r12 +; SSE-NEXT: movq 24(%rdi), %r10 +; SSE-NEXT: andq %r10, %rsi +; SSE-NEXT: orq %r12, %rsi +; SSE-NEXT: movq %r14, %r13 +; SSE-NEXT: movq 32(%rdi), %rcx +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rcx, %r14 +; SSE-NEXT: movq %rdx, %r12 +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %r14, %rdx +; SSE-NEXT: orq %r9, %rdx +; SSE-NEXT: movq %rbx, %r14 +; SSE-NEXT: movq 40(%rdi), %rcx +; SSE-NEXT: andq %rcx, %rbx +; SSE-NEXT: movq %rax, %r9 +; SSE-NEXT: movq 8(%rdi), %r8 +; SSE-NEXT: andq %r8, %rax +; SSE-NEXT: orq %rbx, %rax +; SSE-NEXT: orq %rsi, %rax +; SSE-NEXT: notq %r11 +; SSE-NEXT: andq %r10, %r11 +; SSE-NEXT: notq %r15 +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; SSE-NEXT: notq %r14 +; SSE-NEXT: andq %rcx, %r14 +; SSE-NEXT: notq %r13 +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; SSE-NEXT: notq %rbp +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE-NEXT: notq %rcx +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; SSE-NEXT: notq %r9 +; SSE-NEXT: andq %r8, %r9 +; SSE-NEXT: notq %r12 +; SSE-NEXT: andq 
{{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: movq %rcx, 48(%rdi) +; SSE-NEXT: movq %rbp, 56(%rdi) +; SSE-NEXT: movq %r13, 32(%rdi) +; SSE-NEXT: movq %r14, 40(%rdi) +; SSE-NEXT: movq %r15, 16(%rdi) +; SSE-NEXT: movq %r11, 24(%rdi) +; SSE-NEXT: movq %r12, (%rdi) +; SSE-NEXT: movq %r9, 8(%rdi) +; SSE-NEXT: sete %al +; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: reset_eq_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: pushq %rax +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %esi +; AVX2-NEXT: andl $56, %esi +; AVX2-NEXT: negl %esi +; AVX2-NEXT: movslq %esi, %rdx +; AVX2-NEXT: movq -48(%rsp,%rdx), %r8 +; AVX2-NEXT: movq -40(%rsp,%rdx), %rbx +; AVX2-NEXT: movq %rbx, %rax +; AVX2-NEXT: shldq %cl, %r8, %rax +; AVX2-NEXT: movq -16(%rsp,%rdx), %r10 +; AVX2-NEXT: movq -8(%rsp,%rdx), %rsi +; AVX2-NEXT: shldq %cl, %r10, %rsi +; AVX2-NEXT: movq -32(%rsp,%rdx), %r11 +; AVX2-NEXT: movq -24(%rsp,%rdx), %r14 +; AVX2-NEXT: movq %r14, %r9 +; AVX2-NEXT: shldq %cl, %r11, %r9 +; AVX2-NEXT: movq -64(%rsp,%rdx), %r15 +; AVX2-NEXT: movq -56(%rsp,%rdx), %rdx +; AVX2-NEXT: shldq %cl, %rdx, %r8 +; AVX2-NEXT: shldq %cl, %r14, %r10 +; AVX2-NEXT: shldq %cl, %rbx, %r11 +; AVX2-NEXT: shldq %cl, %r15, %rdx +; AVX2-NEXT: shlxq %rcx, %r15, %rcx +; AVX2-NEXT: movq 24(%rdi), %rbx +; AVX2-NEXT: movq 56(%rdi), %r14 +; AVX2-NEXT: movq 16(%rdi), %r15 +; 
AVX2-NEXT: movq 48(%rdi), %r13 +; AVX2-NEXT: movq 32(%rdi), %rbp +; AVX2-NEXT: andnq %rbp, %r11, %r12 +; AVX2-NEXT: andq %r11, %rbp +; AVX2-NEXT: andnq %r13, %r10, %r11 +; AVX2-NEXT: andq %r10, %r13 +; AVX2-NEXT: andnq %r15, %r8, %r10 +; AVX2-NEXT: andq %r8, %r15 +; AVX2-NEXT: movq 40(%rdi), %r8 +; AVX2-NEXT: orq %r13, %r15 +; AVX2-NEXT: andnq %r8, %r9, %r13 +; AVX2-NEXT: andq %r9, %r8 +; AVX2-NEXT: andnq %r14, %rsi, %r9 +; AVX2-NEXT: andq %rsi, %r14 +; AVX2-NEXT: andnq %rbx, %rax, %rsi +; AVX2-NEXT: andq %rax, %rbx +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: orq %r14, %rbx +; AVX2-NEXT: andnq %rax, %rcx, %r14 +; AVX2-NEXT: andq %rcx, %rax +; AVX2-NEXT: orq %rbp, %rax +; AVX2-NEXT: movq 8(%rdi), %rcx +; AVX2-NEXT: orq %r15, %rax +; AVX2-NEXT: andnq %rcx, %rdx, %r15 +; AVX2-NEXT: andq %rdx, %rcx +; AVX2-NEXT: orq %r8, %rcx +; AVX2-NEXT: orq %rbx, %rcx +; AVX2-NEXT: orq %rax, %rcx +; AVX2-NEXT: movq %r11, 48(%rdi) +; AVX2-NEXT: movq %r9, 56(%rdi) +; AVX2-NEXT: movq %r12, 32(%rdi) +; AVX2-NEXT: movq %r13, 40(%rdi) +; AVX2-NEXT: movq %r10, 16(%rdi) +; AVX2-NEXT: movq %rsi, 24(%rdi) +; AVX2-NEXT: movq %r14, (%rdi) +; AVX2-NEXT: movq %r15, 8(%rdi) +; AVX2-NEXT: sete %al +; AVX2-NEXT: addq $8, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: reset_eq_i512: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: andl $63, %ecx +; 
AVX512-NEXT: shrl $3, %esi +; AVX512-NEXT: andl $56, %esi +; AVX512-NEXT: negl %esi +; AVX512-NEXT: movslq %esi, %rbx +; AVX512-NEXT: movq -48(%rsp,%rbx), %r8 +; AVX512-NEXT: movq -40(%rsp,%rbx), %r14 +; AVX512-NEXT: movq %r14, %rax +; AVX512-NEXT: shldq %cl, %r8, %rax +; AVX512-NEXT: movq -16(%rsp,%rbx), %r10 +; AVX512-NEXT: movq -8(%rsp,%rbx), %rsi +; AVX512-NEXT: shldq %cl, %r10, %rsi +; AVX512-NEXT: movq -32(%rsp,%rbx), %r11 +; AVX512-NEXT: movq -24(%rsp,%rbx), %r15 +; AVX512-NEXT: movq %r15, %r9 +; AVX512-NEXT: shldq %cl, %r11, %r9 +; AVX512-NEXT: movq -56(%rsp,%rbx), %rdx +; AVX512-NEXT: shldq %cl, %rdx, %r8 +; AVX512-NEXT: shldq %cl, %r15, %r10 +; AVX512-NEXT: shldq %cl, %r14, %r11 +; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx +; AVX512-NEXT: shldq %cl, %rbx, %rdx +; AVX512-NEXT: shlxq %rcx, %rbx, %rcx +; AVX512-NEXT: movq 24(%rdi), %rbx +; AVX512-NEXT: movq 56(%rdi), %r14 +; AVX512-NEXT: movq 16(%rdi), %r15 +; AVX512-NEXT: movq 48(%rdi), %r13 +; AVX512-NEXT: movq 32(%rdi), %rbp +; AVX512-NEXT: andnq %rbp, %r11, %r12 +; AVX512-NEXT: andq %r11, %rbp +; AVX512-NEXT: andnq %r13, %r10, %r11 +; AVX512-NEXT: andq %r10, %r13 +; AVX512-NEXT: andnq %r15, %r8, %r10 +; AVX512-NEXT: andq %r8, %r15 +; AVX512-NEXT: movq 40(%rdi), %r8 +; AVX512-NEXT: orq %r13, %r15 +; AVX512-NEXT: andnq %r8, %r9, %r13 +; AVX512-NEXT: andq %r9, %r8 +; AVX512-NEXT: andnq %r14, %rsi, %r9 +; AVX512-NEXT: andq %rsi, %r14 +; AVX512-NEXT: andnq %rbx, %rax, %rsi +; AVX512-NEXT: andq %rax, %rbx +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: orq %r14, %rbx +; AVX512-NEXT: andnq %rax, %rcx, %r14 +; AVX512-NEXT: andq %rcx, %rax +; AVX512-NEXT: orq %rbp, %rax +; AVX512-NEXT: movq 8(%rdi), %rcx +; AVX512-NEXT: orq %r15, %rax +; AVX512-NEXT: andnq %rcx, %rdx, %r15 +; AVX512-NEXT: andq %rdx, %rcx +; AVX512-NEXT: orq %r8, %rcx +; AVX512-NEXT: orq %rbx, %rcx +; AVX512-NEXT: orq %rax, %rcx +; AVX512-NEXT: movq %r11, 48(%rdi) +; AVX512-NEXT: movq %r9, 56(%rdi) +; AVX512-NEXT: movq %r12, 32(%rdi) +; 
AVX512-NEXT: movq %r13, 40(%rdi) +; AVX512-NEXT: movq %r10, 16(%rdi) +; AVX512-NEXT: movq %rsi, 24(%rdi) +; AVX512-NEXT: movq %r14, (%rdi) +; AVX512-NEXT: movq %r15, 8(%rdi) +; AVX512-NEXT: sete %al +; AVX512-NEXT: addq $8, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %mask = xor i512 %bit, -1 + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res = and i512 %ld, %mask + %cmp = icmp eq i512 %test, 0 + store i512 %res, ptr %word + ret i1 %cmp +} + +define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: set_ne_i512: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $272, %esp # imm = 0x110 +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: andl $60, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: 
movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 24(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $31, %ecx +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 56(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 60(%edx), %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%edx), %ebx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%edx), %edi 
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 52(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 4(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: movl 40(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %eax +; X86-NEXT: movl 8(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl 56(%edx), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edi, %ebx +; X86-NEXT: movl 24(%edx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte 
Folded Reload +; X86-NEXT: movl 12(%eax), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl 60(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %edx, %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl 28(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: negl %eax +; X86-NEXT: movl 240(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: movl 32(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edi, %eax +; 
X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl 16(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %eax +; X86-NEXT: movl 48(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 36(%esi), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl 20(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl %esi, %edi +; X86-NEXT: movl 52(%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: movl %ebx, 60(%edx) +; X86-NEXT: movl %edi, 56(%edx) +; X86-NEXT: movl %ecx, 52(%edx) +; X86-NEXT: movl %esi, 44(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 40(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 36(%edx) 
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 32(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 28(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 24(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 16(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 12(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 8(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 4(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, (%edx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 48(%edx) +; X86-NEXT: setne %al +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: set_ne_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $56, %rsp +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $56, %esi +; SSE-NEXT: negl %esi +; SSE-NEXT: movslq %esi, %rbx +; SSE-NEXT: movq (%rsp,%rbx), %rsi +; SSE-NEXT: movq 
8(%rsp,%rbx), %r14 +; SSE-NEXT: movq %r14, %rax +; SSE-NEXT: shldq %cl, %rsi, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 32(%rsp,%rbx), %r8 +; SSE-NEXT: movq 40(%rsp,%rbx), %rbp +; SSE-NEXT: shldq %cl, %r8, %rbp +; SSE-NEXT: movq 16(%rsp,%rbx), %r9 +; SSE-NEXT: movq 24(%rsp,%rbx), %r15 +; SSE-NEXT: movq %r15, %r10 +; SSE-NEXT: shldq %cl, %r9, %r10 +; SSE-NEXT: movq -8(%rsp,%rbx), %r11 +; SSE-NEXT: shldq %cl, %r11, %rsi +; SSE-NEXT: shldq %cl, %r15, %r8 +; SSE-NEXT: shldq %cl, %r14, %r9 +; SSE-NEXT: movq -16(%rsp,%rbx), %rbx +; SSE-NEXT: shldq %cl, %rbx, %r11 +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shlq %cl, %rbx +; SSE-NEXT: movq 24(%rdi), %r15 +; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 56(%rdi), %rcx +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 16(%rdi), %r12 +; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 48(%rdi), %r13 +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %r8, %r13 +; SSE-NEXT: andq %rsi, %r12 +; SSE-NEXT: orq %r13, %r12 +; SSE-NEXT: movq %rcx, %r13 +; SSE-NEXT: andq %rbp, %r13 +; SSE-NEXT: andq %rax, %r15 +; SSE-NEXT: orq %r13, %r15 +; SSE-NEXT: movq 32(%rdi), %r14 +; SSE-NEXT: movq %r14, %rcx +; SSE-NEXT: andq %r9, %rcx +; SSE-NEXT: movq (%rdi), %r13 +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rbx, %r13 +; SSE-NEXT: orq %rcx, %r13 +; SSE-NEXT: orq %r12, %r13 +; SSE-NEXT: movq 40(%rdi), %rcx +; SSE-NEXT: movq %rcx, %r12 +; SSE-NEXT: andq %r10, %r12 +; SSE-NEXT: movq 8(%rdi), %rdx +; SSE-NEXT: movq %rdx, %rax +; SSE-NEXT: andq %r11, %rax +; SSE-NEXT: orq %r12, %rax +; SSE-NEXT: orq %r15, %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; 
SSE-NEXT: orq %rcx, %r10 +; SSE-NEXT: orq %r14, %r9 +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; SSE-NEXT: orq %rdx, %r11 +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; SSE-NEXT: orq %r13, %rax +; SSE-NEXT: movq %r8, 48(%rdi) +; SSE-NEXT: movq %rbp, 56(%rdi) +; SSE-NEXT: movq %r9, 32(%rdi) +; SSE-NEXT: movq %r10, 40(%rdi) +; SSE-NEXT: movq %rsi, 16(%rdi) +; SSE-NEXT: movq %r15, 24(%rdi) +; SSE-NEXT: movq %rbx, (%rdi) +; SSE-NEXT: movq %r11, 8(%rdi) +; SSE-NEXT: setne %al +; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: set_ne_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $72, %rsp +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm0, (%rsp) +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %esi +; AVX2-NEXT: andl $56, %esi +; AVX2-NEXT: negl %esi +; AVX2-NEXT: movslq %esi, %rbx +; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi +; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp +; AVX2-NEXT: movq %rbp, %rax +; AVX2-NEXT: shldq %cl, %rsi, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 48(%rsp,%rbx), %r8 +; AVX2-NEXT: movq 56(%rsp,%rbx), %r13 +; AVX2-NEXT: shldq %cl, %r8, %r13 +; AVX2-NEXT: movq 32(%rsp,%rbx), %r9 +; AVX2-NEXT: movq 40(%rsp,%rbx), %r14 +; AVX2-NEXT: movq %r14, %r10 +; AVX2-NEXT: shldq %cl, %r9, %r10 +; AVX2-NEXT: movq 8(%rsp,%rbx), %r11 +; AVX2-NEXT: shldq %cl, %r11, %rsi +; 
AVX2-NEXT: shldq %cl, %r14, %r8 +; AVX2-NEXT: movq 16(%rdi), %r12 +; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 48(%rdi), %r14 +; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r8, %r14 +; AVX2-NEXT: andq %rsi, %r12 +; AVX2-NEXT: orq %r14, %r12 +; AVX2-NEXT: movq 56(%rdi), %r15 +; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r13, %r15 +; AVX2-NEXT: movq 24(%rdi), %r14 +; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %rax, %r14 +; AVX2-NEXT: orq %r15, %r14 +; AVX2-NEXT: shldq %cl, %rbp, %r9 +; AVX2-NEXT: movq (%rsp,%rbx), %rdx +; AVX2-NEXT: movq 32(%rdi), %r15 +; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r9, %r15 +; AVX2-NEXT: shlxq %rcx, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq (%rdi), %rbx +; AVX2-NEXT: movq %rbx, %rbp +; AVX2-NEXT: andq %rax, %rbp +; AVX2-NEXT: orq %r15, %rbp +; AVX2-NEXT: orq %r12, %rbp +; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX2-NEXT: shldq %cl, %rdx, %r11 +; AVX2-NEXT: movq 40(%rdi), %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: andq %r10, %rcx +; AVX2-NEXT: movq 8(%rdi), %r15 +; AVX2-NEXT: movq %r15, %r12 +; AVX2-NEXT: andq %r11, %r12 +; AVX2-NEXT: orq %rcx, %r12 +; AVX2-NEXT: orq %r14, %r12 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; AVX2-NEXT: orq %rax, %r10 +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; AVX2-NEXT: orq %r15, %r11 +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; AVX2-NEXT: orq %rbp, %r12 +; AVX2-NEXT: movq %r8, 
48(%rdi) +; AVX2-NEXT: movq %r13, 56(%rdi) +; AVX2-NEXT: movq %r9, 32(%rdi) +; AVX2-NEXT: movq %r10, 40(%rdi) +; AVX2-NEXT: movq %rsi, 16(%rdi) +; AVX2-NEXT: movq %rcx, 24(%rdi) +; AVX2-NEXT: movq %rbx, (%rdi) +; AVX2-NEXT: movq %r11, 8(%rdi) +; AVX2-NEXT: setne %al +; AVX2-NEXT: addq $72, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: set_ne_i512: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $72, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] +; AVX512-NEXT: vmovups %ymm0, (%rsp) +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: andl $63, %ecx +; AVX512-NEXT: shrl $3, %esi +; AVX512-NEXT: andl $56, %esi +; AVX512-NEXT: negl %esi +; AVX512-NEXT: movslq %esi, %rbx +; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi +; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp +; AVX512-NEXT: movq %rbp, %rax +; AVX512-NEXT: shldq %cl, %rsi, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 48(%rsp,%rbx), %r8 +; AVX512-NEXT: movq 56(%rsp,%rbx), %r13 +; AVX512-NEXT: shldq %cl, %r8, %r13 +; AVX512-NEXT: movq 32(%rsp,%rbx), %r9 +; AVX512-NEXT: movq 40(%rsp,%rbx), %r14 +; AVX512-NEXT: movq %r14, %r10 +; AVX512-NEXT: shldq %cl, %r9, %r10 +; AVX512-NEXT: movq 8(%rsp,%rbx), %r11 +; AVX512-NEXT: shldq %cl, %r11, %rsi +; AVX512-NEXT: shldq %cl, %r14, %r8 +; AVX512-NEXT: movq 16(%rdi), %r12 +; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 48(%rdi), %r14 +; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 
+; AVX512-NEXT: andq %r8, %r14 +; AVX512-NEXT: andq %rsi, %r12 +; AVX512-NEXT: orq %r14, %r12 +; AVX512-NEXT: movq 56(%rdi), %r15 +; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r13, %r15 +; AVX512-NEXT: movq 24(%rdi), %r14 +; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %rax, %r14 +; AVX512-NEXT: orq %r15, %r14 +; AVX512-NEXT: shldq %cl, %rbp, %r9 +; AVX512-NEXT: movq (%rsp,%rbx), %rdx +; AVX512-NEXT: movq 32(%rdi), %r15 +; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r9, %r15 +; AVX512-NEXT: shlxq %rcx, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq (%rdi), %rbx +; AVX512-NEXT: movq %rbx, %rbp +; AVX512-NEXT: andq %rax, %rbp +; AVX512-NEXT: orq %r15, %rbp +; AVX512-NEXT: orq %r12, %rbp +; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX512-NEXT: shldq %cl, %rdx, %r11 +; AVX512-NEXT: movq 40(%rdi), %rax +; AVX512-NEXT: movq %rax, %rcx +; AVX512-NEXT: andq %r10, %rcx +; AVX512-NEXT: movq 8(%rdi), %r15 +; AVX512-NEXT: movq %r15, %r12 +; AVX512-NEXT: andq %r11, %r12 +; AVX512-NEXT: orq %rcx, %r12 +; AVX512-NEXT: orq %r14, %r12 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; AVX512-NEXT: orq %rax, %r10 +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; AVX512-NEXT: orq %r15, %r11 +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; AVX512-NEXT: orq %rbp, %r12 +; AVX512-NEXT: movq %r8, 48(%rdi) +; AVX512-NEXT: movq %r13, 56(%rdi) +; AVX512-NEXT: movq %r9, 32(%rdi) +; AVX512-NEXT: movq %r10, 40(%rdi) +; AVX512-NEXT: movq %rsi, 
16(%rdi) +; AVX512-NEXT: movq %rcx, 24(%rdi) +; AVX512-NEXT: movq %rbx, (%rdi) +; AVX512-NEXT: movq %r11, 8(%rdi) +; AVX512-NEXT: setne %al +; AVX512-NEXT: addq $72, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res = or i512 %ld, %bit + %cmp = icmp ne i512 %test, 0 + store i512 %res, ptr %word + ret i1 %cmp +} + +define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { +; X86-LABEL: init_eq_i512: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $432, %esp # imm = 0x1B0 +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $60, %edx +; X86-NEXT: leal {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl %edx, %esi +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 56(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 60(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 52(%esi), %eax +; X86-NEXT: movl 48(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 4(%esi), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl 16(%ebp), %ebx +; X86-NEXT: movzbl %bl, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; 
X86-NEXT: leal {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $31, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: shldl %cl, %edi, %edx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%ebx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl 
%esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%ebx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %eax, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%ebx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %eax, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 56(%ebx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%ebx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 52(%ebx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%ebx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
+; X86-NEXT: andl %esi, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%ebx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%ebx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 60(%ebx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%ebx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %eax, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%ebx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%ebx), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; X86-NEXT: andl %edi, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%ebx), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 4(%ebx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl 56(%edi), %ebx +; X86-NEXT: movl 60(%edi), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %eax +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 52(%edi), %eax +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 
48(%edi), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl 40(%edi), %ebx +; X86-NEXT: movl 44(%edi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %ebx, %eax +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 36(%edi), %eax +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 32(%edi), %ebx +; X86-NEXT: shldl %cl, %ebx, %eax +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 28(%edi), %eax +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 24(%edi), %ebx +; X86-NEXT: shldl %cl, %ebx, %eax +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; 
X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 20(%edi), %eax +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 16(%edi), %ebx +; X86-NEXT: shldl %cl, %ebx, %eax +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 12(%edi), %eax +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl 8(%edi), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl 4(%edi), %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl (%edi), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: notl %edi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %eax +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 60(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 56(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 52(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 44(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 40(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 36(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 32(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 28(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 24(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 20(%eax) +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 16(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %esi, 48(%eax) +; X86-NEXT: sete %al +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: init_eq_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $216, %rsp +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $56, %esi +; SSE-NEXT: negl %esi +; SSE-NEXT: movslq %esi, %r10 +; SSE-NEXT: movq 184(%rsp,%r10), %r11 +; SSE-NEXT: movq 192(%rsp,%r10), %rsi +; SSE-NEXT: movq %rsi, %r13 +; SSE-NEXT: shldq %cl, %r11, %r13 +; SSE-NEXT: movq 200(%rsp,%r10), %r15 +; SSE-NEXT: shldq %cl, %rsi, %r15 +; SSE-NEXT: movq 168(%rsp,%r10), %rbx +; SSE-NEXT: movq 176(%rsp,%r10), %rsi +; SSE-NEXT: movq %rsi, %r14 +; SSE-NEXT: shldq %cl, %rbx, %r14 +; SSE-NEXT: shldq %cl, %rsi, %r11 +; SSE-NEXT: movq 152(%rsp,%r10), %rax +; SSE-NEXT: movq 160(%rsp,%r10), %r8 +; SSE-NEXT: movq %r8, %r12 +; SSE-NEXT: shldq %cl, %rax, %r12 +; SSE-NEXT: shldq %cl, %r8, %rbx +; SSE-NEXT: movq 144(%rsp,%r10), %r9 +; SSE-NEXT: movq %r9, %r8 +; SSE-NEXT: shlq %cl, 
%r8 +; SSE-NEXT: shldq %cl, %r9, %rax +; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movl %edx, %edx +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, (%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq 16(%rdi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 48(%rdi), %rsi +; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rsi, %r13 +; SSE-NEXT: andq %rdx, %r12 +; SSE-NEXT: orq %r13, %r12 +; SSE-NEXT: movq %r15, %rsi +; SSE-NEXT: movq 56(%rdi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rdx, %r15 +; SSE-NEXT: movq %rbx, %r13 +; SSE-NEXT: movq 24(%rdi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rdx, %rbx +; SSE-NEXT: orq %r15, %rbx +; SSE-NEXT: movq %r14, %rbp +; SSE-NEXT: movq 32(%rdi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rdx, %r14 +; SSE-NEXT: movq %r8, %r15 +; SSE-NEXT: movq (%rdi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rdx, %r8 +; SSE-NEXT: orq %r14, %r8 +; SSE-NEXT: orq %r12, %r8 +; SSE-NEXT: movq %r11, %r12 +; SSE-NEXT: movq 40(%rdi), %r9 +; SSE-NEXT: andq %r9, %r11 +; SSE-NEXT: movq %rax, %r14 +; SSE-NEXT: movq 8(%rdi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rdx, %rax +; SSE-NEXT: orq %r11, %rax +; SSE-NEXT: orq %rbx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: notq 
%rax +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: movq 56(%rsp,%r10), %r11 +; SSE-NEXT: movq 64(%rsp,%r10), %rax +; SSE-NEXT: movq %rax, %rbx +; SSE-NEXT: shldq %cl, %r11, %rbx +; SSE-NEXT: orq %rbx, %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: notq %rsi +; SSE-NEXT: movq 72(%rsp,%r10), %rbx +; SSE-NEXT: shldq %cl, %rax, %rbx +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; SSE-NEXT: orq %rbx, %rsi +; SSE-NEXT: notq %rbp +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; SSE-NEXT: movq 40(%rsp,%r10), %rax +; SSE-NEXT: movq 48(%rsp,%r10), %rdx +; SSE-NEXT: movq %rdx, %rbx +; SSE-NEXT: shldq %cl, %rax, %rbx +; SSE-NEXT: orq %rbx, %rbp +; SSE-NEXT: notq %r12 +; SSE-NEXT: andq %r9, %r12 +; SSE-NEXT: shldq %cl, %rdx, %r11 +; SSE-NEXT: movq 24(%rsp,%r10), %r9 +; SSE-NEXT: movq 32(%rsp,%r10), %rdx +; SSE-NEXT: movq %rdx, %rbx +; SSE-NEXT: shldq %cl, %r9, %rbx +; SSE-NEXT: orq %r11, %r12 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE-NEXT: notq %r11 +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: orq %rbx, %r11 +; SSE-NEXT: notq %r13 +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; SSE-NEXT: orq %rax, %r13 +; SSE-NEXT: notq %r15 +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; SSE-NEXT: movq 16(%rsp,%r10), %rax +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: shlq %cl, %rdx +; SSE-NEXT: orq %rdx, %r15 +; SSE-NEXT: notq %r14 +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shldq %cl, %rax, %r9 +; SSE-NEXT: orq %r9, %r14 +; SSE-NEXT: orq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: movq %rax, 48(%rdi) +; 
SSE-NEXT: movq %rsi, 56(%rdi) +; SSE-NEXT: movq %rbp, 32(%rdi) +; SSE-NEXT: movq %r12, 40(%rdi) +; SSE-NEXT: movq %r11, 16(%rdi) +; SSE-NEXT: movq %r13, 24(%rdi) +; SSE-NEXT: movq %r15, (%rdi) +; SSE-NEXT: movq %r14, 8(%rdi) +; SSE-NEXT: sete %al +; SSE-NEXT: addq $216, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: init_eq_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $200, %rsp +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %esi, %r8d +; AVX2-NEXT: andl $63, %r8d +; AVX2-NEXT: shrl $3, %esi +; AVX2-NEXT: andl $56, %esi +; AVX2-NEXT: negl %esi +; AVX2-NEXT: movslq %esi, %rsi +; AVX2-NEXT: movq 144(%rsp,%rsi), %r11 +; AVX2-NEXT: movq 152(%rsp,%rsi), %r12 +; AVX2-NEXT: movq %r12, %r10 +; AVX2-NEXT: movl %r8d, %ecx +; AVX2-NEXT: shldq %cl, %r11, %r10 +; AVX2-NEXT: movq 176(%rsp,%rsi), %r14 +; AVX2-NEXT: movq 184(%rsp,%rsi), %r9 +; AVX2-NEXT: shldq %cl, %r14, %r9 +; AVX2-NEXT: movq 160(%rsp,%rsi), %r15 +; AVX2-NEXT: movq 168(%rsp,%rsi), %r13 +; AVX2-NEXT: movq %r13, %rbx +; AVX2-NEXT: shldq %cl, %r15, %rbx +; AVX2-NEXT: movq 128(%rsp,%rsi), %rbp +; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 136(%rsp,%rsi), %rax +; AVX2-NEXT: shldq %cl, %rax, %r11 +; AVX2-NEXT: shldq %cl, %r13, %r14 +; AVX2-NEXT: shldq %cl, %r12, %r15 +; AVX2-NEXT: shldq %cl, %rbp, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl %edx, %edx +; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovups %xmm1, 
{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rdx, (%rsp) +; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movq 16(%rdi), %r12 +; AVX2-NEXT: movq 48(%rdi), %rbp +; AVX2-NEXT: movq 32(%rdi), %r13 +; AVX2-NEXT: andnq %r13, %r15, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r15, %r13 +; AVX2-NEXT: andnq %rbp, %r14, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r14, %rbp +; AVX2-NEXT: andnq %r12, %r11, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r11, %r12 +; AVX2-NEXT: movq 40(%rdi), %rax +; AVX2-NEXT: orq %rbp, %r12 +; AVX2-NEXT: andnq %rax, %rbx, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq %rax, %rbp +; AVX2-NEXT: andq %rbx, %rbp +; AVX2-NEXT: movq 56(%rdi), %rcx +; AVX2-NEXT: andnq %rcx, %r9, %rbx +; AVX2-NEXT: andq %r9, %rcx +; AVX2-NEXT: movq 24(%rdi), %rax +; AVX2-NEXT: andnq %rax, %r10, %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r10, %rax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: shlxq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX2-NEXT: movq (%rdi), %r10 +; AVX2-NEXT: andnq %r10, %rcx, %r15 +; AVX2-NEXT: andq %rcx, %r10 +; AVX2-NEXT: movq 40(%rsp,%rsi), %rdx +; AVX2-NEXT: movq 48(%rsp,%rsi), %r11 +; AVX2-NEXT: movq %r11, %r9 +; AVX2-NEXT: movl %r8d, %ecx +; AVX2-NEXT: shldq %cl, %rdx, %r9 +; AVX2-NEXT: orq %r13, %r10 +; AVX2-NEXT: orq %r12, %r10 +; AVX2-NEXT: movq 8(%rdi), %r13 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: andnq %r13, %rcx, %r12 +; AVX2-NEXT: andq %rcx, %r13 +; AVX2-NEXT: orq %rbp, %r13 +; AVX2-NEXT: orq %rax, %r13 +; AVX2-NEXT: movq 56(%rsp,%rsi), %rax +; AVX2-NEXT: movl %r8d, %ecx +; AVX2-NEXT: shldq %cl, %r11, %rax +; AVX2-NEXT: 
movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: orq %r9, %r14 +; AVX2-NEXT: orq %rax, %rbx +; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 24(%rsp,%rsi), %rax +; AVX2-NEXT: movq 32(%rsp,%rsi), %r9 +; AVX2-NEXT: movq %r9, %r11 +; AVX2-NEXT: shldq %cl, %rax, %r11 +; AVX2-NEXT: shldq %cl, %r9, %rdx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX2-NEXT: orq %r11, %rbp +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX2-NEXT: orq %rdx, %rbx +; AVX2-NEXT: movq 8(%rsp,%rsi), %rdx +; AVX2-NEXT: movq 16(%rsp,%rsi), %r9 +; AVX2-NEXT: movq %r9, %r11 +; AVX2-NEXT: shldq %cl, %rdx, %r11 +; AVX2-NEXT: shldq %cl, %r9, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX2-NEXT: orq %r11, %r9 +; AVX2-NEXT: movq (%rsp,%rsi), %rsi +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX2-NEXT: orq %rax, %r11 +; AVX2-NEXT: shlxq %r8, %rsi, %rax +; AVX2-NEXT: shldq %cl, %rsi, %rdx +; AVX2-NEXT: orq %rax, %r15 +; AVX2-NEXT: orq %rdx, %r12 +; AVX2-NEXT: orq %r10, %r13 +; AVX2-NEXT: movq %r14, 48(%rdi) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: movq %rax, 56(%rdi) +; AVX2-NEXT: movq %rbp, 32(%rdi) +; AVX2-NEXT: movq %rbx, 40(%rdi) +; AVX2-NEXT: movq %r9, 16(%rdi) +; AVX2-NEXT: movq %r11, 24(%rdi) +; AVX2-NEXT: movq %r15, (%rdi) +; AVX2-NEXT: movq %r12, 8(%rdi) +; AVX2-NEXT: sete %al +; AVX2-NEXT: addq $200, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: init_eq_i512: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $184, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %ymm0, 
{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0] +; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: andl $63, %ecx +; AVX512-NEXT: shrl $3, %esi +; AVX512-NEXT: andl $56, %esi +; AVX512-NEXT: negl %esi +; AVX512-NEXT: movslq %esi, %rsi +; AVX512-NEXT: movq 128(%rsp,%rsi), %r10 +; AVX512-NEXT: movq 136(%rsp,%rsi), %r12 +; AVX512-NEXT: movq %r12, %rax +; AVX512-NEXT: shldq %cl, %r10, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 160(%rsp,%rsi), %r14 +; AVX512-NEXT: movq 168(%rsp,%rsi), %rax +; AVX512-NEXT: shldq %cl, %r14, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 144(%rsp,%rsi), %r15 +; AVX512-NEXT: movq 152(%rsp,%rsi), %r11 +; AVX512-NEXT: movq %r11, %rbx +; AVX512-NEXT: shldq %cl, %r15, %rbx +; AVX512-NEXT: movq 120(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rax, %r10 +; AVX512-NEXT: shldq %cl, %r11, %r14 +; AVX512-NEXT: movq %rdi, %r9 +; AVX512-NEXT: movq 112(%rsp,%rsi), %r11 +; AVX512-NEXT: shldq %cl, %r12, %r15 +; AVX512-NEXT: movl %edx, %edx +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movq 16(%rdi), %r12 +; AVX512-NEXT: movq 48(%rdi), %r13 +; AVX512-NEXT: movq 32(%rdi), %rbp +; AVX512-NEXT: andnq %rbp, %r15, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r15, %rbp +; AVX512-NEXT: andnq %r13, %r14, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r14, %r13 +; 
AVX512-NEXT: andnq %r12, %r10, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r10, %r12 +; AVX512-NEXT: movq 40(%rdi), %r8 +; AVX512-NEXT: orq %r13, %r12 +; AVX512-NEXT: andnq %r8, %rbx, %rdi +; AVX512-NEXT: andq %rbx, %r8 +; AVX512-NEXT: movq 56(%r9), %r13 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX512-NEXT: andnq %r13, %rdx, %r10 +; AVX512-NEXT: andq %rdx, %r13 +; AVX512-NEXT: movq 24(%r9), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX512-NEXT: andnq %rax, %rdx, %r15 +; AVX512-NEXT: andq %rdx, %rax +; AVX512-NEXT: orq %r13, %rax +; AVX512-NEXT: shlxq %rcx, %r11, %r13 +; AVX512-NEXT: movq (%r9), %rdx +; AVX512-NEXT: andnq %rdx, %r13, %r14 +; AVX512-NEXT: andq %r13, %rdx +; AVX512-NEXT: orq %rbp, %rdx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r11, %rbp +; AVX512-NEXT: orq %r12, %rdx +; AVX512-NEXT: movq 8(%r9), %r13 +; AVX512-NEXT: andnq %r13, %rbp, %rbx +; AVX512-NEXT: andq %rbp, %r13 +; AVX512-NEXT: orq %r8, %r13 +; AVX512-NEXT: movq 24(%rsp,%rsi), %r8 +; AVX512-NEXT: orq %rax, %r13 +; AVX512-NEXT: movq 32(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, %r12 +; AVX512-NEXT: shldq %cl, %r8, %r12 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX512-NEXT: orq %r12, %r11 +; AVX512-NEXT: movq 40(%rsp,%rsi), %r12 +; AVX512-NEXT: shldq %cl, %rax, %r12 +; AVX512-NEXT: orq %r12, %r10 +; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 8(%rsp,%rsi), %rax +; AVX512-NEXT: movq 16(%rsp,%rsi), %r12 +; AVX512-NEXT: movq %r12, %rbp +; AVX512-NEXT: shldq %cl, %rax, %rbp +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: orq %rbp, %r10 +; AVX512-NEXT: shldq %cl, %r12, %r8 +; AVX512-NEXT: orq %r8, %rdi +; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq -8(%rsp,%rsi), %r8 +; AVX512-NEXT: movq 
(%rsp,%rsi), %r12 +; AVX512-NEXT: movq %r12, %rbp +; AVX512-NEXT: shldq %cl, %r8, %rbp +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX512-NEXT: orq %rbp, %rdi +; AVX512-NEXT: movq -16(%rsp,%rsi), %rsi +; AVX512-NEXT: shldq %cl, %r12, %rax +; AVX512-NEXT: orq %rax, %r15 +; AVX512-NEXT: shlxq %rcx, %rsi, %rax +; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX512-NEXT: shldq %cl, %rsi, %r8 +; AVX512-NEXT: orq %rax, %r14 +; AVX512-NEXT: orq %r8, %rbx +; AVX512-NEXT: orq %rdx, %r13 +; AVX512-NEXT: movq %r11, 48(%r9) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 56(%r9) +; AVX512-NEXT: movq %r10, 32(%r9) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 40(%r9) +; AVX512-NEXT: movq %rdi, 16(%r9) +; AVX512-NEXT: movq %r15, 24(%r9) +; AVX512-NEXT: movq %r14, (%r9) +; AVX512-NEXT: movq %rbx, 8(%r9) +; AVX512-NEXT: sete %al +; AVX512-NEXT: addq $184, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %mask = xor i512 %bit, -1 + %val0 = zext i1 %value to i512 + %val = shl nuw i512 %val0, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res0 = and i512 %ld, %mask + %res = or i512 %res0, %val + %cmp = icmp eq i512 %test, 0 + store i512 %res, ptr %word + ret i1 %cmp +} + +; i4096 + +define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { +; X86-LABEL: test_ne_i4096: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $1792, %esp # imm = 0x700 +; X86-NEXT: movl 12(%ebp), %ebx +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: shrl $3, %ecx +; 
X86-NEXT: andl $508, %ecx # imm = 0x1FC +; X86-NEXT: leal {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl %ecx, %esi +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 248(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 252(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $31, %ebx +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 504(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 508(%esi), %edx +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 120(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 124(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 376(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 380(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 184(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 188(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 440(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 444(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 
56(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 60(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 312(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 316(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 216(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 220(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 472(%esi), %edi +; X86-NEXT: movl 476(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 88(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 92(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 344(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 348(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 152(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 156(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 
408(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 412(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 280(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 284(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 232(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 236(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 488(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 492(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 104(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 108(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 360(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 364(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl 
%eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 168(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 172(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 424(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 428(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 296(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 300(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 200(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 204(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 456(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 460(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 72(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 76(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 328(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 332(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 136(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 140(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 392(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 396(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 264(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 268(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 240(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 244(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 496(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 
500(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 112(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 116(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 368(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 372(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 176(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 180(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 432(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 436(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 52(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 304(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 308(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 208(%esi), %edx +; X86-NEXT: movl 
%edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 212(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 464(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 468(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 84(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 336(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 340(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 144(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 148(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 400(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 404(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; X86-NEXT: movl 272(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 276(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 224(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 228(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 480(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 484(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 96(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 100(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 352(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 356(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 160(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 164(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 416(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 420(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl 
%cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 288(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 292(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 192(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 196(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 448(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 452(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 64(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 68(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 320(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 324(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 128(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 132(%esi), %eax +; X86-NEXT: movl 
%eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl 256(%esi), %edi +; X86-NEXT: movl 260(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 388(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 4(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, 
%eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $1, %eax, %edi +; X86-NEXT: shrl %eax +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: notb %cl +; X86-NEXT: shrdl %cl, %eax, %edi +; X86-NEXT: shrl %cl, %ebx +; X86-NEXT: movb $32, %cl +; X86-NEXT: testb %cl, %cl +; X86-NEXT: movl (%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: jne .LBB20_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: .LBB20_2: +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shll %cl, %edx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 320(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 64(%eax), %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 448(%eax), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 192(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 288(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 32(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 416(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 160(%eax), %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 352(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 96(%eax), %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 480(%eax), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 224(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: orl %edi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 272(%eax), %ecx +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 16(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 400(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 144(%eax), %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 336(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 80(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 464(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 208(%eax), %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 304(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 48(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 432(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 176(%eax), %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 368(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 112(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 496(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: andl 240(%eax), %ebx +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 
4-byte Reload +; X86-NEXT: andl 264(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 8(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 392(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 136(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 328(%ebx), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 72(%ebx), %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 456(%ebx), %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 200(%ebx), %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 296(%ebx), %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 40(%ebx), %eax +; X86-NEXT: orl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 424(%ebx), %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 168(%ebx), %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 360(%ebx), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 104(%ebx), %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 488(%ebx), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 232(%ebx), %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl 
%edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 280(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 24(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 408(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 152(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 344(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 88(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 472(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 216(%ebx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 312(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 56(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 440(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 184(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 376(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 120(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 504(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 248(%ebx), %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: orl %ecx, %edi +; 
X86-NEXT: orl %edx, %edi +; X86-NEXT: orl %esi, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 324(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 68(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 452(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 196(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 292(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 36(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 420(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 164(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 356(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 100(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 484(%ebx), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: movl (%eax), %eax -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: andl 4(%edx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: movl %esi, %edi +; X86-NEXT: andl 228(%ebx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 276(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 20(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 404(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 148(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 340(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 84(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 468(%ebx), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: andl (%edx), %esi +; X86-NEXT: andl 212(%ebx), %esi ; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %edx, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%edx,%eax), %eax -; X86-NEXT: btl %ecx, %eax +; X86-NEXT: andl 308(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 52(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 60(%edx) +; X86-NEXT: andl 436(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 180(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 56(%edx) +; X86-NEXT: andl 372(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 116(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 52(%edx) +; X86-NEXT: andl 
500(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 244(%ebx), %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 48(%edx) +; X86-NEXT: andl 268(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 12(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 44(%edx) +; X86-NEXT: andl 396(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 140(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) +; X86-NEXT: andl 332(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 76(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 36(%edx) +; X86-NEXT: andl 460(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 204(%ebx), %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: orl %edx, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 32(%edx) +; X86-NEXT: andl 300(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 44(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 28(%edx) +; X86-NEXT: andl 428(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 172(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
4-byte Reload -; X86-NEXT: movl %eax, 24(%edx) +; X86-NEXT: andl 364(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 108(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: andl 492(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 236(%ebx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) +; X86-NEXT: andl 284(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 28(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl %ebx, 8(%edx) -; X86-NEXT: movl %edi, 4(%edx) -; X86-NEXT: movl %esi, (%edx) -; X86-NEXT: setae %al +; X86-NEXT: andl 412(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 156(%ebx), %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 348(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 92(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 476(%ebx), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 220(%ebx), %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 316(%ebx), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 60(%ebx), %edx +; X86-NEXT: orl 
%ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 444(%ebx), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 188(%ebx), %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 380(%ebx), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 124(%ebx), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 508(%ebx), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: andl 252(%esi), %ebx +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: orl %eax, %ebx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: negl %ecx +; X86-NEXT: movl 1648(%esp,%ecx), %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: andl 128(%edx), %ecx +; X86-NEXT: andl 384(%edx), %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %eax +; X86-NEXT: andl (%edx), %eax +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 256(%edx), %eax +; X86-NEXT: 
orl %eax, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 260(%edx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 4(%edx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 132(%edx), %eax +; X86-NEXT: andl 388(%edx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl %ebx, %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: setne %al ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -1278,7 +5483,7 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: init_eq_i512: +; SSE-LABEL: test_ne_i4096: ; SSE: # %bb.0: ; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %r15 @@ -1286,117 +5491,524 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; SSE-NEXT: pushq %r13 ; SSE-NEXT: pushq %r12 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $184, %rsp -; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: subq $1576, %rsp # imm = 0x628 +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl %esi, %eax +; SSE-NEXT: andl $4032, %eax # imm = 0xFC0 ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) ; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) ; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, 
{{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; 
SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) ; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx ; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: movl %esi, %eax ; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: # kill: def $eax killed $eax killed $rax -; SSE-NEXT: andl $56, %eax ; SSE-NEXT: negl %eax -; SSE-NEXT: movslq %eax, %r12 -; SSE-NEXT: movq 160(%rsp,%r12), %rax -; SSE-NEXT: movq 168(%rsp,%r12), %r10 -; SSE-NEXT: shldq %cl, %rax, %r10 -; SSE-NEXT: movq 152(%rsp,%r12), %rsi -; SSE-NEXT: shldq %cl, %rsi, %rax +; SSE-NEXT: movslq %eax, %rsi +; SSE-NEXT: movq 1296(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1304(%rsp,%rsi), %rax ; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 144(%rsp,%r12), %r11 -; SSE-NEXT: shldq %cl, %r11, %rsi -; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 136(%rsp,%r12), %rbx -; SSE-NEXT: shldq %cl, %rbx, %r11 -; SSE-NEXT: movq 128(%rsp,%r12), %r14 -; SSE-NEXT: shldq %cl, %r14, %rbx -; SSE-NEXT: movq 120(%rsp,%r12), %r15 -; SSE-NEXT: shldq %cl, %r15, %r14 -; SSE-NEXT: movq 112(%rsp,%r12), %r13 -; SSE-NEXT: shldq %cl, %r13, %r15 -; SSE-NEXT: shlq %cl, %r13 -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) 
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq 32(%rsp,%r12), %rax -; SSE-NEXT: movq 40(%rsp,%r12), %rdx +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1552(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1560(%rsp,%rsi), %rax +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1168(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1176(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1424(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1432(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1232(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1240(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1488(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1496(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1104(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill +; SSE-NEXT: movq 1112(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1360(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, (%rsp) # 8-byte Spill +; SSE-NEXT: movq 1368(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1264(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1272(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1520(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1528(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1136(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1144(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1392(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1400(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1200(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1208(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1456(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1464(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1072(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1080(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1328(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1336(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1280(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1288(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1536(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1544(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1152(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1160(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1408(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: 
movq 1416(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1216(%rsp,%rsi), %r11 +; SSE-NEXT: movq 1224(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %r11, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1472(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1480(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1088(%rsp,%rsi), %r9 +; SSE-NEXT: movq 1096(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %r9, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1344(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1352(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1248(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1256(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE-NEXT: shldq %cl, %rax, %rdx ; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 24(%rsp,%r12), %rdx +; SSE-NEXT: movq 1504(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1512(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq 16(%rsp,%r12), %rsi -; SSE-NEXT: shldq %cl, %rsi, %rdx -; SSE-NEXT: movq 8(%rsp,%r12), %r8 -; SSE-NEXT: 
shldq %cl, %r8, %rsi -; SSE-NEXT: movq (%rsp,%r12), %rbp -; SSE-NEXT: shldq %cl, %rbp, %r8 -; SSE-NEXT: movq -8(%rsp,%r12), %r9 -; SSE-NEXT: shldq %cl, %r9, %rbp -; SSE-NEXT: notq %r10 -; SSE-NEXT: andq 56(%rdi), %r10 -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE-NEXT: notq %r10 -; SSE-NEXT: andq 48(%rdi), %r10 -; SSE-NEXT: orq %rax, %r10 +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1120(%rsp,%rsi), %rax +; SSE-NEXT: movq 1128(%rsp,%rsi), %r8 +; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rax, %r8 +; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1376(%rsp,%rsi), %r13 +; SSE-NEXT: movq 1384(%rsp,%rsi), %rbx +; SSE-NEXT: movq %rbx, %r8 +; SSE-NEXT: shldq %cl, %r13, %r8 +; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1184(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1192(%rsp,%rsi), %r15 +; SSE-NEXT: movq %r15, %r14 +; SSE-NEXT: shldq %cl, %rdx, %r14 +; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1440(%rsp,%rsi), %r10 +; SSE-NEXT: movq 1448(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, %r14 +; SSE-NEXT: shldq %cl, %r10, %r14 +; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1312(%rsp,%rsi), %r14 +; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1320(%rsp,%rsi), %rbp +; SSE-NEXT: movq %rbp, %r12 +; SSE-NEXT: shldq %cl, %r14, %r12 +; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, (%rsp) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq 1064(%rsp,%rsi), %rbx +; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; SSE-NEXT: shldq %cl, %rbp, %r14 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) 
# 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: shldq %cl, %rdx, %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r15, %rdx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r15, %r9 +; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r15, %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r15, %rbp +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r15, %r9 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r15, %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r15, %r13 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r12, %r15 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r12, %r10 +; SSE-NEXT: andq 384(%rdi), %r10 +; SSE-NEXT: andq 128(%rdi), %r15 +; SSE-NEXT: andq 320(%rdi), %r13 +; SSE-NEXT: andq 64(%rdi), %rax +; SSE-NEXT: orq %r10, %r15 +; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: orq %r13, %rax +; SSE-NEXT: andq 448(%rdi), %r9 +; SSE-NEXT: andq 
192(%rdi), %rbp +; SSE-NEXT: orq %r9, %rbp +; SSE-NEXT: orq %rax, %rbp +; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq 288(%rdi), %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; SSE-NEXT: andq 32(%rdi), %r9 +; SSE-NEXT: andq 416(%rdi), %rdx +; SSE-NEXT: andq 160(%rdi), %r11 +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: orq %rdx, %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: andq 352(%rdi), %rdx +; SSE-NEXT: orq %r9, %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 96(%rdi), %rax +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 480(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 224(%rdi), %r8 +; SSE-NEXT: orq %rax, %r8 +; SSE-NEXT: orq %rdx, %r8 +; SSE-NEXT: andq 272(%rdi), %r14 +; SSE-NEXT: orq %r11, %r8 +; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: notq %rax -; SSE-NEXT: andq 40(%rdi), %rax +; SSE-NEXT: andq 16(%rdi), %rax +; SSE-NEXT: orq %r14, %rax +; SSE-NEXT: movq %rax, %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: andq 400(%rdi), %rdx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 144(%rdi), %rax +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: orq %r8, %rax +; SSE-NEXT: movq %rax, %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; SSE-NEXT: andq 336(%rdi), %r9 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 80(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: andq 464(%rdi), %rdx +; SSE-NEXT: orq %r9, %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE-NEXT: andq 208(%rdi), %r11 +; SSE-NEXT: orq %rdx, %r11 +; SSE-NEXT: orq 
%rax, %r11 +; SSE-NEXT: orq %r8, %r11 +; SSE-NEXT: movq (%rsp), %rdx # 8-byte Reload +; SSE-NEXT: andq 304(%rdi), %rdx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 48(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; SSE-NEXT: andq 432(%rdi), %r9 +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: movq %rax, %r10 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 176(%rdi), %r8 +; SSE-NEXT: orq %r9, %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; SSE-NEXT: andq 368(%rdi), %r9 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 112(%rdi), %rax +; SSE-NEXT: orq %r10, %r8 +; SSE-NEXT: movq %r8, %r10 +; SSE-NEXT: orq %r9, %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 496(%rdi), %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; SSE-NEXT: andq 240(%rdi), %rbp +; SSE-NEXT: orq %r8, %rbp +; SSE-NEXT: orq %rax, %rbp +; SSE-NEXT: orq %r10, %rbp +; SSE-NEXT: orq %r11, %rbp +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 392(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; SSE-NEXT: andq 136(%rdi), %r12 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: andq 328(%rdi), %rdx +; SSE-NEXT: orq %rax, %r12 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 72(%rdi), %rax ; SSE-NEXT: orq %rdx, %rax ; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: notq %r11 -; SSE-NEXT: andq 32(%rdi), %r11 -; SSE-NEXT: orq %rsi, %r11 -; SSE-NEXT: notq %rbx -; SSE-NEXT: andq 24(%rdi), %rbx -; SSE-NEXT: orq %r8, %rbx -; SSE-NEXT: notq %r14 -; SSE-NEXT: andq 16(%rdi), %r14 -; SSE-NEXT: orq %rbp, %r14 -; SSE-NEXT: notq %r15 -; SSE-NEXT: movq -16(%rsp,%r12), %rax -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: andq 8(%rdi), %r15 -; SSE-NEXT: orq %r9, %r15 -; SSE-NEXT: notq %r13 -; SSE-NEXT: 
# kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: andq (%rdi), %r13 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 456(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; SSE-NEXT: andq 200(%rdi), %r13 ; SSE-NEXT: orq %rax, %r13 +; SSE-NEXT: orq %rdx, %r13 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: andq 296(%rdi), %rdx ; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andl $60, %eax -; SSE-NEXT: movl (%rdi,%rax), %eax -; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; SSE-NEXT: btl %ecx, %eax +; SSE-NEXT: andq 40(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 424(%rdi), %r8 +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: movq %rax, %r9 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: andq 168(%rdi), %rdx +; SSE-NEXT: orq %r8, %rdx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 360(%rdi), %r8 ; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: movq %rax, 56(%rdi) -; SSE-NEXT: movq %r10, 48(%rdi) -; SSE-NEXT: movq %rdx, 40(%rdi) -; SSE-NEXT: movq %r11, 32(%rdi) -; SSE-NEXT: movq %rbx, 24(%rdi) -; SSE-NEXT: movq %r14, 16(%rdi) -; SSE-NEXT: movq %r15, 8(%rdi) -; SSE-NEXT: movq %r13, (%rdi) -; SSE-NEXT: setae %al -; SSE-NEXT: addq $184, %rsp +; SSE-NEXT: andq 104(%rdi), %rax +; SSE-NEXT: orq %r9, %rdx +; SSE-NEXT: orq %r8, %rax +; SSE-NEXT: movq %rax, %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 488(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: andq 232(%rdi), %r15 +; SSE-NEXT: orq %rax, %r15 +; SSE-NEXT: orq %r8, %r15 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 280(%rdi), %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; 
SSE-NEXT: andq 24(%rdi), %rax +; SSE-NEXT: orq %rdx, %r15 +; SSE-NEXT: orq %r8, %rax +; SSE-NEXT: movq %rax, %r10 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 408(%rdi), %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 152(%rdi), %rax +; SSE-NEXT: orq %r8, %rax +; SSE-NEXT: orq %r10, %rax +; SSE-NEXT: movq %rax, %r10 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE-NEXT: andq 344(%rdi), %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 88(%rdi), %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 472(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; SSE-NEXT: andq 216(%rdi), %r14 +; SSE-NEXT: orq %r11, %r8 +; SSE-NEXT: orq %rax, %r14 +; SSE-NEXT: orq %r8, %r14 +; SSE-NEXT: orq %r10, %r14 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE-NEXT: andq 312(%rdi), %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE-NEXT: andq 56(%rdi), %r10 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 440(%rdi), %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; SSE-NEXT: andq 184(%rdi), %r9 +; SSE-NEXT: orq %r11, %r10 +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: orq %r10, %r9 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: shldq %cl, %rax, %rdx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE-NEXT: andq 376(%rdi), %r10 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 120(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE-NEXT: andq 504(%rdi), %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 248(%rdi), %r8 +; SSE-NEXT: orq %r10, %rax +; SSE-NEXT: movq %rax, %r10 +; SSE-NEXT: 
orq %r11, %r8 +; SSE-NEXT: movq 1056(%rsp,%rsi), %rax +; SSE-NEXT: shldq %cl, %rax, %rbx +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shlq %cl, %rax +; SSE-NEXT: orq %r10, %r8 +; SSE-NEXT: orq %r9, %r8 +; SSE-NEXT: andq 256(%rdi), %rdx +; SSE-NEXT: orq %r14, %r8 +; SSE-NEXT: andq (%rdi), %rax +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; SSE-NEXT: orq %rbp, %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE-NEXT: andq 264(%rdi), %rcx +; SSE-NEXT: andq 8(%rdi), %rbx +; SSE-NEXT: orq %rcx, %rbx +; SSE-NEXT: orq %r12, %rbx +; SSE-NEXT: orq %r13, %rbx +; SSE-NEXT: orq %r15, %rbx +; SSE-NEXT: orq %r8, %rbx +; SSE-NEXT: orq %rax, %rbx +; SSE-NEXT: setne %al +; SSE-NEXT: addq $1576, %rsp # imm = 0x628 ; SSE-NEXT: popq %rbx ; SSE-NEXT: popq %r12 ; SSE-NEXT: popq %r13 @@ -1405,7 +6017,7 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; -; AVX2-LABEL: init_eq_i512: +; AVX2-LABEL: test_ne_i4096: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbp ; AVX2-NEXT: pushq %r15 @@ -1413,103 +6025,490 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; AVX2-NEXT: pushq %r13 ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $168, %rsp +; AVX2-NEXT: subq $1560, %rsp # imm = 0x618 +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: movl %esi, %eax ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: vmovups %ymm0, 
{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $4032, %eax # imm = 0xFC0 +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: movl %esi, %r11d -; AVX2-NEXT: shrl $3, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: andl $56, %eax +; AVX2-NEXT: shrl $3, %eax ; AVX2-NEXT: negl %eax -; AVX2-NEXT: movslq %eax, %r10 -; AVX2-NEXT: movq 104(%rsp,%r10), %r15 -; AVX2-NEXT: movq 112(%rsp,%r10), %rax -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: shldq %cl, %r15, %rsi -; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 120(%rsp,%r10), %rsi -; AVX2-NEXT: movq %rsi, %r8 -; AVX2-NEXT: shldq %cl, %rax, %r8 -; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill -; AVX2-NEXT: movq 128(%rsp,%r10), %rax -; AVX2-NEXT: movq %rax, %rbx -; AVX2-NEXT: shldq %cl, %rsi, %rbx -; AVX2-NEXT: movq 136(%rsp,%r10), %rsi -; AVX2-NEXT: movq %rsi, %r14 -; AVX2-NEXT: shldq %cl, %rax, %r14 -; AVX2-NEXT: movq 144(%rsp,%r10), %rax -; AVX2-NEXT: movq %rax, %r12 -; AVX2-NEXT: shldq %cl, %rsi, %r12 -; AVX2-NEXT: movq 96(%rsp,%r10), %rsi -; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 152(%rsp,%r10), %r13 -; AVX2-NEXT: shldq %cl, %rax, %r13 -; AVX2-NEXT: shldq %cl, %rsi, %r15 -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 16(%rsp,%r10), %rbp -; AVX2-NEXT: movq 24(%rsp,%r10), %r9 -; AVX2-NEXT: shldq %cl, %rbp, %r9 -; AVX2-NEXT: movq 8(%rsp,%r10), %rdx -; AVX2-NEXT: shldq %cl, %rdx, %rbp -; AVX2-NEXT: movq (%rsp,%r10), %rax +; AVX2-NEXT: movslq %eax, %rsi +; AVX2-NEXT: movq 1280(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1288(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1536(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1544(%rsp,%rsi), %rax +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1152(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1160(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1408(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1416(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1216(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, (%rsp) # 8-byte Spill +; AVX2-NEXT: movq 1224(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1472(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1480(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1088(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1096(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1344(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1352(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1248(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1256(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1504(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte 
Spill +; AVX2-NEXT: movq 1512(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1120(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1128(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1376(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1384(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1184(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1192(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1440(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1448(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1056(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1064(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1312(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1320(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; 
AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1264(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1272(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1520(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1528(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1136(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1144(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1392(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1400(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1200(%rsp,%rsi), %r11 +; AVX2-NEXT: movq 1208(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %r11, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1456(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1464(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1072(%rsp,%rsi), %r12 +; AVX2-NEXT: movq 1080(%rsp,%rsi), %rax +; 
AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %r12, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1328(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1336(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1232(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1240(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: movq -8(%rsp,%r10), %r8 -; AVX2-NEXT: shldq %cl, %r8, %rax -; AVX2-NEXT: movq -16(%rsp,%r10), %rsi -; AVX2-NEXT: shldq %cl, %rsi, %r8 -; AVX2-NEXT: andnq 56(%rdi), %r13, %r13 -; AVX2-NEXT: orq %r9, %r13 -; AVX2-NEXT: movq -24(%rsp,%r10), %r9 -; AVX2-NEXT: shldq %cl, %r9, %rsi -; AVX2-NEXT: andnq 48(%rdi), %r12, %r12 -; AVX2-NEXT: andnq 40(%rdi), %r14, %r14 -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: orq %rdx, %r14 -; AVX2-NEXT: andnq 32(%rdi), %rbx, %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; AVX2-NEXT: movq -32(%rsp,%r10), %r10 -; AVX2-NEXT: shlxq %rcx, %r10, %rbx -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %r10, %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: andnq 24(%rdi), %rcx, %rcx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1488(%rsp,%rsi), %rbp +; AVX2-NEXT: movq 1496(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rbp, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1104(%rsp,%rsi), %rax +; AVX2-NEXT: movq 1112(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rax, %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1360(%rsp,%rsi), %r10 +; AVX2-NEXT: movq 1368(%rsp,%rsi), %r8 +; AVX2-NEXT: movq %r8, %rdx +; AVX2-NEXT: shldq %cl, %r10, %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1168(%rsp,%rsi), %r9 +; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1176(%rsp,%rsi), %rbx +; AVX2-NEXT: movq %rbx, %rdx +; AVX2-NEXT: shldq %cl, %r9, %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1424(%rsp,%rsi), %r9 +; AVX2-NEXT: movq 1432(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, %r14 +; AVX2-NEXT: shldq %cl, %r9, %r14 +; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1296(%rsp,%rsi), %r15 +; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1304(%rsp,%rsi), %r14 +; AVX2-NEXT: movq %r14, %r13 +; AVX2-NEXT: shldq %cl, %r15, %r13 +; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, (%rsp) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq 1048(%rsp,%rsi), %rdx +; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, %rbx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, %r11 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, %r12 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, %r13 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, %rbp +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, %r14 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, %r9 +; AVX2-NEXT: andq 384(%rdi), %r9 +; AVX2-NEXT: andq 128(%rdi), %r14 +; AVX2-NEXT: andq 320(%rdi), %r10 +; AVX2-NEXT: orq %r9, %r14 +; AVX2-NEXT: movq %r14, %r15 +; AVX2-NEXT: andq 64(%rdi), %rax +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: andq 448(%rdi), %rbp +; AVX2-NEXT: andq 192(%rdi), %r13 +; AVX2-NEXT: orq %rbp, %r13 +; AVX2-NEXT: orq %rax, %r13 +; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq 288(%rdi), %r8 +; AVX2-NEXT: andq 32(%rdi), %r12 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 416(%rdi), %rax +; AVX2-NEXT: orq %r8, %r12 +; AVX2-NEXT: andq 160(%rdi), %r11 +; AVX2-NEXT: orq %rax, %r11 +; AVX2-NEXT: andq 352(%rdi), %rbx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 96(%rdi), %rax +; AVX2-NEXT: orq %r12, %r11 +; AVX2-NEXT: orq %rbx, %rax ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andnq 16(%rdi), %r10, %r10 -; AVX2-NEXT: orq %r8, %rcx 
-; AVX2-NEXT: orq %rsi, %r10 -; AVX2-NEXT: andnq 8(%rdi), %r15, %rsi -; AVX2-NEXT: orq %r9, %rsi -; AVX2-NEXT: andnq (%rdi), %rax, %rax +; AVX2-NEXT: andq 480(%rdi), %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: andq 224(%rdi), %r13 +; AVX2-NEXT: orq %r10, %r13 +; AVX2-NEXT: orq %rax, %r13 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 272(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 16(%rdi), %rax +; AVX2-NEXT: orq %r11, %r13 +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX2-NEXT: andq 400(%rdi), %r9 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 144(%rdi), %rax +; AVX2-NEXT: orq %r9, %rax +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: movq %rax, %r9 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: andq 336(%rdi), %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 80(%rdi), %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 464(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX2-NEXT: andq 208(%rdi), %r11 +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: orq %r8, %r11 +; AVX2-NEXT: orq %rax, %r11 +; AVX2-NEXT: orq %r9, %r11 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX2-NEXT: andq 304(%rdi), %r9 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 48(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: andq 432(%rdi), %r10 +; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload +; AVX2-NEXT: andq 176(%rdi), %rax +; AVX2-NEXT: orq %r9, %r8 +; AVX2-NEXT: movq %r8, %r9 +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 368(%rdi), %r8 +; AVX2-NEXT: orq 
%r9, %rax +; AVX2-NEXT: movq %rax, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 112(%rdi), %rax +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 496(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX2-NEXT: andq 240(%rdi), %r9 +; AVX2-NEXT: orq %r8, %r9 +; AVX2-NEXT: orq %rax, %r9 +; AVX2-NEXT: orq %r10, %r9 +; AVX2-NEXT: orq %r11, %r9 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: andq 392(%rdi), %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX2-NEXT: andq 136(%rdi), %rbp +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 328(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 72(%rdi), %rax +; AVX2-NEXT: orq %r10, %rbp +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 456(%rdi), %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; AVX2-NEXT: andq 200(%rdi), %r12 +; AVX2-NEXT: orq %rax, %r12 +; AVX2-NEXT: orq %r8, %r12 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: andq 296(%rdi), %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 40(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX2-NEXT: andq 424(%rdi), %r11 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 168(%rdi), %rax +; AVX2-NEXT: orq %r10, %r8 +; AVX2-NEXT: movq %r8, %r10 +; AVX2-NEXT: orq %r11, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 360(%rdi), %r8 +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: movq %rax, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 104(%rdi), %rax +; AVX2-NEXT: orq %r8, %rax 
+; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 488(%rdi), %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: andq 232(%rdi), %r14 +; AVX2-NEXT: orq %rax, %r14 +; AVX2-NEXT: orq %r8, %r14 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 280(%rdi), %r8 +; AVX2-NEXT: orq %r10, %r14 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 24(%rdi), %rax +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: movq %rax, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 408(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 152(%rdi), %rax +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: movq %rax, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX2-NEXT: andq 344(%rdi), %r11 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 88(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 472(%rdi), %rax +; AVX2-NEXT: orq %r11, %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX2-NEXT: andq 216(%rdi), %rbx +; AVX2-NEXT: orq %rax, %rbx +; AVX2-NEXT: orq %r8, %rbx +; AVX2-NEXT: orq %r10, %rbx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 312(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 56(%rdi), %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: andq 440(%rdi), %r10 +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: movq %rax, %r11 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 184(%rdi), %r8 +; AVX2-NEXT: orq %r10, %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: andq 376(%rdi), %r10 +; AVX2-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 120(%rdi), %rax +; AVX2-NEXT: orq %r11, %r8 +; AVX2-NEXT: movq %r8, %r11 +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: movq %rax, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 504(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 248(%rdi), %rax +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r8, %r10 +; AVX2-NEXT: orq %r11, %rax +; AVX2-NEXT: movq 1040(%rsp,%rsi), %rsi ; AVX2-NEXT: orq %rbx, %rax -; AVX2-NEXT: andl $60, %r11d -; AVX2-NEXT: movl (%rdi,%r11), %r8d -; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload -; AVX2-NEXT: btl %r9d, %r8d -; AVX2-NEXT: movq %r13, 56(%rdi) -; AVX2-NEXT: movq %r12, 48(%rdi) -; AVX2-NEXT: movq %r14, 40(%rdi) -; AVX2-NEXT: movq %rdx, 32(%rdi) -; AVX2-NEXT: movq %rcx, 24(%rdi) -; AVX2-NEXT: movq %r10, 16(%rdi) -; AVX2-NEXT: movq %rsi, 8(%rdi) -; AVX2-NEXT: movq %rax, (%rdi) -; AVX2-NEXT: setae %al -; AVX2-NEXT: addq $168, %rsp +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: shlxq %rcx, %rsi, %rax +; AVX2-NEXT: andq 256(%rdi), %r10 +; AVX2-NEXT: andq (%rdi), %rax +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: orq %r15, %rax +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX2-NEXT: orq %r13, %rax +; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX2-NEXT: shldq %cl, %rsi, %rdx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: andq 264(%rdi), %rcx +; AVX2-NEXT: andq 8(%rdi), %rdx +; AVX2-NEXT: orq %r9, %rax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: orq %rbp, %rdx +; AVX2-NEXT: orq %r12, %rdx +; AVX2-NEXT: orq %r14, %rdx +; AVX2-NEXT: orq %r8, %rdx +; AVX2-NEXT: orq %rax, %rdx +; AVX2-NEXT: setne %al +; AVX2-NEXT: addq $1560, %rsp # imm = 0x618 ; 
AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %r12 ; AVX2-NEXT: popq %r13 @@ -1519,7 +6518,7 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: init_eq_i512: +; AVX512-LABEL: test_ne_i4096: ; AVX512: # %bb.0: ; AVX512-NEXT: pushq %rbp ; AVX512-NEXT: pushq %r15 @@ -1527,100 +6526,489 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; AVX512-NEXT: pushq %r13 ; AVX512-NEXT: pushq %r12 ; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $152, %rsp +; AVX512-NEXT: subq $1560, %rsp # imm = 0x618 +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: andl $4032, %eax # imm = 0xFC0 ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, 
{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: movl %esi, %r8d -; AVX512-NEXT: shrl $3, %r8d -; AVX512-NEXT: movl %r8d, %eax -; AVX512-NEXT: andl $56, %eax +; AVX512-NEXT: shrl $3, %eax ; AVX512-NEXT: negl %eax -; AVX512-NEXT: movslq %eax, %r9 -; AVX512-NEXT: movq 88(%rsp,%r9), %r10 -; AVX512-NEXT: movq 96(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %rsi -; AVX512-NEXT: shldq %cl, %r10, %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 104(%rsp,%r9), %rsi -; AVX512-NEXT: movq %rsi, %r11 -; AVX512-NEXT: shldq %cl, %rax, %r11 -; AVX512-NEXT: movq 112(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %rbx -; AVX512-NEXT: shldq %cl, %rsi, %rbx -; AVX512-NEXT: movq 120(%rsp,%r9), %rsi -; AVX512-NEXT: movq %rsi, %r14 -; AVX512-NEXT: shldq %cl, %rax, %r14 -; AVX512-NEXT: movq 128(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %r12 -; AVX512-NEXT: shldq %cl, %rsi, %r12 -; AVX512-NEXT: movq 136(%rsp,%r9), %r13 -; AVX512-NEXT: shldq %cl, %rax, %r13 -; AVX512-NEXT: movq 80(%rsp,%r9), %r15 -; AVX512-NEXT: shldq %cl, %r15, %r10 -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rsp,%r9), 
%rbp -; AVX512-NEXT: movq 8(%rsp,%r9), %rsi -; AVX512-NEXT: shldq %cl, %rbp, %rsi -; AVX512-NEXT: movq -8(%rsp,%r9), %rdx -; AVX512-NEXT: shldq %cl, %rdx, %rbp -; AVX512-NEXT: movq -16(%rsp,%r9), %rax +; AVX512-NEXT: movslq %eax, %rsi +; AVX512-NEXT: movq 1280(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1288(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1536(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1544(%rsp,%rsi), %rax +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1152(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1160(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1408(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1416(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1216(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, (%rsp) # 8-byte Spill +; AVX512-NEXT: movq 1224(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1472(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1480(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; 
AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1088(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1096(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1344(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1352(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1248(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1256(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1504(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1512(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1120(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1128(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1376(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1384(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; 
AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1184(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1192(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1440(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1448(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1056(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1064(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1312(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1320(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1264(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1272(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1520(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1528(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) 
# 8-byte Spill +; AVX512-NEXT: movq 1136(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1144(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1392(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1400(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1200(%rsp,%rsi), %r10 +; AVX512-NEXT: movq 1208(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %r10, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1456(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1464(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1072(%rsp,%rsi), %r14 +; AVX512-NEXT: movq 1080(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %r14, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1328(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1336(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1232(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1240(%rsp,%rsi), %rdx +; AVX512-NEXT: 
movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: andnq 56(%rdi), %r13, %r13 -; AVX512-NEXT: andnq 48(%rdi), %r12, %r12 -; AVX512-NEXT: orq %rsi, %r13 -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: andnq 40(%rdi), %r14, %r14 -; AVX512-NEXT: orq %rdx, %r14 -; AVX512-NEXT: movq -24(%rsp,%r9), %rsi -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: andnq 32(%rdi), %rbx, %rdx -; AVX512-NEXT: orq %rax, %rdx -; AVX512-NEXT: movq -32(%rsp,%r9), %rax -; AVX512-NEXT: shldq %cl, %rax, %rsi -; AVX512-NEXT: shlxq %rcx, %r15, %rbx -; AVX512-NEXT: andnq 24(%rdi), %r11, %r11 -; AVX512-NEXT: orq %rsi, %r11 -; AVX512-NEXT: movq -48(%rsp,%r9), %rsi -; AVX512-NEXT: movq -40(%rsp,%r9), %r9 -; AVX512-NEXT: shldq %cl, %r9, %rax +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1488(%rsp,%rsi), %r12 +; AVX512-NEXT: movq 1496(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %r12, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1104(%rsp,%rsi), %rax +; AVX512-NEXT: movq 1112(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rax, %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1360(%rsp,%rsi), %r11 +; AVX512-NEXT: movq 1368(%rsp,%rsi), %rbx +; AVX512-NEXT: movq %rbx, %rdx +; AVX512-NEXT: shldq %cl, %r11, %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1168(%rsp,%rsi), %r9 +; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1176(%rsp,%rsi), %r8 +; AVX512-NEXT: movq %r8, %rdx +; AVX512-NEXT: shldq %cl, %r9, %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1424(%rsp,%rsi), %r9 +; AVX512-NEXT: movq 1432(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, %r15 +; AVX512-NEXT: 
shldq %cl, %r9, %r15 +; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1296(%rsp,%rsi), %rbp +; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1304(%rsp,%rsi), %r15 +; AVX512-NEXT: movq %r15, %r13 +; AVX512-NEXT: shldq %cl, %rbp, %r13 +; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, (%rsp) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; 
AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq 1048(%rsp,%rsi), %rdx +; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, %rbx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, %r14 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, %r13 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, %r12 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, %rax ; AVX512-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: andnq 16(%rdi), %r15, %r15 +; AVX512-NEXT: shldq %cl, %r15, %r11 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX512-NEXT: shldq %cl, %rbp, %r15 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX512-NEXT: shldq %cl, %rbp, %r9 +; AVX512-NEXT: andq 384(%rdi), %r9 +; AVX512-NEXT: andq 128(%rdi), %r15 +; AVX512-NEXT: orq %r9, %r15 +; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq 320(%rdi), %r11 +; AVX512-NEXT: andq 64(%rdi), %rax +; AVX512-NEXT: orq %r11, %rax +; AVX512-NEXT: andq 448(%rdi), %r12 +; AVX512-NEXT: andq 192(%rdi), %r13 +; AVX512-NEXT: orq %r12, %r13 +; AVX512-NEXT: orq %rax, %r13 +; AVX512-NEXT: andq 288(%rdi), %r8 +; AVX512-NEXT: andq 32(%rdi), %r14 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 416(%rdi), %rax +; AVX512-NEXT: orq %r8, %r14 +; AVX512-NEXT: andq 160(%rdi), %r10 +; AVX512-NEXT: orq %rax, %r10 +; AVX512-NEXT: andq 352(%rdi), %rbx +; AVX512-NEXT: orq %r14, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 96(%rdi), %rax +; AVX512-NEXT: orq %rbx, %rax +; AVX512-NEXT: movq %rax, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 480(%rdi), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: andq 224(%rdi), %r15 ; AVX512-NEXT: orq %rax, %r15 -; AVX512-NEXT: shlxq %rcx, %rsi, %rax +; AVX512-NEXT: orq %r8, %r15 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 272(%rdi), %r8 +; AVX512-NEXT: orq %r10, %r15 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 16(%rdi), %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: movq %rax, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte 
Reload +; AVX512-NEXT: andq 400(%rdi), %r9 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 144(%rdi), %rax +; AVX512-NEXT: orq %r9, %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: movq %rax, %r9 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: andq 336(%rdi), %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 80(%rdi), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 464(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX512-NEXT: andq 208(%rdi), %r11 +; AVX512-NEXT: orq %r10, %rax +; AVX512-NEXT: orq %r8, %r11 +; AVX512-NEXT: orq %rax, %r11 +; AVX512-NEXT: orq %r9, %r11 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: andq 304(%rdi), %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 48(%rdi), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX512-NEXT: andq 432(%rdi), %r9 +; AVX512-NEXT: movq (%rsp), %r8 # 8-byte Reload +; AVX512-NEXT: andq 176(%rdi), %r8 +; AVX512-NEXT: orq %r10, %rax +; AVX512-NEXT: movq %rax, %r10 +; AVX512-NEXT: orq %r9, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX512-NEXT: andq 368(%rdi), %r9 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 112(%rdi), %rax +; AVX512-NEXT: orq %r10, %r8 +; AVX512-NEXT: movq %r8, %r10 +; AVX512-NEXT: orq %r9, %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 496(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX512-NEXT: andq 240(%rdi), %r9 +; AVX512-NEXT: orq %r8, %r9 +; AVX512-NEXT: orq %rax, %r9 +; AVX512-NEXT: orq %r10, %r9 +; AVX512-NEXT: orq %r11, %r9 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: andq 392(%rdi), %r10 +; 
AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX512-NEXT: andq 136(%rdi), %rbp +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 328(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 72(%rdi), %rax +; AVX512-NEXT: orq %r10, %rbp +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: movq %rax, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 456(%rdi), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; AVX512-NEXT: andq 200(%rdi), %r12 +; AVX512-NEXT: orq %rax, %r12 +; AVX512-NEXT: orq %r8, %r12 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 296(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 40(%rdi), %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: movq %rax, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 424(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 168(%rdi), %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: orq %r10, %rax +; AVX512-NEXT: movq %rax, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 360(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 104(%rdi), %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: movq %rax, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 488(%rdi), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX512-NEXT: andq 232(%rdi), %r14 +; AVX512-NEXT: orq %rax, %r14 +; AVX512-NEXT: orq %r8, %r14 +; AVX512-NEXT: orq %r10, %r14 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 280(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: 
andq 24(%rdi), %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: movq %rax, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 408(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 152(%rdi), %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: orq %r10, %rax +; AVX512-NEXT: movq %rax, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX512-NEXT: andq 344(%rdi), %r11 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 88(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 472(%rdi), %rax +; AVX512-NEXT: orq %r11, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX512-NEXT: andq 216(%rdi), %rbx +; AVX512-NEXT: orq %rax, %rbx +; AVX512-NEXT: orq %r8, %rbx +; AVX512-NEXT: orq %r10, %rbx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: andq 312(%rdi), %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 56(%rdi), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 440(%rdi), %r8 +; AVX512-NEXT: orq %r10, %rax +; AVX512-NEXT: movq %rax, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 184(%rdi), %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 376(%rdi), %r8 +; AVX512-NEXT: orq %r10, %rax +; AVX512-NEXT: movq %rax, %r11 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 120(%rdi), %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: movq %rax, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 504(%rdi), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 248(%rdi), %r8 +; AVX512-NEXT: orq %rax, 
%r8 +; AVX512-NEXT: orq %r10, %r8 +; AVX512-NEXT: orq %r11, %r8 +; AVX512-NEXT: movq 1040(%rsp,%rsi), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %rsi, %r10 +; AVX512-NEXT: orq %rbx, %r8 +; AVX512-NEXT: shlxq %rcx, %rax, %rsi +; AVX512-NEXT: andq 256(%rdi), %r10 +; AVX512-NEXT: andq (%rdi), %rsi +; AVX512-NEXT: orq %r10, %rsi +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; AVX512-NEXT: orq %r13, %rsi +; AVX512-NEXT: orq %r15, %rsi ; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rsi, %r9 -; AVX512-NEXT: andnq 8(%rdi), %r10, %rcx -; AVX512-NEXT: orq %r9, %rcx -; AVX512-NEXT: andnq (%rdi), %rbx, %rsi -; AVX512-NEXT: orq %rax, %rsi -; AVX512-NEXT: andl $60, %r8d -; AVX512-NEXT: movl (%rdi,%r8), %eax -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload -; AVX512-NEXT: btl %r8d, %eax -; AVX512-NEXT: movq %r13, 56(%rdi) -; AVX512-NEXT: movq %r12, 48(%rdi) -; AVX512-NEXT: movq %r14, 40(%rdi) -; AVX512-NEXT: movq %rdx, 32(%rdi) -; AVX512-NEXT: movq %r11, 24(%rdi) -; AVX512-NEXT: movq %r15, 16(%rdi) -; AVX512-NEXT: movq %rcx, 8(%rdi) -; AVX512-NEXT: movq %rsi, (%rdi) -; AVX512-NEXT: setae %al -; AVX512-NEXT: addq $152, %rsp +; AVX512-NEXT: shldq %cl, %rax, %rdx +; AVX512-NEXT: orq %r9, %rsi +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 264(%rdi), %rax +; AVX512-NEXT: andq 8(%rdi), %rdx +; AVX512-NEXT: orq %rax, %rdx +; AVX512-NEXT: orq %rbp, %rdx +; AVX512-NEXT: orq %r12, %rdx +; AVX512-NEXT: orq %r14, %rdx +; AVX512-NEXT: orq %r8, %rdx +; AVX512-NEXT: orq %rsi, %rdx +; AVX512-NEXT: setne %al +; AVX512-NEXT: addq $1560, %rsp # imm = 0x618 ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r12 ; AVX512-NEXT: popq %r13 @@ -1629,45 +7017,6 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; AVX512-NEXT: popq 
%rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %mask = xor i512 %bit, -1 - %val0 = zext i1 %value to i512 - %val = shl nuw i512 %val0, %ofs - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res0 = and i512 %ld, %mask - %res = or i512 %res0, %val - %cmp = icmp eq i512 %test, 0 - store i512 %res, ptr %word - ret i1 %cmp -} - -; i4096 - -define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { -; X86-LABEL: test_ne_i4096: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: andl $4064, %edx # imm = 0xFE0 -; X86-NEXT: shrl $3, %edx -; X86-NEXT: movl (%eax,%edx), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: setb %al -; X86-NEXT: retl -; -; X64-LABEL: test_ne_i4096: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: andl $4064, %eax # imm = 0xFE0 -; X64-NEXT: shrl $3, %eax -; X64-NEXT: movl (%rdi,%rax), %eax -; X64-NEXT: btl %esi, %eax -; X64-NEXT: setb %al -; X64-NEXT: retq %rem = and i32 %position, 4095 %ofs = zext nneg i32 %rem to i4096 %bit = shl nuw i4096 1, %ofs @@ -1812,8 +7161,8 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $64, %esp -; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: subl $80, %esp +; X86-NEXT: movzbl 12(%ebp), %ecx ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1826,41 +7175,52 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $12, %al ; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %edi -; X86-NEXT: movl 36(%esp,%edi), %edx -; X86-NEXT: movl 40(%esp,%edi), %ebx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl 32(%esp,%edi), %eax -; 
X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esp,%edi), %edi -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: notl %ebx -; X86-NEXT: movl 16(%ebp), %eax -; X86-NEXT: movl (%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: andl $96, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl (%ecx,%eax), %eax -; X86-NEXT: andl %ebx, (%ecx) -; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 56(%esp,%eax), %esi +; X86-NEXT: movl 60(%esp,%eax), %edx +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esp,%eax), %edi +; X86-NEXT: movl 52(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl 8(%ebx), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %ecx +; X86-NEXT: movl (%ebx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edi, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: movl 12(%ebx), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl 4(%ebx), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: notl %ecx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: notl %ebx 
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: notl %edx -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: andl %edx, 4(%ebx) -; X86-NEXT: notl %esi -; X86-NEXT: andl %esi, 8(%ebx) +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: notl %edi -; X86-NEXT: andl %edi, 12(%ebx) -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: jae .LBB22_2 +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: movl %ebx, 8(%esi) +; X86-NEXT: movl %ecx, 12(%esi) +; X86-NEXT: movl %edi, (%esi) +; X86-NEXT: movl %edx, 4(%esi) +; X86-NEXT: je .LBB22_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: .LBB22_2: @@ -1882,75 +7242,52 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; SSE-NEXT: testb $64, %cl ; SSE-NEXT: cmovneq %rsi, %r8 ; SSE-NEXT: cmovneq %rax, %rsi +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: movq 8(%rdi), %r9 +; SSE-NEXT: movq %r9, %r10 +; SSE-NEXT: andq %r8, %r10 ; SSE-NEXT: notq %r8 +; SSE-NEXT: movq %rcx, %r11 +; SSE-NEXT: andq %rsi, %r11 ; SSE-NEXT: notq %rsi -; SSE-NEXT: movl %ecx, %r9d -; SSE-NEXT: andl $96, %r9d -; SSE-NEXT: shrl $3, %r9d -; SSE-NEXT: movl (%rdi,%r9), %r9d -; SSE-NEXT: btl %ecx, %r9d -; SSE-NEXT: jb .LBB22_2 +; SSE-NEXT: andq %r9, %r8 +; SSE-NEXT: andq %rcx, %rsi +; SSE-NEXT: orq %r10, %r11 +; SSE-NEXT: jne .LBB22_2 ; SSE-NEXT: # %bb.1: ; SSE-NEXT: movl (%rdx), %eax ; SSE-NEXT: .LBB22_2: -; SSE-NEXT: andq %r8, 8(%rdi) -; SSE-NEXT: andq %rsi, (%rdi) +; SSE-NEXT: movq %rsi, (%rdi) +; SSE-NEXT: movq %r8, 8(%rdi) ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: reset_multiload_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: movl $1, %r8d -; AVX2-NEXT: 
xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %r8, %rsi -; AVX2-NEXT: shlxq %rcx, %r8, %r8 -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %r8, %rsi -; AVX2-NEXT: cmovneq %rax, %r8 -; AVX2-NEXT: notq %rsi -; AVX2-NEXT: notq %r8 -; AVX2-NEXT: movl %ecx, %r9d -; AVX2-NEXT: andl $96, %r9d -; AVX2-NEXT: shrl $3, %r9d -; AVX2-NEXT: movl (%rdi,%r9), %r9d -; AVX2-NEXT: btl %ecx, %r9d -; AVX2-NEXT: jb .LBB22_2 -; AVX2-NEXT: # %bb.1: -; AVX2-NEXT: movl (%rdx), %eax -; AVX2-NEXT: .LBB22_2: -; AVX2-NEXT: andq %rsi, 8(%rdi) -; AVX2-NEXT: andq %r8, (%rdi) -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax -; AVX2-NEXT: retq -; -; AVX512-LABEL: reset_multiload_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %r8d -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %r8, %rsi -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: shlxq %rcx, %r8, %r8 -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %r8, %rsi -; AVX512-NEXT: cmovneq %rax, %r8 -; AVX512-NEXT: notq %rsi -; AVX512-NEXT: notq %r8 -; AVX512-NEXT: movl %ecx, %r9d -; AVX512-NEXT: andl $96, %r9d -; AVX512-NEXT: shrl $3, %r9d -; AVX512-NEXT: movl (%rdi,%r9), %r9d -; AVX512-NEXT: btl %ecx, %r9d -; AVX512-NEXT: jb .LBB22_2 -; AVX512-NEXT: # %bb.1: -; AVX512-NEXT: movl (%rdx), %eax -; AVX512-NEXT: .LBB22_2: -; AVX512-NEXT: andq %rsi, 8(%rdi) -; AVX512-NEXT: andq %r8, (%rdi) -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: retq +; AVX-LABEL: reset_multiload_i128: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: movl $1, %esi +; AVX-NEXT: xorl %r8d, %r8d +; AVX-NEXT: shldq %cl, %rsi, %r8 +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: shlxq %rcx, %rsi, %r9 +; AVX-NEXT: testb $64, %cl +; AVX-NEXT: cmovneq %r9, %r8 +; AVX-NEXT: cmovneq %rax, %r9 +; AVX-NEXT: movq (%rdi), %r10 +; AVX-NEXT: movq 8(%rdi), %r11 +; AVX-NEXT: andnq %r11, %r8, %rcx +; AVX-NEXT: andq %r8, %r11 +; AVX-NEXT: andnq %r10, %r9, %rsi +; AVX-NEXT: andq %r9, %r10 +; AVX-NEXT: orq %r11, %r10 +; 
AVX-NEXT: jne .LBB22_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: movl (%rdx), %eax +; AVX-NEXT: .LBB22_2: +; AVX-NEXT: movq %rsi, (%rdi) +; AVX-NEXT: movq %rcx, 8(%rdi) +; AVX-NEXT: # kill: def $eax killed $eax killed $rax +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs From b0808cf40ddc6fd6cc615ae65862a0c049e657cc Mon Sep 17 00:00:00 2001 From: Hui Date: Sat, 1 Nov 2025 08:51:35 +0000 Subject: [PATCH 428/539] [libc++] constexpr flat_multiset (#161016) Fixes https://github.com/llvm/llvm-project/issues/128676 --- libcxx/docs/FeatureTestMacroTable.rst | 4 + libcxx/include/__flat_set/flat_multiset.h | 272 +++++++++++------- libcxx/include/version | 4 + .../flat.multiset/insert.temporary.pass.cpp | 5 +- .../flat.multiset/insert_range.pass.cpp | 19 +- .../flat.multiset.capacity/empty.pass.cpp | 14 +- .../flat.multiset.capacity/max_size.pass.cpp | 7 +- .../flat.multiset.capacity/size.pass.cpp | 18 +- .../flat.multiset.cons/alloc.pass.cpp | 37 ++- .../assign_initializer_list.pass.cpp | 14 +- .../flat.multiset.cons/compare.pass.cpp | 75 +++-- .../flat.multiset.cons/containers.pass.cpp | 71 +++-- .../flat.multiset.cons/copy.pass.cpp | 22 +- .../flat.multiset.cons/copy_alloc.pass.cpp | 24 +- .../flat.multiset.cons/copy_assign.pass.cpp | 24 +- .../flat.multiset.cons/default.pass.cpp | 43 ++- .../flat.multiset.cons/dtor_noexcept.pass.cpp | 31 +- .../initializer_list.pass.cpp | 50 ++-- .../flat.multiset.cons/iter_iter.pass.cpp | 36 ++- .../flat.multiset.cons/move.pass.cpp | 24 +- .../flat.multiset.cons/move_alloc.pass.cpp | 22 +- .../flat.multiset.cons/move_assign.pass.cpp | 48 ++-- .../flat.multiset.cons/range.pass.cpp | 38 ++- .../sorted_container.pass.cpp | 54 ++-- .../sorted_initializer_list.pass.cpp | 43 ++- .../sorted_iter_iter.pass.cpp | 41 ++- .../flat.multiset.erasure/erase_if.pass.cpp | 26 +- .../flat.multiset.iterators/iterator.pass.cpp | 14 +- .../iterator_comparison.pass.cpp | 14 +- 
.../reverse_iterator.pass.cpp | 92 +++--- .../flat.multiset.modifiers/clear.pass.cpp | 14 +- .../flat.multiset.modifiers/emplace.pass.cpp | 21 +- .../emplace_hint.pass.cpp | 19 +- .../erase_iter.pass.cpp | 14 +- .../erase_iter_iter.pass.cpp | 14 +- .../erase_key.pass.cpp | 14 +- .../erase_key_transparent.pass.cpp | 29 +- .../flat.multiset.modifiers/extract.pass.cpp | 14 +- .../insert_cv.pass.cpp | 14 +- .../insert_initializer_list.pass.cpp | 14 +- .../insert_iter_cv.pass.cpp | 14 +- .../insert_iter_iter.pass.cpp | 14 +- .../insert_iter_rv.pass.cpp | 18 +- .../insert_range.pass.cpp | 14 +- .../insert_rv.pass.cpp | 18 +- .../insert_sorted_initializer_list.pass.cpp | 14 +- .../insert_sorted_iter_iter.pass.cpp | 14 +- .../flat.multiset.modifiers/replace.pass.cpp | 14 +- .../swap_free.pass.cpp | 14 +- .../swap_member.pass.cpp | 14 +- .../flat.multiset.observers/comp.pass.cpp | 9 +- .../contains.pass.cpp | 14 +- .../contains_transparent.pass.cpp | 14 +- .../flat.multiset.operations/count.pass.cpp | 14 +- .../count_transparent.pass.cpp | 14 +- .../equal_range.pass.cpp | 14 +- .../equal_range_transparent.pass.cpp | 14 +- .../flat.multiset.operations/find.pass.cpp | 14 +- .../find_transparent.pass.cpp | 14 +- .../lower_bound.pass.cpp | 14 +- .../lower_bound_transparent.pass.cpp | 14 +- .../upper_bound.pass.cpp | 14 +- .../upper_bound_transparent.pass.cpp | 14 +- .../flat.multiset/helpers.h | 2 +- .../flat.multiset/op_compare.pass.cpp | 16 +- .../flat_map.version.compile.pass.cpp | 27 ++ .../flat_set.version.compile.pass.cpp | 27 ++ .../version.version.compile.pass.cpp | 54 ++++ .../generate_feature_test_macro_components.py | 10 + 69 files changed, 1317 insertions(+), 497 deletions(-) diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index 8fba6db871f08..dd9bf8ad353c3 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -426,6 +426,10 @@ Status 
---------------------------------------------------------- ----------------- ``__cpp_lib_constexpr_algorithms`` ``202306L`` ---------------------------------------------------------- ----------------- + ``__cpp_lib_constexpr_flat_map`` ``202502L`` + ---------------------------------------------------------- ----------------- + ``__cpp_lib_constexpr_flat_set`` ``202502L`` + ---------------------------------------------------------- ----------------- ``__cpp_lib_constexpr_forward_list`` ``202502L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_constexpr_list`` ``202502L`` diff --git a/libcxx/include/__flat_set/flat_multiset.h b/libcxx/include/__flat_set/flat_multiset.h index 7be0b2d20c54d..0f6bae584ca90 100644 --- a/libcxx/include/__flat_set/flat_multiset.h +++ b/libcxx/include/__flat_set/flat_multiset.h @@ -95,16 +95,16 @@ class flat_multiset { public: // [flat.multiset.cons], constructors - _LIBCPP_HIDE_FROM_ABI flat_multiset() noexcept(is_nothrow_default_constructible_v<_KeyContainer> && - is_nothrow_default_constructible_v<_Compare>) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset() noexcept( + is_nothrow_default_constructible_v<_KeyContainer> && is_nothrow_default_constructible_v<_Compare>) : __keys_(), __compare_() {} - _LIBCPP_HIDE_FROM_ABI flat_multiset(const flat_multiset&) = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(const flat_multiset&) = default; // The copy/move constructors are not specified in the spec, which means they should be defaulted. // However, the move constructor can potentially leave a moved-from object in an inconsistent // state if an exception is thrown. 
- _LIBCPP_HIDE_FROM_ABI flat_multiset(flat_multiset&& __other) noexcept( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(flat_multiset&& __other) noexcept( is_nothrow_move_constructible_v<_KeyContainer> && is_nothrow_move_constructible_v<_Compare>) # if _LIBCPP_HAS_EXCEPTIONS try @@ -121,14 +121,16 @@ class flat_multiset { # endif // _LIBCPP_HAS_EXCEPTIONS } - _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(const key_compare& __comp) : __keys_(), __compare_(__comp) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multiset(const key_compare& __comp) + : __keys_(), __compare_(__comp) {} - _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(container_type __keys, const key_compare& __comp = key_compare()) + _LIBCPP_HIDE_FROM_ABI + _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multiset(container_type __keys, const key_compare& __comp = key_compare()) : __keys_(std::move(__keys)), __compare_(__comp) { ranges::sort(__keys_, __compare_); } - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(sorted_equivalent_t, container_type __keys, const key_compare& __comp = key_compare()) : __keys_(std::move(__keys)), __compare_(__comp) { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); @@ -136,7 +138,7 @@ class flat_multiset { template requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(_InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare()) : __keys_(), __compare_(__comp) { insert(__first, __last); @@ -144,48 +146,53 @@ class flat_multiset { template requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset( sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const key_compare& __comp = 
key_compare()) : __keys_(__first, __last), __compare_(__comp) { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); } template <_ContainerCompatibleRange _Range> - _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t __fr, _Range&& __rg) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(from_range_t __fr, _Range&& __rg) : flat_multiset(__fr, std::forward<_Range>(__rg), key_compare()) {} template <_ContainerCompatibleRange _Range> - _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp) : flat_multiset(__comp) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp) + : flat_multiset(__comp) { insert_range(std::forward<_Range>(__rg)); } - _LIBCPP_HIDE_FROM_ABI flat_multiset(initializer_list __il, const key_compare& __comp = key_compare()) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(initializer_list __il, const key_compare& __comp = key_compare()) : flat_multiset(__il.begin(), __il.end(), __comp) {} - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(sorted_equivalent_t, initializer_list __il, const key_compare& __comp = key_compare()) : flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __comp) {} template requires uses_allocator::value - _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multiset(const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator(__alloc)), __compare_() {} template requires uses_allocator::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(const key_compare& __comp, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(const key_compare& __comp, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator(__alloc)), __compare_(__comp) {} template requires 
uses_allocator::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(const container_type& __keys, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(const container_type& __keys, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator(__alloc, __keys)), __compare_() { ranges::sort(__keys_, __compare_); } template requires uses_allocator::value - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(const container_type& __keys, const key_compare& __comp, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator(__alloc, __keys)), __compare_(__comp) { ranges::sort(__keys_, __compare_); @@ -193,14 +200,15 @@ class flat_multiset { template requires uses_allocator::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(sorted_equivalent_t, const container_type& __keys, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(sorted_equivalent_t, const container_type& __keys, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator(__alloc, __keys)), __compare_() { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); } template requires uses_allocator::value - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(sorted_equivalent_t, const container_type& __keys, const key_compare& __comp, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator(__alloc, __keys)), __compare_(__comp) { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); @@ -208,13 +216,14 @@ class flat_multiset { template requires uses_allocator::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(const flat_multiset& __other, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(const flat_multiset& __other, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator(__alloc, 
__other.__keys_)), __compare_(__other.__compare_) {} template requires uses_allocator::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(flat_multiset&& __other, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(flat_multiset&& __other, const _Allocator& __alloc) # if _LIBCPP_HAS_EXCEPTIONS try # endif // _LIBCPP_HAS_EXCEPTIONS @@ -230,14 +239,15 @@ class flat_multiset { template requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator::value) - _LIBCPP_HIDE_FROM_ABI flat_multiset(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator(__alloc)), __compare_() { insert(__first, __last); } template requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator::value) - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(_InputIterator __first, _InputIterator __last, const key_compare& __comp, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator(__alloc)), __compare_(__comp) { insert(__first, __last); @@ -245,7 +255,7 @@ class flat_multiset { template requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator::value) - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator(__alloc, __first, __last)), __compare_() { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); @@ -253,53 +263,57 @@ class flat_multiset { template requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator::value) - _LIBCPP_HIDE_FROM_ABI - flat_multiset(sorted_equivalent_t, - _InputIterator __first, - 
_InputIterator __last, - const key_compare& __comp, - const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset( + sorted_equivalent_t, + _InputIterator __first, + _InputIterator __last, + const key_compare& __comp, + const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator(__alloc, __first, __last)), __compare_(__comp) { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); } template <_ContainerCompatibleRange _Range, class _Allocator> requires uses_allocator::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(from_range_t, _Range&& __rg, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator(__alloc)), __compare_() { insert_range(std::forward<_Range>(__rg)); } template <_ContainerCompatibleRange _Range, class _Allocator> requires uses_allocator::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator(__alloc)), __compare_(__comp) { insert_range(std::forward<_Range>(__rg)); } template requires uses_allocator::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(initializer_list __il, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(initializer_list __il, const _Allocator& __alloc) : flat_multiset(__il.begin(), __il.end(), __alloc) {} template requires uses_allocator::value - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(initializer_list __il, const key_compare& __comp, const _Allocator& __alloc) : flat_multiset(__il.begin(), __il.end(), __comp, __alloc) {} template requires uses_allocator::value 
- _LIBCPP_HIDE_FROM_ABI flat_multiset(sorted_equivalent_t, initializer_list __il, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(sorted_equivalent_t, initializer_list __il, const _Allocator& __alloc) : flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __alloc) {} template requires uses_allocator::value - _LIBCPP_HIDE_FROM_ABI flat_multiset( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset( sorted_equivalent_t, initializer_list __il, const key_compare& __comp, const _Allocator& __alloc) : flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __comp, __alloc) {} - _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(initializer_list __il) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset& operator=(initializer_list __il) { clear(); insert(__il); return *this; @@ -308,9 +322,9 @@ class flat_multiset { // copy/move assignment are not specified in the spec (defaulted) // but move assignment can potentially leave moved from object in an inconsistent // state if an exception is thrown - _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(const flat_multiset&) = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset& operator=(const flat_multiset&) = default; - _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(flat_multiset&& __other) noexcept( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset& operator=(flat_multiset&& __other) noexcept( is_nothrow_move_assignable_v<_KeyContainer> && is_nothrow_move_assignable_v<_Compare>) { auto __clear_other_guard = std::__make_scope_guard([&]() noexcept { __other.clear() /* noexcept */; }); auto __clear_self_guard = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); @@ -321,30 +335,52 @@ class flat_multiset { } // iterators - _LIBCPP_HIDE_FROM_ABI iterator begin() noexcept { return iterator(std::as_const(__keys_).begin()); } - _LIBCPP_HIDE_FROM_ABI const_iterator begin() const noexcept { 
return const_iterator(__keys_.begin()); } - _LIBCPP_HIDE_FROM_ABI iterator end() noexcept { return iterator(std::as_const(__keys_).end()); } - _LIBCPP_HIDE_FROM_ABI const_iterator end() const noexcept { return const_iterator(__keys_.end()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator begin() noexcept { + return iterator(std::as_const(__keys_).begin()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator begin() const noexcept { + return const_iterator(__keys_.begin()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator end() noexcept { + return iterator(std::as_const(__keys_).end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator end() const noexcept { + return const_iterator(__keys_.end()); + } - _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() noexcept { return reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rbegin() noexcept { + return reverse_iterator(end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rbegin() const noexcept { + return const_reverse_iterator(end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rend() noexcept { + return reverse_iterator(begin()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rend() const noexcept { + return const_reverse_iterator(begin()); + } - _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const noexcept { return begin(); } - _LIBCPP_HIDE_FROM_ABI const_iterator cend() const noexcept { return end(); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const noexcept { return 
const_reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const noexcept { return const_reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cbegin() const noexcept { return begin(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cend() const noexcept { return end(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crbegin() const noexcept { + return const_reverse_iterator(end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crend() const noexcept { + return const_reverse_iterator(begin()); + } // capacity - [[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool empty() const noexcept { return __keys_.empty(); } - _LIBCPP_HIDE_FROM_ABI size_type size() const noexcept { return __keys_.size(); } - _LIBCPP_HIDE_FROM_ABI size_type max_size() const noexcept { return __keys_.max_size(); } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool empty() const noexcept { + return __keys_.empty(); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type size() const noexcept { return __keys_.size(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type max_size() const noexcept { return __keys_.max_size(); } // [flat.multiset.modifiers], modifiers template requires is_constructible_v - _LIBCPP_HIDE_FROM_ABI iterator emplace(_Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace(_Args&&... __args) { if constexpr (sizeof...(__args) == 1 && (is_same_v, _Key> && ...)) { return __emplace(std::forward<_Args>(__args)...); } else { @@ -354,7 +390,7 @@ class flat_multiset { template requires is_constructible_v - _LIBCPP_HIDE_FROM_ABI iterator emplace_hint(const_iterator __hint, _Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace_hint(const_iterator __hint, _Args&&... 
__args) { if constexpr (sizeof...(__args) == 1 && (is_same_v, _Key> && ...)) { return __emplace_hint(std::move(__hint), std::forward<_Args>(__args)...); } else { @@ -362,21 +398,23 @@ class flat_multiset { } } - _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return emplace(__x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const value_type& __x) { return emplace(__x); } - _LIBCPP_HIDE_FROM_ABI iterator insert(value_type&& __x) { return emplace(std::move(__x)); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(value_type&& __x) { + return emplace(std::move(__x)); + } - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, const value_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, const value_type& __x) { return emplace_hint(__hint, __x); } - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, value_type&& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, value_type&& __x) { return emplace_hint(__hint, std::move(__x)); } template requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(_InputIterator __first, _InputIterator __last) { if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) { __reserve(__last - __first); } @@ -385,7 +423,8 @@ class flat_multiset { template requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) { if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) { __reserve(__last - __first); } @@ -394,7 +433,7 @@ class flat_multiset { } 
template <_ContainerCompatibleRange _Range> - _LIBCPP_HIDE_FROM_ABI void insert_range(_Range&& __range) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert_range(_Range&& __range) { if constexpr (ranges::sized_range<_Range>) { __reserve(ranges::size(__range)); } @@ -402,26 +441,29 @@ class flat_multiset { __append_sort_merge(std::forward<_Range>(__range)); } - _LIBCPP_HIDE_FROM_ABI void insert(initializer_list __il) { insert(__il.begin(), __il.end()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(initializer_list __il) { + insert(__il.begin(), __il.end()); + } - _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, initializer_list __il) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + insert(sorted_equivalent_t, initializer_list __il) { insert(sorted_equivalent, __il.begin(), __il.end()); } - _LIBCPP_HIDE_FROM_ABI container_type extract() && { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 container_type extract() && { auto __guard = std::__make_scope_guard([&]() noexcept { clear() /* noexcept */; }); auto __ret = std::move(__keys_); return __ret; } - _LIBCPP_HIDE_FROM_ABI void replace(container_type&& __keys) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void replace(container_type&& __keys) { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys, __compare_), "Key container is not sorted"); auto __guard = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); __keys_ = std::move(__keys); __guard.__complete(); } - _LIBCPP_HIDE_FROM_ABI iterator erase(iterator __position) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(iterator __position) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); auto __key_iter = __keys_.erase(__position.__base()); __on_failure.__complete(); @@ -431,7 +473,7 @@ class flat_multiset { // The following overload is the same as the iterator overload // iterator erase(const_iterator 
__position); - _LIBCPP_HIDE_FROM_ABI size_type erase(const key_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(const key_type& __x) { auto [__first, __last] = equal_range(__x); auto __res = __last - __first; erase(__first, __last); @@ -441,21 +483,21 @@ class flat_multiset { template requires(__is_transparent_v<_Compare> && !is_convertible_v<_Kp &&, iterator> && !is_convertible_v<_Kp &&, const_iterator>) - _LIBCPP_HIDE_FROM_ABI size_type erase(_Kp&& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(_Kp&& __x) { auto [__first, __last] = equal_range(__x); auto __res = __last - __first; erase(__first, __last); return __res; } - _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __first, const_iterator __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(const_iterator __first, const_iterator __last) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); auto __key_it = __keys_.erase(__first.__base(), __last.__base()); __on_failure.__complete(); return iterator(std::move(__key_it)); } - _LIBCPP_HIDE_FROM_ABI void swap(flat_multiset& __y) noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void swap(flat_multiset& __y) noexcept { // warning: The spec has unconditional noexcept, which means that // if any of the following functions throw an exception, // std::terminate will be called @@ -464,126 +506,139 @@ class flat_multiset { ranges::swap(__keys_, __y.__keys_); } - _LIBCPP_HIDE_FROM_ABI void clear() noexcept { __keys_.clear(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void clear() noexcept { __keys_.clear(); } // observers - _LIBCPP_HIDE_FROM_ABI key_compare key_comp() const { return __compare_; } - _LIBCPP_HIDE_FROM_ABI value_compare value_comp() const { return __compare_; } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 key_compare key_comp() const { return __compare_; } + _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX26 value_compare value_comp() const { return __compare_; } // map operations - _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __x) { return __find_impl(*this, __x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const key_type& __x) { + return __find_impl(*this, __x); + } - _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __x) const { return __find_impl(*this, __x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const key_type& __x) const { + return __find_impl(*this, __x); + } template requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI iterator find(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const _Kp& __x) { return __find_impl(*this, __x); } template requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI const_iterator find(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const _Kp& __x) const { return __find_impl(*this, __x); } - _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const key_type& __x) const { auto [__first, __last] = equal_range(__x); return __last - __first; } template requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI size_type count(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const _Kp& __x) const { auto [__first, __last] = equal_range(__x); return __last - __first; } - _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __x) const { return find(__x) != end(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const key_type& __x) const { + return find(__x) != end(); + } template requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI bool contains(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const _Kp& __x) const { return 
find(__x) != end(); } - _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const key_type& __x) { const auto& __keys = __keys_; return iterator(std::lower_bound(__keys.begin(), __keys.end(), __x, __compare_)); } - _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator lower_bound(const key_type& __x) const { return const_iterator(std::lower_bound(__keys_.begin(), __keys_.end(), __x, __compare_)); } template requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const _Kp& __x) { const auto& __keys = __keys_; return iterator(std::lower_bound(__keys.begin(), __keys.end(), __x, __compare_)); } template requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator lower_bound(const _Kp& __x) const { return const_iterator(std::lower_bound(__keys_.begin(), __keys_.end(), __x, __compare_)); } - _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const key_type& __x) { const auto& __keys = __keys_; return iterator(std::upper_bound(__keys.begin(), __keys.end(), __x, __compare_)); } - _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator upper_bound(const key_type& __x) const { return const_iterator(std::upper_bound(__keys_.begin(), __keys_.end(), __x, __compare_)); } template requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const _Kp& __x) { const auto& 
__keys = __keys_; return iterator(std::upper_bound(__keys.begin(), __keys.end(), __x, __compare_)); } template requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator upper_bound(const _Kp& __x) const { return const_iterator(std::upper_bound(__keys_.begin(), __keys_.end(), __x, __compare_)); } - _LIBCPP_HIDE_FROM_ABI pair equal_range(const key_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair equal_range(const key_type& __x) { return __equal_range_impl(*this, __x); } - _LIBCPP_HIDE_FROM_ABI pair equal_range(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair + equal_range(const key_type& __x) const { return __equal_range_impl(*this, __x); } template requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI pair equal_range(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair equal_range(const _Kp& __x) { return __equal_range_impl(*this, __x); } template requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI pair equal_range(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair + equal_range(const _Kp& __x) const { return __equal_range_impl(*this, __x); } - friend _LIBCPP_HIDE_FROM_ABI bool operator==(const flat_multiset& __x, const flat_multiset& __y) { + friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool + operator==(const flat_multiset& __x, const flat_multiset& __y) { return ranges::equal(__x, __y); } - friend _LIBCPP_HIDE_FROM_ABI auto operator<=>(const flat_multiset& __x, const flat_multiset& __y) { + friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 auto + operator<=>(const flat_multiset& __x, const flat_multiset& __y) { return std::lexicographical_compare_three_way( __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); } - friend _LIBCPP_HIDE_FROM_ABI void 
swap(flat_multiset& __x, flat_multiset& __y) noexcept { __x.swap(__y); } + friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + swap(flat_multiset& __x, flat_multiset& __y) noexcept { + __x.swap(__y); + } private: template - _LIBCPP_HIDE_FROM_ABI void __append_sort_merge(_Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void __append_sort_merge(_Args&&... __args) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); size_type __old_size = size(); __flat_set_utils::__append(*this, std::forward<_Args>(__args)...); @@ -598,13 +653,13 @@ class flat_multiset { } template - _LIBCPP_HIDE_FROM_ABI iterator __emplace(_Kp&& __key) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator __emplace(_Kp&& __key) { auto __it = upper_bound(__key); return __flat_set_utils::__emplace_exact_pos(*this, __it, std::forward<_Kp>(__key)); } template - _LIBCPP_HIDE_FROM_ABI iterator __emplace_hint(const_iterator __hint, _Kp&& __key) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator __emplace_hint(const_iterator __hint, _Kp&& __key) { auto __prev_larger = __hint != cbegin() && __compare_(__key, *std::prev(__hint)); auto __next_smaller = __hint != cend() && __compare_(*__hint, __key); @@ -636,7 +691,7 @@ class flat_multiset { } template - _LIBCPP_HIDE_FROM_ABI static auto __find_impl(_Self&& __self, const _Kp& __key) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __find_impl(_Self&& __self, const _Kp& __key) { auto __it = __self.lower_bound(__key); auto __last = __self.end(); if (__it == __last || __self.__compare_(__key, *__it)) { @@ -646,29 +701,30 @@ class flat_multiset { } template - _LIBCPP_HIDE_FROM_ABI static auto __equal_range_impl(_Self&& __self, const _Kp& __key) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __equal_range_impl(_Self&& __self, const _Kp& __key) { using __iter = _If>, const_iterator, iterator>; auto [__key_first, 
__key_last] = std::equal_range(__self.__keys_.begin(), __self.__keys_.end(), __key, __self.__compare_); return std::make_pair(__iter(__key_first), __iter(__key_last)); } - _LIBCPP_HIDE_FROM_ABI void __reserve(size_t __size) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void __reserve(size_t __size) { if constexpr (__container_traits<_KeyContainer>::__reservable) { __keys_.reserve(__size); } } template - friend typename flat_multiset<_Key2, _Compare2, _KeyContainer2>::size_type + friend typename flat_multiset<_Key2, _Compare2, _KeyContainer2>::size_type _LIBCPP_CONSTEXPR_SINCE_CXX26 erase_if(flat_multiset<_Key2, _Compare2, _KeyContainer2>&, _Predicate); _KeyContainer __keys_; _LIBCPP_NO_UNIQUE_ADDRESS key_compare __compare_; struct __key_equiv { - _LIBCPP_HIDE_FROM_ABI __key_equiv(key_compare __c) : __comp_(__c) {} - _LIBCPP_HIDE_FROM_ABI bool operator()(const_reference __x, const_reference __y) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __key_equiv(key_compare __c) : __comp_(__c) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool + operator()(const_reference __x, const_reference __y) const { return !__comp_(std::get<0>(__x), std::get<0>(__y)) && !__comp_(std::get<0>(__y), std::get<0>(__x)); } key_compare __comp_; @@ -757,7 +813,7 @@ struct uses_allocator, _Allocator> : bool_constant > {}; template -_LIBCPP_HIDE_FROM_ABI typename flat_multiset<_Key, _Compare, _KeyContainer>::size_type +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 typename flat_multiset<_Key, _Compare, _KeyContainer>::size_type erase_if(flat_multiset<_Key, _Compare, _KeyContainer>& __flat_multiset, _Predicate __pred) { auto __guard = std::__make_exception_guard([&] { __flat_multiset.clear(); }); auto __it = diff --git a/libcxx/include/version b/libcxx/include/version index 0fef1bb87cf60..b41cc9ed4ce06 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -71,6 +71,8 @@ __cpp_lib_constexpr_charconv 202207L __cpp_lib_constexpr_cmath 202202L 
__cpp_lib_constexpr_complex 201711L __cpp_lib_constexpr_dynamic_alloc 201907L +__cpp_lib_constexpr_flat_map 202502L +__cpp_lib_constexpr_flat_set 202502L __cpp_lib_constexpr_forward_list 202502L __cpp_lib_constexpr_functional 201907L __cpp_lib_constexpr_iterator 201811L @@ -552,6 +554,8 @@ __cpp_lib_void_t 201411L # define __cpp_lib_bitset 202306L # undef __cpp_lib_constexpr_algorithms # define __cpp_lib_constexpr_algorithms 202306L +# define __cpp_lib_constexpr_flat_map 202502L +# define __cpp_lib_constexpr_flat_set 202502L # define __cpp_lib_constexpr_forward_list 202502L # define __cpp_lib_constexpr_list 202502L # if !defined(_LIBCPP_ABI_VCRUNTIME) diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp index 248f282209fd7..acd20ce525a0d 100644 --- a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp +++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp @@ -21,7 +21,7 @@ #include "../flat_helpers.h" #include "test_macros.h" -bool test() { +constexpr bool test() { using M = std::flat_multiset; { M m; @@ -43,6 +43,9 @@ bool test() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp index 57a581c6c5cb9..c2fcd86fcf913 100644 --- a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp +++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp @@ -20,27 +20,36 @@ #include #include #include -#include #include #include "../flat_helpers.h" +#include "test_iterators.h" #include "test_macros.h" -void test() { +constexpr bool test() { NotQuiteSequenceContainer v; 
std::flat_multiset s(v); - std::istringstream ints("0 1 1 0"); - auto r = std::ranges::subrange(std::istream_iterator(ints), std::istream_iterator()) | - std::views::transform([](int i) { return i * i; }); + + int ar[] = {0, 1, 1, 0}; + using Iter = cpp20_input_iterator; + using Sent = sentinel_wrapper; + using R = std::ranges::subrange; + auto r = R(Iter(ar), Sent(Iter(ar + 4))); + static_assert( ![](auto& t) { return requires { t.insert_range(t.end(), r); }; }(v), "This test is to test the case where the underlying container does not provide insert_range"); s.insert_range(r); assert(std::ranges::equal(s, std::vector{0, 0, 1, 1})); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp index 52f77438df2ce..88a76d3c1c8b8 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp @@ -24,7 +24,7 @@ #include "min_allocator.h" template -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset, KeyContainer>; M m; @@ -38,15 +38,23 @@ void test_one() { assert(m.empty()); } -void test() { +constexpr bool test() { test_one>(); - test_one>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one>(); test_one>(); test_one>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp 
b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp index 4e3d1414b28af..fb9c38f592262 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp @@ -24,7 +24,7 @@ #include "test_allocator.h" #include "test_macros.h" -void test() { +constexpr bool test() { { using A1 = limited_allocator; using C = std::flat_multiset, std::vector>; @@ -59,10 +59,15 @@ void test() { assert(c.max_size() <= max_dist); assert(c.max_size() <= alloc_max_size(std::allocator())); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp index 4aff08b8127b6..156bb27fae992 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=200000000 +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=800000000 // @@ -23,7 +25,7 @@ #include "min_allocator.h" template -void test_one() { +constexpr void test_one() { using M = std::flat_multiset, KeyContainer>; using S = typename M::size_type; { @@ -46,7 +48,7 @@ void test_one() { } { M m; - S s = 500000; + S s = 5000; for (std::size_t i = 0u; i < s; ++i) { m.emplace(i); m.emplace(i); @@ -57,15 +59,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one>(); - 
test_one>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one>(); test_one>(); test_one>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp index 4fffcb304d20a..2426fbc0fc063 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp @@ -14,6 +14,7 @@ // explicit flat_multiset(const Allocator& a); #include +#include #include #include #include @@ -22,7 +23,8 @@ #include "test_allocator.h" #include "../../../test_compare.h" -void test() { +template