125 changes: 74 additions & 51 deletions clang/lib/Sema/SemaTemplate.cpp

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion clang/lib/Sema/SemaType.cpp
@@ -3792,7 +3792,7 @@ static QualType GetDeclSpecTypeForDeclarator(TypeProcessingState &state,
<< Kind << Error << (int)SemaRef.getTemplateNameKindForDiagnostics(TN)
<< QualType(Deduced, 0) << AutoRange;
if (auto *TD = TN.getAsTemplateDecl())
SemaRef.Diag(TD->getLocation(), diag::note_template_decl_here);
SemaRef.NoteTemplateLocation(*TD);

T = SemaRef.Context.IntTy;
D.setInvalidType(true);
12 changes: 4 additions & 8 deletions clang/test/CodeGenCXX/debug-info-class.cpp
@@ -117,18 +117,11 @@ int main(int argc, char **argv) {
// CHECK-NOT: identifier:
// CHECK-SAME: ){{$}}

// CHECK: !DIGlobalVariableExpression(var: ![[HDR_VAR:[0-9]+]], expr: !DIExpression(DW_OP_constu, 52, DW_OP_stack_value))
// CHECK: ![[HDR_VAR]] = distinct !DIGlobalVariable(name: "HdrSize",
// CHECK-SAME: isLocal: true, isDefinition: true, declaration: ![[HDR_VAR_DECL:[0-9]+]])
// CHECK: ![[INT:[0-9]+]] = !DIBasicType(name: "int"
// CHECK: ![[HDR_VAR_DECL]] = !DIDerivedType(tag: DW_TAG_member, name: "HdrSize"

// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "A"

// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "I"
// CHECK-NOT: DIFlagFwdDecl
// CHECK-SAME: ){{$}}

// CHECK: ![[INT:[0-9]+]] = !DIBasicType(name: "int"
// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "foo"
// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "bar"
// CHECK: !DICompositeType(tag: DW_TAG_union_type, name: "baz"
@@ -194,5 +187,8 @@ int main(int argc, char **argv) {
// CHECK: [[G_INNER_I]] = !DIDerivedType(tag: DW_TAG_member, name: "j"
// CHECK-SAME: baseType: ![[INT]]

// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "A"
// CHECK: !DIDerivedType(tag: DW_TAG_member, name: "HdrSize"

// CHECK: ![[EXCEPTLOC]] = !DILocation(line: 100,
// CHECK: ![[RETLOC]] = !DILocation(line: 99,
104 changes: 0 additions & 104 deletions clang/test/CodeGenCXX/debug-info-static-inline-member.cpp

This file was deleted.

20 changes: 4 additions & 16 deletions clang/test/CodeGenCXX/debug-info-static-member.cpp
@@ -1,8 +1,8 @@
// RUN: %clangxx -target x86_64-unknown-unknown -g -gdwarf-4 %s -emit-llvm -S -o - | FileCheck --check-prefixes=CHECK,DWARF4,CPP11,NOT-MS %s
// RUN: %clangxx -target x86_64-unknown-unknown -g -gdwarf-4 %s -emit-llvm -S -o - | FileCheck --check-prefixes=CHECK,DWARF4,NOT-MS %s
// RUN: %clangxx -target x86_64-unknown-unknown -g -gdwarf-4 -std=c++98 %s -emit-llvm -S -o - | FileCheck --check-prefixes=CHECK,DWARF4,NOT-MS %s
// RUN: %clangxx -target x86_64-unknown-unknown -g -gdwarf-4 -std=c++11 %s -emit-llvm -S -o - | FileCheck --check-prefixes=CHECK,DWARF4,CPP11,NOT-MS %s
// RUN: %clangxx -target x86_64-unknown-unknown -g -gdwarf-5 -std=c++11 %s -emit-llvm -S -o - | FileCheck --check-prefixes=CHECK,DWARF5,CPP11 %s
// RUN: %clangxx -target x86_64-windows-msvc -g -gdwarf-4 %s -emit-llvm -S -o - | FileCheck --check-prefixes=CHECK,DWARF4,CPP11 %s
// RUN: %clangxx -target x86_64-unknown-unknown -g -gdwarf-4 -std=c++11 %s -emit-llvm -S -o - | FileCheck --check-prefixes=CHECK,DWARF4,NOT-MS %s
// RUN: %clangxx -target x86_64-unknown-unknown -g -gdwarf-5 -std=c++11 %s -emit-llvm -S -o - | FileCheck --check-prefixes=CHECK,DWARF5 %s
// RUN: %clangxx -target x86_64-windows-msvc -g -gdwarf-4 %s -emit-llvm -S -o - | FileCheck --check-prefixes=CHECK,DWARF4 %s
// PR14471

// CHECK: @{{.*}}a{{.*}} = dso_local global i32 4, align 4, !dbg [[A:![0-9]+]]
@@ -166,15 +166,3 @@ struct y {
};
int y::z;
}

// CHECK: !DIGlobalVariableExpression(var: ![[CONST_A_VAR:[0-9]+]], expr: !DIExpression(DW_OP_constu, 1, DW_OP_stack_value))
// CHECK: ![[CONST_A_VAR]] = distinct !DIGlobalVariable(name: "const_a"
// CHECK-SAME: isLocal: true, isDefinition: true, declaration: ![[CONST_A_DECL]])

// CPP11: !DIGlobalVariableExpression(var: ![[CONST_B_VAR:[0-9]+]], expr: !DIExpression(DW_OP_constu, {{.*}}, DW_OP_stack_value))
// CPP11: ![[CONST_B_VAR]] = distinct !DIGlobalVariable(name: "const_b"
// CPP11-SAME: isLocal: true, isDefinition: true, declaration: ![[CONST_B_DECL]])

// CHECK: !DIGlobalVariableExpression(var: ![[CONST_C_VAR:[0-9]+]], expr: !DIExpression(DW_OP_constu, 18, DW_OP_stack_value))
// CHECK: ![[CONST_C_VAR]] = distinct !DIGlobalVariable(name: "const_c"
// CHECK-SAME: isLocal: true, isDefinition: true, declaration: ![[CONST_C_DECL]])
9 changes: 4 additions & 5 deletions clang/test/SemaHLSL/BuiltIns/vector-errors.hlsl
@@ -2,17 +2,16 @@

// Some bad declarations
hlsl::vector ShouldWorkSomeday; // expected-error{{use of alias template 'hlsl::vector' requires template arguments}}
// expected-note@*:* {{template declaration from hidden source: template <class element = float, int element_count = 4> using vector = element __attribute__((ext_vector_type(element_count)))}}

hlsl::vector<1> BadVec; // expected-error{{template argument for template type parameter must be a type}}
// expected-note@*:* {{template is declared here}}
// expected-note@*:* {{template parameter is declared here}}

// expected-note@*:* {{template parameter from hidden source: class element = float}}

hlsl::vector<int, float> AnotherBadVec; // expected-error{{template argument for non-type template parameter must be an expression}}
// expected-note@*:* {{template parameter is declared here}}
// expected-note@*:* {{template parameter from hidden source: int element_count = 4}}

hlsl::vector<int, 2, 3> YABV; // expected-error{{too many template arguments for alias template 'vector'}}
// expected-note@*:* {{template is declared here}}
// expected-note@*:* {{template declaration from hidden source: template <class element = float, int element_count = 4> using vector = element __attribute__((ext_vector_type(element_count)))}}

// This code is rejected by clang because clang puts the HLSL built-in types
// into the HLSL namespace.
2 changes: 1 addition & 1 deletion flang/test/Lower/OpenACC/acc-set.f90
@@ -33,7 +33,7 @@ program test_acc_set
! CHECK: %[[C0:.*]] = arith.constant 0 : i32
! CHECK: acc.set device_num(%[[C0]] : i32)

! CHECK: acc.set attributes {device_type = #acc.device_type<*>}
! CHECK: acc.set attributes {device_type = #acc.device_type<star>}

! CHECK: acc.set attributes {device_type = #acc.device_type<multicore>}

2 changes: 1 addition & 1 deletion flang/test/Lower/OpenACC/acc-update.f90
@@ -114,7 +114,7 @@ subroutine acc_update

!$acc update host(a) device_type(*)
! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_update_host>, name = "a", structured = false}
! CHECK: acc.update dataOperands(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) attributes {device_types = [#acc.device_type<*>]}
! CHECK: acc.update dataOperands(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) attributes {device_types = [#acc.device_type<star>]}
! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) to varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10x10xf32>>) {name = "a", structured = false}

end subroutine acc_update
119 changes: 65 additions & 54 deletions libcxx/docs/Hardening.rst
@@ -15,61 +15,72 @@ assertions that prevent undefined behavior caused by violating preconditions of
the standard library. Different hardening modes make different trade-offs
between the amount of checking and runtime performance. The available hardening
modes are:
- fast mode;
- extensive mode;
- debug mode.

The fast mode contains a set of security-critical checks that can be done with
relatively little overhead in constant time and are intended to be used in
production. We recommend most projects to adopt the fast mode.

The extensive mode contains all the checks from the fast mode and additionally
some checks for undefined behavior that incur relatively little overhead but
aren't security-critical. While the performance penalty is somewhat more
significant compared to the fast mode, the extensive mode is still intended to
be usable in production.

The debug mode enables all the available checks in the library, including
internal assertions, some of which might be very expensive. This mode is
intended to be used for testing, not in production.

Vendors can set the default hardening mode by using the
``LIBCXX_HARDENING_MODE`` variable at CMake configuration time with the possible
values of ``none``, ``fast``, ``extensive`` and ``debug``. The default value is
``none`` which doesn't enable any hardening checks (this mode is sometimes
called the ``unchecked`` mode).

When hardening is enabled, the compiled library is built with the corresponding
mode enabled, **and** user code will be built with the same mode enabled by
default. If the mode is set to "none" at the CMake configuration time, the
compiled library will not contain any assertions and the default when building
user code will be to have assertions disabled. As a user, you can consult your
vendor to know which level of hardening is enabled by default.

Furthermore, independently of any vendor-selected default, users can always
control which level of hardening is enabled in their code by defining the macro
``_LIBCPP_HARDENING_MODE`` before including any libc++ headers (preferably by
passing ``-D_LIBCPP_HARDENING_MODE=X`` to the compiler). The macro can be
set to one of the following possible values:

- ``_LIBCPP_HARDENING_MODE_NONE``;
- ``_LIBCPP_HARDENING_MODE_FAST``;
- ``_LIBCPP_HARDENING_MODE_EXTENSIVE``;
- ``_LIBCPP_HARDENING_MODE_DEBUG``.

The exact numeric values of these macros are unspecified and users should not
rely on them (e.g. expect the values to be sorted in any way).

Note that if the compiled library was built by the vendor with the hardening
mode set to "none", functions compiled inside the static or shared library won't
have any hardening enabled even if the user compiles with hardening enabled (the
same is true for the inverse case where the static or shared library was
compiled **with** hardening enabled but the user tries to disable it). However,
most of the code in libc++ is in the headers, so the user-selected value for
``_LIBCPP_HARDENING_MODE``, if any, will usually be respected.

Enabling hardening has no impact on the ABI.

- **Unchecked mode/none**, which disables all hardening checks.
- **Fast mode**, which contains a set of security-critical checks that can be
done with relatively little overhead in constant time and are intended to be
used in production. We recommend most projects adopt this.
- **Extensive mode**, which contains all the checks from fast mode and some
additional checks for undefined behavior that incur relatively little overhead
but aren't security-critical. Production builds requiring a broader set of
checks than fast mode should consider enabling extensive mode. The additional
rigour impacts performance more than fast mode: we recommend benchmarking to
determine if that is acceptable for your program.
- **Debug mode**, which enables all the available checks in the library,
including internal assertions, some of which might be very expensive. This
mode is intended to be used for testing, not in production.

.. note::

Enabling hardening has no impact on the ABI.

Notes for users
---------------

As a libc++ user, consult with your vendor to determine the level of hardening
enabled by default.

Users who want a hardening level different from their vendor's default can
control it by passing **one** of the following options to the compiler (a short
sketch follows the list):

- ``-D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_NONE``
- ``-D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_FAST``
- ``-D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_EXTENSIVE``
- ``-D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_DEBUG``
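
For example, a minimal sketch of opting into the fast mode and tripping a
hardening check might look as follows (the file name, build command, and exact
failure behavior are illustrative assumptions, not taken from the libc++
documentation):

.. code-block:: cpp

   // bad_access.cpp -- built with something like:
   //   clang++ -stdlib=libc++ -D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_FAST bad_access.cpp
   #include <vector>

   int main() {
     std::vector<int> v{1, 2, 3};
     // Indexing past the end violates a library precondition. With a hardening
     // mode enabled, libc++ is expected to stop the program here rather than
     // read out of bounds; with the mode set to "none" this is undefined
     // behavior.
     return v[3];
   }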

.. warning::

The exact numeric values of these macros are unspecified, and users should not
rely on them (e.g., by expecting the values to be sorted in any particular way).

.. warning::

If you would prefer to override the hardening level on a per-translation-unit
basis, you must do so **before** including any headers to avoid `ODR issues`_.

.. _`ODR issues`: https://en.cppreference.com/w/cpp/language/definition#:~:text=is%20ill%2Dformed.-,One%20Definition%20Rule,-Only%20one%20definition
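
As a concrete sketch of the per-translation-unit override described above (the
surrounding function is illustrative, not taken from the libc++ sources):

.. code-block:: cpp

   // This translation unit opts into the debug mode. The macro must be defined
   // before any libc++ header is included; defining it after an include risks
   // the ODR issues mentioned above.
   #define _LIBCPP_HARDENING_MODE _LIBCPP_HARDENING_MODE_DEBUG
   #include <span>

   int first_element(std::span<const int> s) {
     return s[0]; // subject to the checks of the mode selected for this TU
   }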

.. note::

Since the static and shared library components of libc++ are built by the
vendor, setting this macro will have no impact on the hardening mode for the
pre-built components. Most libc++ code is header-based, so a user-provided
value for ``_LIBCPP_HARDENING_MODE`` will be mostly respected.

Notes for vendors
-----------------

Vendors can set the default hardening mode by providing ``LIBCXX_HARDENING_MODE``
as a configuration option, with the possible values of ``none``, ``fast``,
``extensive`` and ``debug``. The default value is ``none`` which doesn't enable
any hardening checks (this mode is sometimes called the ``unchecked`` mode).

This option controls both the hardening mode that the precompiled library is
built with and the default hardening mode that users will build with. If set to
``none``, the precompiled library will not contain any assertions, and user code
will default to building without assertions.

Iterator bounds checking
------------------------

TODO(hardening)
64 changes: 2 additions & 62 deletions lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -142,54 +142,6 @@ static bool ShouldIgnoreArtificialField(llvm::StringRef FieldName) {
|| FieldName.starts_with("_vptr.");
}

std::optional<DWARFFormValue>
DWARFASTParserClang::FindConstantOnVariableDefinition(DWARFDIE die) {
assert(die.Tag() == DW_TAG_member || die.Tag() == DW_TAG_variable);

auto *dwarf = die.GetDWARF();
if (!dwarf)
return {};

ConstString name{die.GetName()};
if (!name)
return {};

auto *CU = die.GetCU();
if (!CU)
return {};

DWARFASTParser *dwarf_ast = dwarf->GetDWARFParser(*CU);
auto parent_decl_ctx = dwarf_ast->GetDeclContextContainingUIDFromDWARF(die);

// Make sure we populate the GetDieToVariable cache.
VariableList variables;
dwarf->FindGlobalVariables(name, parent_decl_ctx, UINT_MAX, variables);

// The cache contains the variable definition whose DW_AT_specification
// points to our declaration DIE. Look up that definition using our
// declaration.
auto const &die_to_var = dwarf->GetDIEToVariable();
auto it = die_to_var.find(die.GetDIE());
if (it == die_to_var.end())
return {};

auto var_sp = it->getSecond();
assert(var_sp != nullptr);

if (!var_sp->GetLocationIsConstantValueData())
return {};

auto def = dwarf->GetDIE(var_sp->GetID());
auto def_attrs = def.GetAttributes();
DWARFFormValue form_value;
if (!def_attrs.ExtractFormValueAtIndex(
def_attrs.FindAttributeIndex(llvm::dwarf::DW_AT_const_value),
form_value))
return {};

return form_value;
}

TypeSP DWARFASTParserClang::ParseTypeFromClangModule(const SymbolContext &sc,
const DWARFDIE &die,
Log *log) {
@@ -2916,23 +2868,11 @@ void DWARFASTParserClang::CreateStaticMemberVariable(

bool unused;
// TODO: Support float/double static members as well.
if (!ct.IsIntegerOrEnumerationType(unused))
if (!ct.IsIntegerOrEnumerationType(unused) || !attrs.const_value_form)
return;

auto maybe_const_form_value = attrs.const_value_form;

// Newer versions of Clang don't emit the DW_AT_const_value
// on the declaration of an inline static data member. Instead
// it's attached to the definition DIE. If that's the case,
// try and fetch it.
if (!maybe_const_form_value) {
maybe_const_form_value = FindConstantOnVariableDefinition(die);
if (!maybe_const_form_value)
return;
}

llvm::Expected<llvm::APInt> const_value_or_err =
ExtractIntFromFormValue(ct, *maybe_const_form_value);
ExtractIntFromFormValue(ct, *attrs.const_value_form);
if (!const_value_or_err) {
LLDB_LOG_ERROR(log, const_value_or_err.takeError(),
"Failed to add const value to variable {1}: {0}",
11 changes: 0 additions & 11 deletions lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
@@ -413,17 +413,6 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser {
lldb_private::CompilerType &class_clang_type,
const lldb::AccessType default_accesibility,
lldb_private::ClangASTImporter::LayoutInfo &layout_info);

/// Tries to find the definition DW_TAG_variable DIE of the specified
/// DW_TAG_member 'die'. If such definition exists, returns the
/// DW_AT_const_value of that definition if available. Returns std::nullopt
/// otherwise.
///
/// In newer versions of clang, DW_AT_const_value attributes are not attached
/// to the declaration of an inline static data-member anymore, but rather on
/// its definition. This function is used to locate said constant.
std::optional<lldb_private::plugin::dwarf::DWARFFormValue>
FindConstantOnVariableDefinition(lldb_private::plugin::dwarf::DWARFDIE die);
};

/// Parsed form of all attributes that are relevant for type reconstruction.
@@ -102,12 +102,9 @@ def test(self):
# it does not crash.
self.expect("image lookup -t A")

# For debug-info produced by older versions of clang, dsymutil strips the
# debug info for classes that only have const static data members without
# definitions.
@expectedFailureAll(
debug_info=["dsym"], compiler=["clang"], compiler_version=["<", "18.0"]
)
# dsymutil strips the debug info for classes that only have const static
# data members without locations.
@expectedFailureAll(debug_info=["dsym"])
def test_class_with_only_const_static(self):
self.build()
lldbutil.run_to_source_breakpoint(
@@ -123,10 +120,6 @@ def check_global_var(self, name: str, expect_type, expect_val):
self.assertEqual(varobj.type.name, expect_type)
self.assertEqual(varobj.value, expect_val)

# For debug-info produced by older versions of clang, inline static data members
# wouldn't get indexed into the Names accelerator table preventing LLDB from finding
# them.
@expectedFailureAll(compiler=["clang"], compiler_version=["<", "18.0"])
def test_inline_static_members(self):
self.build()
lldbutil.run_to_source_breakpoint(
@@ -174,9 +167,6 @@ def test_class_with_only_constexpr_static(self):
"ClassWithEnumAlias::enum_alias_alias", result_value="scoped_enum_case1"
)

# With older versions of Clang, LLDB fails to evaluate classes with only
# constexpr members when dsymutil is enabled
@expectedFailureAll(compiler=["clang"], compiler_version=["<", "18.0"])
def test_shadowed_static_inline_members(self):
"""Tests that the expression evaluator and SBAPI can both
correctly determine the requested inline static variable
6 changes: 2 additions & 4 deletions llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -77,13 +77,11 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, LegalizeAction Action) {
}

raw_ostream &LegalityQuery::print(raw_ostream &OS) const {
OS << Opcode << ", Tys={";
OS << "Opcode=" << Opcode << ", Tys={";
for (const auto &Type : Types) {
OS << Type << ", ";
}
OS << "}, Opcode=";

OS << Opcode << ", MMOs={";
OS << "}, MMOs={";
for (const auto &MMODescr : MMODescrs) {
OS << MMODescr.MemoryTy << ", ";
}
4 changes: 2 additions & 2 deletions llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -284,7 +284,7 @@ StaticLibraryDefinitionGenerator::Load(

// If this is a universal binary then search for a slice matching the given
// Triple.
if (auto *UB = cast<object::MachOUniversalBinary>(B->getBinary())) {
if (auto *UB = dyn_cast<object::MachOUniversalBinary>(B->getBinary())) {

const auto &TT = L.getExecutionSession().getTargetTriple();

@@ -347,7 +347,7 @@ StaticLibraryDefinitionGenerator::Create(

// If this is a universal binary then search for a slice matching the given
// Triple.
if (auto *UB = cast<object::MachOUniversalBinary>(B->get())) {
if (auto *UB = dyn_cast<object::MachOUniversalBinary>(B->get())) {

const auto &TT = L.getExecutionSession().getTargetTriple();

7 changes: 4 additions & 3 deletions llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -2256,9 +2256,10 @@ void ReassociatePass::OptimizeInst(Instruction *I) {
// with no common bits set, convert it to X+Y.
if (I->getOpcode() == Instruction::Or &&
shouldConvertOrWithNoCommonBitsToAdd(I) && !isLoadCombineCandidate(I) &&
haveNoCommonBitsSet(I->getOperand(0), I->getOperand(1),
SimplifyQuery(I->getModule()->getDataLayout(),
/*DT=*/nullptr, /*AC=*/nullptr, I))) {
(cast<PossiblyDisjointInst>(I)->isDisjoint() ||
haveNoCommonBitsSet(I->getOperand(0), I->getOperand(1),
SimplifyQuery(I->getModule()->getDataLayout(),
/*DT=*/nullptr, /*AC=*/nullptr, I)))) {
Instruction *NI = convertOrWithNoCommonBitsToAdd(I);
RedoInsts.insert(I);
MadeChange = true;
@@ -0,0 +1,4 @@
define i32 @main(i32 %argc, i8** %argv) {
entry:
ret i32 0
}
@@ -0,0 +1,6 @@
# RUN: llc -filetype=obj -o %t.o %S/Inputs/main-ret-0.ll
# RUN: cp %t.o %t.a
# RUN: not llvm-jitlink -noexec %t.o %t.a
#
# Try to load an object file as if it were an archive. Should result in an
# error, rather than a crash.
12 changes: 12 additions & 0 deletions llvm/test/Transforms/Reassociate/add-like-or.ll
@@ -59,6 +59,18 @@ define i32 @test3(i32 %x, i32 %bit) {
ret i32 %res
}

; Test that the disjoint flag allows reassociation.
define i32 @test4(i32 %a, i32 %b) {
; CHECK-LABEL: @test4(
; CHECK-NEXT: [[C:%.*]] = add i32 [[A:%.*]], 1
; CHECK-NEXT: [[C_PLUS_ONE:%.*]] = add i32 [[C]], [[B:%.*]]
; CHECK-NEXT: ret i32 [[C_PLUS_ONE]]
;
%c = or disjoint i32 %a, %b
%c.plus.one = add i32 %c, 1
ret i32 %c.plus.one
}

declare i32 @llvm.ctlz.i32(i32, i1 immarg) #2

!0 = !{i32 0, i32 33}
28 changes: 22 additions & 6 deletions mlir/docs/Dialects/ArmSME.md
@@ -1,13 +1,29 @@
# 'ArmSME' Dialect

[TOC]
Basic dialect to target Arm SME.

This dialect defines custom and LLVM IR intrinsic operations that are used to
target the Arm Scalable Matrix Extension. Through the available conversion and
ArmSME passes you can, for example, lower a
[linalg.matmul](https://mlir.llvm.org/docs/Dialects/Linalg/#linalgmatmul-linalgmatmulop)
operation to Arm SME
[FMOPA](https://developer.arm.com/documentation/ddi0602/2023-03/SME-Instructions/FMOPA--widening---Half-precision-floating-point-sum-of-outer-products-and-accumulate-)
(floating-point outer product) operations. See one of the in-tree end-to-end
integration tests for reference:

* [Linalg/CPU/ArmSME/matmul.mlir](https://github.com/llvm/llvm-project/blob/main/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir)
* [Vector/CPU/ArmSME/test-outerproduct-f64.mlir](https://github.com/llvm/llvm-project/blob/main/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir)

Basic dialect to target Arm SME architectures. This dialect contains the
definitions necessary to target Arm SME scalable matrix operations.
These tests are run "post-commit" by the
[clang-aarch64-sve-vla](https://lab.llvm.org/buildbot/#/builders/197) LLVM
BuildBot worker.

## References
* https://developer.arm.com/documentation/ddi0616
* https://developer.arm.com/documentation/ddi0602/2023-03/SME-Instructions
**References:**

* [The Scalable Matrix Extension (SME), for Armv9-A](https://developer.arm.com/documentation/ddi0616)
* [A64 -- SME Instructions (alphabetic order)](https://developer.arm.com/documentation/ddi0602/2023-03/SME-Instructions)

[TOC]

## Operations

@@ -2038,6 +2038,7 @@ def VectorizeChildrenAndApplyPatternsOp :
let arguments = (ins TransformHandleTypeInterface:$target,
UnitAttr:$vectorize_padding,
UnitAttr:$vectorize_nd_extract,
UnitAttr:$flatten_1d_depthwise_conv,
UnitAttr:$disable_multi_reduction_to_contract_patterns,
UnitAttr:$disable_transfer_permutation_map_lowering_patterns);
let results = (outs TransformHandleTypeInterface:$transformed);
@@ -2049,7 +2050,8 @@ def VectorizeChildrenAndApplyPatternsOp :
let builders = [
OpBuilder<(ins "Value":$target,
CArg<"bool", "false">:$vectorizePadding,
CArg<"bool", "false">:$vectorizeNDExtract)>,
CArg<"bool", "false">:$vectorizeNDExtract,
CArg<"bool", "false">:$flatten1DDepthwise)>
];
let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(
3 changes: 2 additions & 1 deletion mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -753,7 +753,8 @@ LogicalResult deallocateGPUPrivateMemory(OpBuilder &, Value /*buffer*/);
LogicalResult vectorize(RewriterBase &rewriter, Operation *op,
ArrayRef<int64_t> inputVectorSizes = {},
ArrayRef<bool> inputScalableVecDims = {},
bool vectorizeNDExtract = false);
bool vectorizeNDExtract = false,
bool flatten1DDepthwiseConv = false);

/// Emit a suitable vector form for a Copy op with fully static shape.
LogicalResult vectorizeCopy(RewriterBase &builder, memref::CopyOp copyOp);
2 changes: 1 addition & 1 deletion mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -156,7 +156,7 @@ def DeclareActionAttr : OpenACC_Attr<"DeclareAction", "declare_action"> {
}

// Device type enumeration.
def OpenACC_DeviceTypeStar : I32EnumAttrCase<"Star", 0, "*">;
def OpenACC_DeviceTypeStar : I32EnumAttrCase<"Star", 0, "star">;
def OpenACC_DeviceTypeDefault : I32EnumAttrCase<"Default", 1, "default">;
def OpenACC_DeviceTypeHost : I32EnumAttrCase<"Host", 2, "host">;
def OpenACC_DeviceTypeMulticore : I32EnumAttrCase<"Multicore", 3, "multicore">;
6 changes: 6 additions & 0 deletions mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBitOps.td
@@ -368,6 +368,8 @@ def SPIRV_ShiftLeftLogicalOp : SPIRV_ShiftOp<"ShiftLeftLogical",
%5 = spirv.ShiftLeftLogical %3, %4 : vector<3xi32>, vector<3xi16>
```
}];

let hasFolder = 1;
}

// -----
@@ -399,6 +401,8 @@ def SPIRV_ShiftRightArithmeticOp : SPIRV_ShiftOp<"ShiftRightArithmetic",
%5 = spirv.ShiftRightArithmetic %3, %4 : vector<3xi32>, vector<3xi16>
```
}];

let hasFolder = 1;
}

// -----
@@ -431,6 +435,8 @@ def SPIRV_ShiftRightLogicalOp : SPIRV_ShiftOp<"ShiftRightLogical",
%5 = spirv.ShiftRightLogical %3, %4 : vector<3xi32>, vector<3xi16>
```
}];

let hasFolder = 1;
}

// -----
24 changes: 19 additions & 5 deletions mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -2946,7 +2946,7 @@ LogicalResult TileUsingForallOp::verify() {

void transform::VectorizeChildrenAndApplyPatternsOp::build(
OpBuilder &builder, OperationState &result, Value target,
bool vectorizePadding, bool vectorizeExtract) {
bool vectorizePadding, bool vectorizeExtract, bool flatten1DDepthwiseConv) {
result.addOperands(target);
if (vectorizePadding) {
result.addAttribute(
@@ -2960,6 +2960,12 @@ void transform::VectorizeChildrenAndApplyPatternsOp::build(
result.name),
builder.getUnitAttr());
}
if (flatten1DDepthwiseConv) {
result.addAttribute(
VectorizeChildrenAndApplyPatternsOp::getFlatten_1dDepthwiseConvAttrName(
result.name),
builder.getUnitAttr());
}
result.addTypes(transform::AnyOpType::get(builder.getContext()));
}

@@ -2968,22 +2974,29 @@ namespace {
/// VectorizeChildrenAndApplyPatternsOp::applyToOne.
struct VectorizationPattern : public RewritePattern {
explicit VectorizationPattern(MLIRContext *context,
bool vectorizeExtract = false)
bool vectorizeExtract = false,
bool flattenConv = false)
: RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, context),
vectorizeNDExtract(vectorizeExtract) {}
vectorizeNDExtract(vectorizeExtract),
flatten1DDepthwiseConv(flattenConv) {}
LogicalResult matchAndRewrite(Operation *op,
PatternRewriter &rewriter) const override {
LinalgOp linalgOp = dyn_cast<LinalgOp>(op);
if (!linalgOp)
return rewriter.notifyMatchFailure(op, "expected Linalg Op");
return vectorize(rewriter, linalgOp, /*inputVectorSizes=*/{},
/*scalableVecDims=*/{}, vectorizeNDExtract);
/*scalableVecDims=*/{}, vectorizeNDExtract,
flatten1DDepthwiseConv);
}

private:
/// Controls whether to vectorize `tensor.extract` when the input tensor is
/// rank >= 2.
bool vectorizeNDExtract = false;
/// Controls whether to "flatten" the channel dimension when vectorising 1D
/// depthwise convolutions. This should lead to better vectorization for
/// tensors with a low number of channel dimensions.
bool flatten1DDepthwiseConv = false;
};
} // namespace

Expand All @@ -3000,7 +3013,8 @@ transform::VectorizeChildrenAndApplyPatternsOp::applyToOne(

MLIRContext *ctx = getContext();
RewritePatternSet patterns(ctx);
patterns.add<VectorizationPattern>(ctx, getVectorizeNdExtract());
patterns.add<VectorizationPattern>(ctx, getVectorizeNdExtract(),
getFlatten_1dDepthwiseConv());

if (!getDisableTransferPermutationMapLoweringPatterns())
vector::populateVectorTransferPermutationMapLoweringPatterns(patterns);
77 changes: 55 additions & 22 deletions mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -44,8 +44,9 @@ using namespace mlir::linalg;
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

/// Try to vectorize `convOp` as a convolution.
static FailureOr<Operation *> vectorizeConvolution(RewriterBase &rewriter,
LinalgOp convOp);
static FailureOr<Operation *>
vectorizeConvolution(RewriterBase &rewriter, LinalgOp convOp,
bool flatten1DDepthwiseConv = false);

/// Return the unique instance of OpType in `block` if it is indeed unique.
/// Return null if none or more than 1 instances exist.
@@ -1664,7 +1665,8 @@ static void convertAffineApply(RewriterBase &rewriter, LinalgOp linalgOp) {
LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op,
ArrayRef<int64_t> inputVectorSizes,
ArrayRef<bool> inputScalableVecDims,
bool vectorizeNDExtract) {
bool vectorizeNDExtract,
bool flatten1DDepthwiseConv) {
LDBG("Attempting to vectorize:\n" << *op << "\n");
LDBG("Input vector sizes: ");
LLVM_DEBUG(llvm::interleaveComma(inputVectorSizes, llvm::dbgs()));
@@ -1696,8 +1698,8 @@ LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op,
// TODO: isaConvolutionOpInterface that can also infer from generic
// features. Will require stride/dilation attributes inference.
if (isa<ConvolutionOpInterface>(linalgOp.getOperation())) {
FailureOr<Operation *> convOr =
vectorizeConvolution(rewriter, linalgOp);
FailureOr<Operation *> convOr = vectorizeConvolution(
rewriter, linalgOp, flatten1DDepthwiseConv);
if (succeeded(convOr)) {
llvm::append_range(results, (*convOr)->getResults());
return success();
@@ -2822,7 +2824,7 @@ struct Conv1DGenerator
/// kw is always unrolled.
/// TODO: w (resp. kw) is unrolled when the strideW ( resp. dilationW) is
/// > 1.
FailureOr<Operation *> depthwiseConv() {
FailureOr<Operation *> depthwiseConv(bool flatten) {
if (!valid)
return rewriter.notifyMatchFailure(op, "unvectorizable depthwise conv");

@@ -2869,15 +2871,17 @@ struct Conv1DGenerator
//===------------------------------------------------------------------===//
// Unroll along kw and read slices of lhs and rhs.
SmallVector<Value> lhsVals, rhsVals, resVals;
auto inOutSliceSizes = SmallVector<int64_t>{nSize, wSizeStep, cSize};
auto inOutStrides = SmallVector<int64_t>{1, 1, 1};

// Extract lhs slice of size {n, wSizeStep, c}
// @ [0, sw * w + dw * kw, 0].
for (int64_t kw = 0; kw < kwSize; ++kw) {
for (int64_t w = 0; w < wSize; w += wSizeStep) {
lhsVals.push_back(rewriter.create<vector::ExtractStridedSliceOp>(
loc, lhs,
/*offsets=*/ArrayRef<int64_t>{0, w * strideW + kw * dilationW, 0},
/*sizes=*/ArrayRef<int64_t>{nSize, wSizeStep, cSize},
/*strides=*/ArrayRef<int64_t>{1, 1, 1}));
inOutSliceSizes, inOutStrides));
}
}
// Extract rhs slice of size {c} @ [kw].
@@ -2889,21 +2893,39 @@ struct Conv1DGenerator
for (int64_t w = 0; w < wSize; w += wSizeStep) {
resVals.push_back(rewriter.create<vector::ExtractStridedSliceOp>(
loc, res,
/*offsets=*/ArrayRef<int64_t>{0, w, 0},
/*sizes=*/ArrayRef<int64_t>{nSize, wSizeStep, cSize},
/*strides=*/ArrayRef<int64_t>{1, 1, 1}));
/*offsets=*/ArrayRef<int64_t>{0, w, 0}, inOutSliceSizes,
inOutStrides));
}

auto linearIndex = [&](int64_t kw, int64_t w) {
return kw * (wSize / wSizeStep) + w;
};

auto inOutFlattenSliceSizes =
SmallVector<int64_t>{nSize, wSizeStep * cSize};
auto lhsCastType = VectorType::get(inOutFlattenSliceSizes, lhsEltType);
auto resCastType = VectorType::get(inOutFlattenSliceSizes, resEltType);
// Compute contraction: O{n, w, c} += I{n, sw * w + dw * kw, c} * F{c}
for (int64_t kw = 0; kw < kwSize; ++kw) {
for (int64_t w = 0; w < wSize; w += wSizeStep) {
resVals[w] = depthwiseConv1dSliceAsMulAcc(rewriter, loc,
lhsVals[linearIndex(kw, w)],
rhsVals[kw], resVals[w]);
Value lhsVal = lhsVals[linearIndex(kw, w)];
Value resVal = resVals[w];
ShapedType filterBCastTy = cast<ShapedType>(resVal.getType());
if (flatten) {
// Flatten the input and filter vectors (collapse the channel
// dimension)
lhsVal = rewriter.create<vector::ShapeCastOp>(
loc, lhsCastType, lhsVals[linearIndex(kw, w)]);
resVal = rewriter.create<vector::ShapeCastOp>(loc, resCastType,
resVals[w]);
}
resVals[w] = depthwiseConv1dSliceAsMulAcc(
rewriter, loc, lhsVal, rhsVals[kw], resVal, filterBCastTy, flatten);
if (flatten) {
// Un-flatten the output vector (restore the channel dimension)
resVals[w] = rewriter.create<vector::ShapeCastOp>(
loc, VectorType::get(inOutSliceSizes, resEltType), resVals[w]);
}
}
}

@@ -2936,17 +2958,27 @@ struct Conv1DGenerator
.getOperation();
}

/// Lower lhs{n, w, c} * rhs{c} -> res{n, w, c} to MulAcc
/// Lower:
/// * lhs{n, w, c} * rhs{c} -> res{n, w, c} (flatten = false)
/// * lhs{n, w * c} * rhs{c} -> res{n, w * c} (flatten = true)
/// to MulAcc.
Value depthwiseConv1dSliceAsMulAcc(RewriterBase &rewriter, Location loc,
Value lhs, Value rhs, Value res) {
Value lhs, Value rhs, Value res,
ShapedType bcastTy, bool flatten) {
auto rhsTy = cast<ShapedType>(rhs.getType());
auto resTy = cast<ShapedType>(res.getType());

// TODO(suderman): Change this to use a vector.ima intrinsic.
lhs = promote(rewriter, loc, lhs, resTy);

rhs = rewriter.create<vector::BroadcastOp>(
loc, resTy.clone(rhsTy.getElementType()), rhs);
loc, bcastTy.clone(rhsTy.getElementType()), rhs);
if (flatten) {
// Flatten the channel dimension
rhs = rewriter.create<vector::ShapeCastOp>(
loc, resTy.clone(rhsTy.getElementType()), rhs);
}

rhs = promote(rewriter, loc, rhs, resTy);

if (!lhs || !rhs)
@@ -3049,7 +3081,7 @@ struct Conv1DGenerator

/// Entry point that transposes into the common form:
/// {{n, strideW * w + dilationW * kw, c}, {kw, c}, {n, w, c}}
FailureOr<Operation *> generateDilatedConv() {
FailureOr<Operation *> generateDilatedConv(bool flatten = false) {
AffineExpr n, w, c, kw;
bindDims(ctx, n, w, c, kw);
if (!iters({Par(), Par(), Par(), Red()}))
@@ -3060,7 +3092,7 @@ struct Conv1DGenerator
if (layout({/*lhsIndex*/ {n, strideW * w + dilationW * kw, c},
/*rhsIndex*/ {kw, c},
/*resIndex*/ {n, w, c}}))
return depthwiseConv();
return depthwiseConv(flatten);

return rewriter.notifyMatchFailure(op, "not a depthwise::Nwc layout");
}
@@ -3125,8 +3157,9 @@ struct Conv1DGenerator

/// Helper function to vectorize a LinalgOp with convolution semantics.
// TODO: extend the generic vectorization to support windows and drop this.
static FailureOr<Operation *> vectorizeConvolution(RewriterBase &rewriter,
LinalgOp op) {
static FailureOr<Operation *>
vectorizeConvolution(RewriterBase &rewriter, LinalgOp op,
bool flatten1DDepthwiseConv) {
// The ConvolutionOpInterface gives us guarantees of existence for
// strides/dilations. However, we do not need to rely on those, we can simply
// use them if present, otherwise use the default and let the generic conv.
@@ -3151,7 +3184,7 @@ static FailureOr<Operation *> vectorizeConvolution(RewriterBase &rewriter,
res = e.generateNcwPooling();
if (succeeded(res))
return res;
return e.generateDilatedConv();
return e.generateDilatedConv(flatten1DDepthwiseConv);
}

struct VectorizeConvolution : public OpInterfaceRewritePattern<LinalgOp> {
102 changes: 102 additions & 0 deletions mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp
@@ -709,6 +709,108 @@ OpFoldResult spirv::LogicalOrOp::fold(FoldAdaptor adaptor) {
return Attribute();
}

//===----------------------------------------------------------------------===//
// spirv.ShiftLeftLogical
//===----------------------------------------------------------------------===//

OpFoldResult spirv::ShiftLeftLogicalOp::fold(
spirv::ShiftLeftLogicalOp::FoldAdaptor adaptor) {
// x << 0 -> x
if (matchPattern(adaptor.getOperand2(), m_Zero())) {
return getOperand1();
}

  // Unfortunately, due to the undefined behaviour described below, we can't fold 0 for Base.

// According to the SPIR-V spec:
//
// Type is a scalar or vector of integer type.
// Results are computed per component, and within each component, per bit...
//
// The result is undefined if Shift is greater than or equal to the bit width
// of the components of Base.
//
  // So we can use the APInt << method, but don't fold if the behaviour would be undefined.
bool shiftToLarge = false;
auto res = constFoldBinaryOp<IntegerAttr>(
adaptor.getOperands(), [&](const APInt &a, const APInt &b) {
if (shiftToLarge || b.uge(a.getBitWidth())) {
shiftToLarge = true;
return a;
}
return a << b;
});
return shiftToLarge ? Attribute() : res;
}

//===----------------------------------------------------------------------===//
// spirv.ShiftRightArithmetic
//===----------------------------------------------------------------------===//

OpFoldResult spirv::ShiftRightArithmeticOp::fold(
spirv::ShiftRightArithmeticOp::FoldAdaptor adaptor) {
// x >> 0 -> x
if (matchPattern(adaptor.getOperand2(), m_Zero())) {
return getOperand1();
}

  // Unfortunately, due to the undefined behaviour described below, we can't fold 0 or -1 for Base.

// According to the SPIR-V spec:
//
// Type is a scalar or vector of integer type.
// Results are computed per component, and within each component, per bit...
//
// The result is undefined if Shift is greater than or equal to the bit width
// of the components of Base.
//
  // So we can use the APInt ashr method, but don't fold if the behaviour would be undefined.
bool shiftToLarge = false;
auto res = constFoldBinaryOp<IntegerAttr>(
adaptor.getOperands(), [&](const APInt &a, const APInt &b) {
if (shiftToLarge || b.uge(a.getBitWidth())) {
shiftToLarge = true;
return a;
}
return a.ashr(b);
});
return shiftToLarge ? Attribute() : res;
}

//===----------------------------------------------------------------------===//
// spirv.ShiftRightLogical
//===----------------------------------------------------------------------===//

OpFoldResult spirv::ShiftRightLogicalOp::fold(
spirv::ShiftRightLogicalOp::FoldAdaptor adaptor) {
// x >> 0 -> x
if (matchPattern(adaptor.getOperand2(), m_Zero())) {
return getOperand1();
}

  // Unfortunately, due to the undefined behaviour described below, we can't fold 0 for Base.

// According to the SPIR-V spec:
//
// Type is a scalar or vector of integer type.
// Results are computed per component, and within each component, per bit...
//
// The result is undefined if Shift is greater than or equal to the bit width
// of the components of Base.
//
  // So we can use the APInt lshr method, but don't fold if the behaviour would be undefined.
bool shiftToLarge = false;
auto res = constFoldBinaryOp<IntegerAttr>(
adaptor.getOperands(), [&](const APInt &a, const APInt &b) {
if (shiftToLarge || b.uge(a.getBitWidth())) {
shiftToLarge = true;
return a;
}
return a.lshr(b);
});
return shiftToLarge ? Attribute() : res;
}

//===----------------------------------------------------------------------===//
// spirv.mlir.selection
//===----------------------------------------------------------------------===//
2 changes: 1 addition & 1 deletion mlir/lib/Dialect/SparseTensor/Transforms/CodegenEnv.h
@@ -108,7 +108,7 @@ class CodegenEnv {
return loopEmitter.unpackTensorLevelRange(std::forward<ContainerTy>(c));
}

unsigned getLoopDepth() const { return loopEmitter.getCurrentDepth(); }
unsigned getCurrentDepth() const { return loopEmitter.getCurrentDepth(); }

//
// Code generation environment verify functions.
334 changes: 166 additions & 168 deletions mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp

Large diffs are not rendered by default.

309 changes: 309 additions & 0 deletions mlir/test/Dialect/Linalg/vectorize-convolution-flatten.mlir

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions mlir/test/Dialect/OpenACC/ops.mlir
@@ -974,7 +974,7 @@ func.func @testupdateop(%a: memref<f32>, %b: memref<f32>, %c: memref<f32>) -> ()
acc.update async(%idxValue: index) dataOperands(%0: memref<f32>)
acc.update wait_devnum(%i64Value: i64) wait(%i32Value, %idxValue : i32, index) dataOperands(%0: memref<f32>)
acc.update if(%ifCond) dataOperands(%0: memref<f32>)
acc.update dataOperands(%0: memref<f32>) attributes {acc.device_types = [#acc.device_type<nvidia>]}
acc.update dataOperands(%0: memref<f32>) attributes {acc.device_types = [#acc.device_type<star>]}
acc.update dataOperands(%0, %1, %2 : memref<f32>, memref<f32>, memref<f32>)
acc.update dataOperands(%0, %1, %2 : memref<f32>, memref<f32>, memref<f32>) attributes {async}
acc.update dataOperands(%0, %1, %2 : memref<f32>, memref<f32>, memref<f32>) attributes {wait}
@@ -993,7 +993,7 @@ func.func @testupdateop(%a: memref<f32>, %b: memref<f32>, %c: memref<f32>) -> ()
// CHECK: acc.update async([[IDXVALUE]] : index) dataOperands(%{{.*}} : memref<f32>)
// CHECK: acc.update wait_devnum([[I64VALUE]] : i64) wait([[I32VALUE]], [[IDXVALUE]] : i32, index) dataOperands(%{{.*}} : memref<f32>)
// CHECK: acc.update if([[IFCOND]]) dataOperands(%{{.*}} : memref<f32>)
// CHECK: acc.update dataOperands(%{{.*}} : memref<f32>) attributes {acc.device_types = [#acc.device_type<nvidia>]}
// CHECK: acc.update dataOperands(%{{.*}} : memref<f32>) attributes {acc.device_types = [#acc.device_type<star>]}
// CHECK: acc.update dataOperands(%{{.*}}, %{{.*}}, %{{.*}} : memref<f32>, memref<f32>, memref<f32>)
// CHECK: acc.update dataOperands(%{{.*}}, %{{.*}}, %{{.*}} : memref<f32>, memref<f32>, memref<f32>) attributes {async}
// CHECK: acc.update dataOperands(%{{.*}}, %{{.*}}, %{{.*}} : memref<f32>, memref<f32>, memref<f32>) attributes {wait}
178 changes: 178 additions & 0 deletions mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir
@@ -1139,6 +1139,184 @@ func.func @convert_logical_or_true_false_vector(%arg: vector<3xi1>) -> (vector<3

// -----

//===----------------------------------------------------------------------===//
// spirv.ShiftLeftLogical
//===----------------------------------------------------------------------===//

// CHECK-LABEL: @lsl_x_0
// CHECK-SAME: (%[[ARG0:.*]]: i32, %[[ARG1:.*]]: vector<3xi32>)
func.func @lsl_x_0(%arg0 : i32, %arg1: vector<3xi32>) -> (i32, vector<3xi32>) {
%c0 = spirv.Constant 0 : i32
%cv0 = spirv.Constant dense<0> : vector<3xi32>

%0 = spirv.ShiftLeftLogical %arg0, %c0 : i32, i32
%1 = spirv.ShiftLeftLogical %arg1, %cv0 : vector<3xi32>, vector<3xi32>

// CHECK: return %[[ARG0]], %[[ARG1]]
return %0, %1 : i32, vector<3xi32>
}

// CHECK-LABEL: @lsl_shift_overflow
// CHECK-SAME: (%[[ARG0:.*]]: i32, %[[ARG1:.*]]: vector<3xi32>)
func.func @lsl_shift_overflow(%arg0: i32, %arg1: vector<3xi32>) -> (i32, vector<3xi32>) {
// CHECK-DAG: %[[C32:.*]] = spirv.Constant 32
// CHECK-DAG: %[[CV:.*]] = spirv.Constant dense<[6, 18, 128]>
%c32 = spirv.Constant 32 : i32
%cv = spirv.Constant dense<[6, 18, 128]> : vector<3xi32>

// CHECK: %0 = spirv.ShiftLeftLogical %[[ARG0]], %[[C32]]
// CHECK: %1 = spirv.ShiftLeftLogical %[[ARG1]], %[[CV]]
%0 = spirv.ShiftLeftLogical %arg0, %c32 : i32, i32
%1 = spirv.ShiftLeftLogical %arg1, %cv : vector<3xi32>, vector<3xi32>

return %0, %1 : i32, vector<3xi32>
}

// CHECK-LABEL: @const_fold_scalar_lsl
func.func @const_fold_scalar_lsl() -> i32 {
%c1 = spirv.Constant 65535 : i32 // 0x0000 ffff
%c2 = spirv.Constant 17 : i32

// CHECK: %[[RET:.*]] = spirv.Constant -131072
// 0x0000 ffff << 17 -> 0xfffe 0000
%0 = spirv.ShiftLeftLogical %c1, %c2 : i32, i32

// CHECK: return %[[RET]]
return %0 : i32
}

// CHECK-LABEL: @const_fold_vector_lsl
func.func @const_fold_vector_lsl() -> vector<3xi32> {
%c1 = spirv.Constant dense<[1, -1, 127]> : vector<3xi32>
%c2 = spirv.Constant dense<[31, 16, 13]> : vector<3xi32>

// CHECK: %[[RET:.*]] = spirv.Constant dense<[-2147483648, -65536, 1040384]>
%0 = spirv.ShiftLeftLogical %c1, %c2 : vector<3xi32>, vector<3xi32>

// CHECK: return %[[RET]]
return %0 : vector<3xi32>
}

// -----

//===----------------------------------------------------------------------===//
// spirv.ShiftRightArithmetic
//===----------------------------------------------------------------------===//

// CHECK-LABEL: @asr_x_0
// CHECK-SAME: (%[[ARG0:.*]]: i32, %[[ARG1:.*]]: vector<3xi32>)
func.func @asr_x_0(%arg0 : i32, %arg1: vector<3xi32>) -> (i32, vector<3xi32>) {
%c0 = spirv.Constant 0 : i32
%cv0 = spirv.Constant dense<0> : vector<3xi32>

%0 = spirv.ShiftRightArithmetic %arg0, %c0 : i32, i32
%1 = spirv.ShiftRightArithmetic %arg1, %cv0 : vector<3xi32>, vector<3xi32>

// CHECK: return %[[ARG0]], %[[ARG1]]
return %0, %1 : i32, vector<3xi32>
}

// CHECK-LABEL: @asr_shift_overflow
// CHECK-SAME: (%[[ARG0:.*]]: i32, %[[ARG1:.*]]: vector<3xi32>)
func.func @asr_shift_overflow(%arg0: i32, %arg1: vector<3xi32>) -> (i32, vector<3xi32>) {
// CHECK-DAG: %[[C32:.*]] = spirv.Constant 32
// CHECK-DAG: %[[CV:.*]] = spirv.Constant dense<[6, 18, 128]>
%c32 = spirv.Constant 32 : i32
%cv = spirv.Constant dense<[6, 18, 128]> : vector<3xi32>

// CHECK: %0 = spirv.ShiftRightArithmetic %[[ARG0]], %[[C32]]
// CHECK: %1 = spirv.ShiftRightArithmetic %[[ARG1]], %[[CV]]
%0 = spirv.ShiftRightArithmetic %arg0, %c32 : i32, i32
%1 = spirv.ShiftRightArithmetic %arg1, %cv : vector<3xi32>, vector<3xi32>

return %0, %1 : i32, vector<3xi32>
}

// CHECK-LABEL: @const_fold_scalar_asr
func.func @const_fold_scalar_asr() -> i32 {
%c1 = spirv.Constant -131072 : i32 // 0xfffe 0000
%c2 = spirv.Constant 17 : i32
  // 0xfffe 0000 ashr 17 -> 0xffff ffff
// CHECK: %[[RET:.*]] = spirv.Constant -1
%0 = spirv.ShiftRightArithmetic %c1, %c2 : i32, i32

// CHECK: return %[[RET]]
return %0 : i32
}

// CHECK-LABEL: @const_fold_vector_asr
func.func @const_fold_vector_asr() -> vector<3xi32> {
%c1 = spirv.Constant dense<[-2147483648, 239847, 127]> : vector<3xi32>
%c2 = spirv.Constant dense<[31, 16, 13]> : vector<3xi32>

// CHECK: %[[RET:.*]] = spirv.Constant dense<[-1, 3, 0]>
%0 = spirv.ShiftRightArithmetic %c1, %c2 : vector<3xi32>, vector<3xi32>

// CHECK: return %[[RET]]
return %0 : vector<3xi32>
}

// -----

//===----------------------------------------------------------------------===//
// spirv.ShiftRightLogical
//===----------------------------------------------------------------------===//

// CHECK-LABEL: @lsr_x_0
// CHECK-SAME: (%[[ARG0:.*]]: i32, %[[ARG1:.*]]: vector<3xi32>)
func.func @lsr_x_0(%arg0 : i32, %arg1: vector<3xi32>) -> (i32, vector<3xi32>) {
%c0 = spirv.Constant 0 : i32
%cv0 = spirv.Constant dense<0> : vector<3xi32>

%0 = spirv.ShiftRightLogical %arg0, %c0 : i32, i32
%1 = spirv.ShiftRightLogical %arg1, %cv0 : vector<3xi32>, vector<3xi32>

// CHECK: return %[[ARG0]], %[[ARG1]]
return %0, %1 : i32, vector<3xi32>
}

// CHECK-LABEL: @lsr_shift_overflow
// CHECK-SAME: (%[[ARG0:.*]]: i32, %[[ARG1:.*]]: vector<3xi32>)
func.func @lsr_shift_overflow(%arg0: i32, %arg1: vector<3xi32>) -> (i32, vector<3xi32>) {
// CHECK-DAG: %[[C32:.*]] = spirv.Constant 32
// CHECK-DAG: %[[CV:.*]] = spirv.Constant dense<[6, 18, 128]>
%c32 = spirv.Constant 32 : i32
%cv = spirv.Constant dense<[6, 18, 128]> : vector<3xi32>

// CHECK: %0 = spirv.ShiftRightLogical %[[ARG0]], %[[C32]]
// CHECK: %1 = spirv.ShiftRightLogical %[[ARG1]], %[[CV]]
%0 = spirv.ShiftRightLogical %arg0, %c32 : i32, i32
%1 = spirv.ShiftRightLogical %arg1, %cv : vector<3xi32>, vector<3xi32>
return %0, %1 : i32, vector<3xi32>
}

// CHECK-LABEL: @const_fold_scalar_lsr
func.func @const_fold_scalar_lsr() -> i32 {
%c1 = spirv.Constant -131072 : i32 // 0xfffe 0000
%c2 = spirv.Constant 17 : i32

  // 0xfffe 0000 lshr 17 -> 0x0000 7fff
// CHECK: %[[RET:.*]] = spirv.Constant 32767
%0 = spirv.ShiftRightLogical %c1, %c2 : i32, i32

// CHECK: return %[[RET]]
return %0 : i32
}

// CHECK-LABEL: @const_fold_vector_lsr
func.func @const_fold_vector_lsr() -> vector<3xi32> {
%c1 = spirv.Constant dense<[-2147483648, -1, -127]> : vector<3xi32>
%c2 = spirv.Constant dense<[31, 16, 13]> : vector<3xi32>

// CHECK: %[[RET:.*]] = spirv.Constant dense<[1, 65535, 524287]>
%0 = spirv.ShiftRightLogical %c1, %c2 : vector<3xi32>, vector<3xi32>

// CHECK: return %[[RET]]
return %0 : vector<3xi32>
}

// -----

//===----------------------------------------------------------------------===//
// spirv.mlir.selection
//===----------------------------------------------------------------------===//
8 changes: 4 additions & 4 deletions openmp/libomptarget/CMakeLists.txt
@@ -110,10 +110,6 @@ set(LIBOMPTARGET_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
message(STATUS "OpenMP tools dir in libomptarget: ${LIBOMP_OMP_TOOLS_INCLUDE_DIR}")
include_directories(${LIBOMP_OMP_TOOLS_INCLUDE_DIR})

# Build target agnostic offloading library.
set(LIBOMPTARGET_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
add_subdirectory(${LIBOMPTARGET_SRC_DIR})

# Definitions for testing, for reuse when testing libomptarget-nvptx.
set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${LIBOMP_INCLUDE_DIR}" CACHE STRING
"Path to folder containing omp.h")
@@ -129,5 +125,9 @@ add_subdirectory(plugins-nextgen)
add_subdirectory(DeviceRTL)
add_subdirectory(tools)

# Build target agnostic offloading library.
set(LIBOMPTARGET_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
add_subdirectory(${LIBOMPTARGET_SRC_DIR})

# Add tests.
add_subdirectory(test)
21 changes: 0 additions & 21 deletions openmp/libomptarget/include/PluginManager.h
@@ -150,24 +150,6 @@ struct PluginManager {
HostPtrToTableMapTy HostPtrToTableMap;
std::mutex TblMapMtx; ///< For HostPtrToTableMap

// Work around for plugins that call dlopen on shared libraries that call
// tgt_register_lib during their initialisation. Stash the pointers in a
// vector until the plugins are all initialised and then register them.
bool delayRegisterLib(__tgt_bin_desc *Desc) {
if (RTLsLoaded)
return false;
DelayedBinDesc.push_back(Desc);
return true;
}

void registerDelayedLibraries() {
// Only called by libomptarget constructor
RTLsLoaded = true;
for (auto *Desc : DelayedBinDesc)
__tgt_register_lib(Desc);
DelayedBinDesc.clear();
}

/// Return the number of usable devices.
int getNumDevices() { return getExclusiveDevicesAccessor()->size(); }

@@ -196,9 +178,6 @@ struct PluginManager {
void addRequirements(int64_t Flags) { Requirements.addRequirements(Flags); }

private:
bool RTLsLoaded = false;
llvm::SmallVector<__tgt_bin_desc *> DelayedBinDesc;

// List of all plugin adaptors, in use or not.
llvm::SmallVector<std::unique_ptr<PluginAdaptorTy>> PluginAdaptors;

25 changes: 22 additions & 3 deletions openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -16,6 +16,10 @@

#include "Shared/Utils.h"

#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Support/Error.h"

#include <cstdint>
#include <cstring>

using namespace llvm;
@@ -53,9 +57,15 @@ Error GenericGlobalHandlerTy::getGlobalMetadataFromELF(
const ELF64LE::Shdr &Section, GlobalTy &ImageGlobal) {

// The global's address is computed as the image begin + the ELF section
// offset + the ELF symbol value.
ImageGlobal.setPtr(advanceVoidPtr(
Image.getStart(), Section.sh_offset - Section.sh_addr + Symbol.st_value));
// offset + the ELF symbol value except for NOBITS sections that, as the name
// suggests, have no bits in the image. We still record the size and use
// nullptr to indicate there is no location.
if (Section.sh_type == ELF::SHT_NOBITS)
ImageGlobal.setPtr(nullptr);
else
ImageGlobal.setPtr(
advanceVoidPtr(Image.getStart(),
Section.sh_offset - Section.sh_addr + Symbol.st_value));

// Set the global's size.
ImageGlobal.setSize(Symbol.st_size);
@@ -170,12 +180,21 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device,
"%u bytes in the ELF image but %u bytes on the host",
HostGlobal.getName().data(), ImageGlobal.getSize(),
HostGlobal.getSize());
if (ImageGlobal.getPtr() == nullptr)
return Plugin::error("Transfer impossible because global symbol '%s' has "
"no representation in the image (NOBITS sections)",
HostGlobal.getName().data());

DP("Global symbol '%s' was found in the ELF image and %u bytes will copied "
"from %p to %p.\n",
HostGlobal.getName().data(), HostGlobal.getSize(), ImageGlobal.getPtr(),
HostGlobal.getPtr());

assert(Image.getStart() <= ImageGlobal.getPtr() &&
advanceVoidPtr(ImageGlobal.getPtr(), ImageGlobal.getSize()) <
advanceVoidPtr(Image.getStart(), Image.getSize()) &&
"Attempting to read outside the image!");

// Perform the copy from the image to the host memory.
std::memcpy(HostGlobal.getPtr(), ImageGlobal.getPtr(), HostGlobal.getSize());

21 changes: 13 additions & 8 deletions openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -785,9 +785,14 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
for (auto *Image : LoadedImages) {
DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
if (!GHandler.isSymbolInImage(*this, *Image,
"__omp_rtl_device_memory_pool_tracker"))
GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
sizeof(DeviceMemoryPoolTrackingTy),
&ImageDeviceMemoryPoolTracking);
if (auto Err =
GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
consumeError(std::move(Err));
continue;
}
DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
}

@@ -968,16 +973,16 @@ Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
}

// Create the metainfo of the device environment global.
GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
sizeof(DeviceMemoryPoolTrackingTy),
&DeviceMemoryPoolTracking);
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
if (auto Err = GHandler.readGlobalFromImage(*this, Image, TrackerGlobal)) {
[[maybe_unused]] std::string ErrStr = toString(std::move(Err));
DP("Avoid the memory pool: %s.\n", ErrStr.c_str());
if (!GHandler.isSymbolInImage(*this, Image,
"__omp_rtl_device_memory_pool_tracker")) {
DP("Skip the memory pool as there is no tracker symbol in the image.");
return Error::success();
}

GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
sizeof(DeviceMemoryPoolTrackingTy),
&DeviceMemoryPoolTracking);
if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
return Err;

21 changes: 21 additions & 0 deletions openmp/libomptarget/src/CMakeLists.txt
@@ -55,6 +55,27 @@ target_compile_definitions(omptarget PRIVATE
DEBUG_PREFIX="omptarget"
)

macro(check_plugin_target target)
if (TARGET omptarget.rtl.${target})
list(APPEND LIBOMPTARGET_PLUGINS_TO_LOAD ${target})
endif()
endmacro()

set(LIBOMPTARGET_PLUGINS_TO_LOAD "" CACHE STRING
"Comma separated list of plugin names to look for at runtime")
if (NOT LIBOMPTARGET_PLUGINS_TO_LOAD)
check_plugin_target(ppc64)
check_plugin_target(x86_64)
check_plugin_target(cuda)
check_plugin_target(aarch64)
check_plugin_target(amdgpu)
endif()

list(TRANSFORM LIBOMPTARGET_PLUGINS_TO_LOAD PREPEND "\"libomptarget.rtl.")
list(TRANSFORM LIBOMPTARGET_PLUGINS_TO_LOAD APPEND "\"")
list(JOIN LIBOMPTARGET_PLUGINS_TO_LOAD "," ENABLED_OFFLOAD_PLUGINS)
target_compile_definitions(omptarget PRIVATE ENABLED_OFFLOAD_PLUGINS=${ENABLED_OFFLOAD_PLUGINS})

# libomptarget.so needs to be aware of where the plugins live as they
# are now separated in the build directory.
set_target_properties(omptarget PROPERTIES
10 changes: 2 additions & 8 deletions openmp/libomptarget/src/PluginManager.cpp
@@ -23,13 +23,7 @@ using namespace llvm::sys;
PluginManager *PM;

// List of all plugins that can support offloading.
static const char *RTLNames[] = {
/* PowerPC target */ "libomptarget.rtl.ppc64",
/* x86_64 target */ "libomptarget.rtl.x86_64",
/* CUDA target */ "libomptarget.rtl.cuda",
/* AArch64 target */ "libomptarget.rtl.aarch64",
/* AMDGPU target */ "libomptarget.rtl.amdgpu",
};
static const char *RTLNames[] = {ENABLED_OFFLOAD_PLUGINS};

Expected<std::unique_ptr<PluginAdaptorTy>>
PluginAdaptorTy::create(const std::string &Name) {
@@ -67,7 +61,7 @@ Error PluginAdaptorTy::init() {
return createStringError(inconvertibleErrorCode(), \
"Invalid plugin as necessary interface function " \
"(%s) was not found.\n", \
NAME); \
std::string(#NAME).c_str()); \
}

#include "Shared/PluginAPI.inc"
3 changes: 0 additions & 3 deletions openmp/libomptarget/src/interface.cpp
@@ -46,9 +46,6 @@ EXTERN void __tgt_register_requires(int64_t Flags) {
/// adds a target shared library to the target execution image
EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
TIMESCOPE();
if (PM->delayRegisterLib(Desc))
return;

PM->registerLib(Desc);
}

1 change: 0 additions & 1 deletion openmp/libomptarget/src/rtl.cpp
@@ -51,7 +51,6 @@ __attribute__((constructor(101))) void init() {
PM->init();

Profiler::get();
PM->registerDelayedLibraries();
}

__attribute__((destructor(101))) void deinit() {
1 change: 1 addition & 0 deletions openmp/libomptarget/test/Inputs/empty.c
@@ -0,0 +1 @@

2 changes: 0 additions & 2 deletions openmp/libomptarget/test/offloading/barrier_fence.c
@@ -7,8 +7,6 @@
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
// UNSUPPORTED: x86_64-pc-linux-gnu
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-LTO

#include <omp.h>
#include <stdio.h>
6 changes: 6 additions & 0 deletions openmp/libomptarget/test/offloading/bug60119.c
@@ -0,0 +1,6 @@
// RUN: %clang-generic -fPIC -shared %S/../Inputs/empty.c -o %T/liba.so
// RUN: %clang-generic -fPIC -shared %S/../Inputs/empty.c -o %T/libb.so
// RUN: %clang-generic -rpath %T -L %T -l a -l b %s -o %t
// RUN: %t

int main() {}
13 changes: 13 additions & 0 deletions openmp/libomptarget/test/offloading/bug74582.c
@@ -0,0 +1,13 @@
// RUN: %libomptarget-compile-generic && %libomptarget-run-generic
// RUN: %libomptarget-compileopt-generic && %libomptarget-run-generic

// Verify we do not read bits in the image that are not there (nobits section).

#pragma omp begin declare target
char BigUninitializedBuffer[4096 * 64] __attribute__((loader_uninitialized));
#pragma omp end declare target

int main() {
#pragma omp target
{}
}
4 changes: 2 additions & 2 deletions openmp/libomptarget/test/offloading/shared_lib_fp_mapping.c
@@ -1,6 +1,6 @@
// clang-format off
// RUN: %clang-generic -fPIC -shared %S/../Inputs/declare_indirect_func.c -o %T/liba.so -fopenmp-version=51
// RUN: %libomptarget-compile-generic -rpath %T -L %T -l a -o %t -fopenmp-version=51
// RUN: %clang-generic -fPIC -shared %S/../Inputs/declare_indirect_func.c -o %T/libslfm.so -fopenmp-version=51
// RUN: %libomptarget-compile-generic -rpath %T -L %T -l slfm -o %t -fopenmp-version=51
// RUN: env LIBOMPTARGET_INFO=32 %t 2>&1 | %fcheck-generic
// clang-format on

Expand Down