78 changes: 74 additions & 4 deletions clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -1496,12 +1496,82 @@ TEST_F(TokenAnnotatorTest, RequiresDoesNotChangeParsingOfTheRest) {

TEST_F(TokenAnnotatorTest, UnderstandsAsm) {
auto Tokens = annotate("__asm{\n"
"a:\n"
"};");
ASSERT_EQ(Tokens.size(), 7u) << Tokens;
"\"a\":\n"
": x\n"
":};");
ASSERT_EQ(Tokens.size(), 10u) << Tokens;
EXPECT_TOKEN(Tokens[0], tok::kw_asm, TT_Unknown);
EXPECT_TOKEN(Tokens[1], tok::l_brace, TT_InlineASMBrace);
EXPECT_TOKEN(Tokens[4], tok::r_brace, TT_InlineASMBrace);
EXPECT_TOKEN(Tokens[3], tok::colon, TT_InlineASMColon);
EXPECT_TOKEN(Tokens[4], tok::colon, TT_InlineASMColon);
EXPECT_TOKEN(Tokens[6], tok::colon, TT_InlineASMColon);
EXPECT_TOKEN(Tokens[7], tok::r_brace, TT_InlineASMBrace);

Tokens = annotate("__asm(\n"
"\"a\":\n"
": x\n"
":);");
ASSERT_EQ(Tokens.size(), 10u) << Tokens;
EXPECT_TOKEN(Tokens[0], tok::kw_asm, TT_Unknown);
EXPECT_TOKEN(Tokens[3], tok::colon, TT_InlineASMColon);
EXPECT_TOKEN(Tokens[4], tok::colon, TT_InlineASMColon);
EXPECT_TOKEN(Tokens[6], tok::colon, TT_InlineASMColon);

Tokens = annotate("asm volatile (\n"
"\"a_label:\"\n"
":\n"
": x\n"
":);");
ASSERT_EQ(Tokens.size(), 11u) << Tokens;
EXPECT_TOKEN(Tokens[0], tok::kw_asm, TT_Unknown);
EXPECT_TOKEN(Tokens[4], tok::colon, TT_InlineASMColon);
EXPECT_TOKEN(Tokens[5], tok::colon, TT_InlineASMColon);
EXPECT_TOKEN(Tokens[7], tok::colon, TT_InlineASMColon);

Tokens = annotate("__asm__(\n"
"\"a_label:\"\n"
": x\n"
":\n"
": y);");
ASSERT_EQ(Tokens.size(), 11u) << Tokens;
EXPECT_TOKEN(Tokens[0], tok::kw_asm, TT_Unknown);
EXPECT_TOKEN(Tokens[3], tok::colon, TT_InlineASMColon);
EXPECT_TOKEN(Tokens[5], tok::colon, TT_InlineASMColon);
EXPECT_TOKEN(Tokens[6], tok::colon, TT_InlineASMColon);

Tokens = annotate("__asm volatile (\n"
"\"a_label:\"\n"
"\"a b c(%%x)\"\n"
":\n"
": x\n"
":);");
ASSERT_EQ(Tokens.size(), 12u) << Tokens;
EXPECT_TOKEN(Tokens[0], tok::kw_asm, TT_Unknown);
EXPECT_TOKEN(Tokens[5], tok::colon, TT_InlineASMColon);
EXPECT_TOKEN(Tokens[6], tok::colon, TT_InlineASMColon);
EXPECT_TOKEN(Tokens[8], tok::colon, TT_InlineASMColon);

Tokens = annotate("asm(\n"
"\"insn\"\n"
": \"=r\" (var1), \"=&r\" (value)\n"
":\n"
": \"memory\");");
ASSERT_EQ(Tokens.size(), 19u) << Tokens;
EXPECT_TOKEN(Tokens[0], tok::kw_asm, TT_Unknown);
EXPECT_TOKEN(Tokens[3], tok::colon, TT_InlineASMColon);
EXPECT_TOKEN(Tokens[13], tok::colon, TT_InlineASMColon);
EXPECT_TOKEN(Tokens[14], tok::colon, TT_InlineASMColon);

Tokens = annotate("__asm__ volatile (\n"
"\"ldr r1, [r0, %%[sym]]\"\n"
":\n"
": [sym] \"J\" (aaaaa(aaaa, aaaa))\n"
");");
ASSERT_EQ(Tokens.size(), 21u) << Tokens;
EXPECT_TOKEN(Tokens[0], tok::kw_asm, TT_Unknown);
EXPECT_TOKEN(Tokens[4], tok::colon, TT_InlineASMColon);
EXPECT_TOKEN(Tokens[5], tok::colon, TT_InlineASMColon);
EXPECT_TOKEN(Tokens[6], tok::l_square, TT_InlineASMSymbolicNameLSquare);
}

TEST_F(TokenAnnotatorTest, UnderstandsObjCBlock) {
19 changes: 19 additions & 0 deletions compiler-rt/test/dfsan/sscanf.c
@@ -0,0 +1,19 @@
// RUN: %clang_dfsan %s -o %t && %run %t
// XFAIL: *

#include <assert.h>
#include <stdio.h>

int main(int argc, char *argv[]) {
char buf[256] = "10000000000-100000000000 rw-p 00000000 00:00 0";
long rss = 0;
// This test exposes a bug in DFSan's sscanf that leads to flakiness
// in release_shadow_space.c (see
// https://github.com/llvm/llvm-project/issues/91287)
if (sscanf(buf, "Garbage text before, %ld, Garbage text after", &rss) == 1) {
printf("Error: matched %ld\n", rss);
return 1;
}

return 0;
}
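
For context: sscanf returns the number of conversions performed, and literal text in the format must match the input before any conversion counts, so the call above must return 0 rather than 1. A minimal standalone sketch of the conforming behavior (hypothetical example, separate from the test):

#include <cassert>
#include <cstdio>

int main() {
  long rss = 0;
  // "Garbage" does not match the input prefix, so no conversion happens
  // and a conforming sscanf returns 0.
  int n = std::sscanf("10000000000-100000000000 rw-p", "Garbage %ld", &rss);
  assert(n == 0);
  return 0;
}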
8 changes: 6 additions & 2 deletions libcxx/.clang-tidy
@@ -57,8 +57,12 @@ CheckOptions:
value: _
- key: readability-identifier-naming.TemplateParameterIgnoredRegexp
value: (.*\:auto|expr-type) # This is https://llvm.org/PR56464
- key: readability-identifier-naming.ValueTemplateParameterIgnoredRegexp # TODO: enforce naming of variable parameters
value: .*
- key: readability-identifier-naming.ValueTemplateParameterCase
value: CamelCase
- key: readability-identifier-naming.ValueTemplateParameterPrefix
value: _
- key: readability-identifier-naming.ValueTemplateParameterIgnoredRegexp
value: (__[a-z_]|_[A-Z]).* # TODO: Converge on a single style for value template parameters
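
For reference, the interim regex admits both spellings currently found in libc++; an illustrative sketch (hypothetical declarations):

#include <cstddef>

template <bool __use_fast_path> // matches the (__[a-z_]).* alternative
struct __s1;
template <std::size_t _Np>      // matches the (_[A-Z]).* alternative
struct __s2;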

# TODO: investigate these checks
# bugprone-branch-clone,
4 changes: 2 additions & 2 deletions libcxx/include/__functional/bind.h
@@ -95,7 +95,7 @@ __mu(_Ti& __ti, tuple<_Uj...>& __uj) {
return std::__mu_expand(__ti, __uj, __indices());
}

template <bool IsPh, class _Ti, class _Uj>
template <bool _IsPh, class _Ti, class _Uj>
struct __mu_return2 {};

template <class _Ti, class _Uj>
@@ -120,7 +120,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Ti& __mu(_Ti& __ti,
return __ti;
}

template <class _Ti, bool IsReferenceWrapper, bool IsBindEx, bool IsPh, class _TupleUj>
template <class _Ti, bool _IsReferenceWrapper, bool _IsBindEx, bool _IsPh, class _TupleUj>
struct __mu_return_impl;

template <bool _Invokable, class _Ti, class... _Uj>
6 changes: 3 additions & 3 deletions libcxx/include/__mdspan/extents.h
@@ -137,9 +137,9 @@ struct __maybe_static_array {
// static mapping of indices to the position in the dynamic values array
using _DynamicIdxMap = __static_partial_sums<static_cast<size_t>(_Values == _DynTag)...>;

template <size_t... Indices>
_LIBCPP_HIDE_FROM_ABI static constexpr _DynamicValues __zeros(index_sequence<Indices...>) noexcept {
return _DynamicValues{((void)Indices, 0)...};
template <size_t... _Indices>
_LIBCPP_HIDE_FROM_ABI static constexpr _DynamicValues __zeros(index_sequence<_Indices...>) noexcept {
return _DynamicValues{((void)_Indices, 0)...};
}

public:
5 changes: 2 additions & 3 deletions lld/ELF/Driver.cpp
@@ -1384,9 +1384,8 @@ static void readConfigs(opt::InputArgList &args) {
config->relaxGP = args.hasFlag(OPT_relax_gp, OPT_no_relax_gp, false);
config->rpath = getRpath(args);
config->relocatable = args.hasArg(OPT_relocatable);
config->resolveGroups = !args.hasArg(OPT_relocatable) ||
args.hasFlag(OPT_force_group_allocation,
OPT_inhibit_group_allocation, false);
config->resolveGroups =
!args.hasArg(OPT_relocatable) || args.hasArg(OPT_force_group_allocation);

if (args.hasArg(OPT_save_temps)) {
// --save-temps implies saving all temps.
5 changes: 2 additions & 3 deletions lld/ELF/InputSection.cpp
@@ -77,9 +77,8 @@ InputSectionBase::InputSectionBase(InputFile *file, uint64_t flags,
}

// SHF_INFO_LINK and SHF_GROUP are normally resolved and not copied to the
// output section. However, for relocatable linking with the default
// --inhibit-group-allocation, the SHF_GROUP marker and section groups are
// retained.
// output section. However, for relocatable linking without
// --force-group-allocation, the SHF_GROUP flag and section groups are retained.
static uint64_t getFlags(uint64_t flags) {
flags &= ~(uint64_t)SHF_INFO_LINK;
if (config->resolveGroups)
4 changes: 1 addition & 3 deletions lld/ELF/Options.td
@@ -255,9 +255,7 @@ def fix_cortex_a8: F<"fix-cortex-a8">,
HelpText<"Apply fixes for ARM Cortex-A8 erratum 657417">;

def force_group_allocation: FF<"force-group-allocation">,
HelpText<"Only meaningful for -r. Section groups are discarded. If two section group members are combined into the same output section, combine their relocations as well">;
def inhibit_group_allocation: FF<"inhibit-group-allocation">,
HelpText<"This is the default for -r. Section groups are retained. Section group members' relocations are not combined">;
HelpText<"Only meaningful for -r. Section groups are discarded. If two section group members are placed in the same output section, combine their relocations as well">;

defm format: Eq<"format", "Change the input format of the inputs following this option">,
MetaVarName<"[default,elf,binary]">;
23 changes: 11 additions & 12 deletions lld/ELF/Writer.cpp
@@ -783,13 +783,11 @@ template <class ELFT> void Writer<ELFT>::addRelIpltSymbols() {
// __rela_iplt_{start,end} are initially defined relative to dummy section 0.
// We'll override Out::elfHeader with relaDyn later when we are sure that
// .rela.dyn will be present in the output.
ElfSym::relaIpltStart = addOptionalRegular(
config->isRela ? "__rela_iplt_start" : "__rel_iplt_start",
Out::elfHeader, 0, STV_HIDDEN);

ElfSym::relaIpltEnd = addOptionalRegular(
config->isRela ? "__rela_iplt_end" : "__rel_iplt_end",
Out::elfHeader, 0, STV_HIDDEN);
std::string name = config->isRela ? "__rela_iplt_start" : "__rel_iplt_start";
ElfSym::relaIpltStart =
addOptionalRegular(name, Out::elfHeader, 0, STV_HIDDEN);
name.replace(name.size() - 5, 5, "end");
ElfSym::relaIpltEnd = addOptionalRegular(name, Out::elfHeader, 0, STV_HIDDEN);
}
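
The replace call relies on both spellings ending in the five characters "start"; a standalone check of the string manipulation (a sketch, separate from lld):

#include <cassert>
#include <string>

int main() {
  std::string name = "__rela_iplt_start";
  // Overwrite the trailing "start" (5 chars) with "end".
  name.replace(name.size() - 5, 5, "end");
  assert(name == "__rela_iplt_end");
  return 0;
}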

// This function generates assignments for predefined symbols (e.g. _end or
@@ -2487,11 +2485,12 @@ template <class ELFT> void Writer<ELFT>::assignFileOffsets() {
lastRX->lastSec == sec)
off = alignToPowerOf2(off, config->maxPageSize);
}
for (OutputSection *osec : outputSections)
if (!(osec->flags & SHF_ALLOC)) {
osec->offset = alignToPowerOf2(off, osec->addralign);
off = osec->offset + osec->size;
}
for (OutputSection *osec : outputSections) {
if (osec->flags & SHF_ALLOC)
continue;
osec->offset = alignToPowerOf2(off, osec->addralign);
off = osec->offset + osec->size;
}

sectionHeaderOff = alignToPowerOf2(off, config->wordsize);
fileSize = sectionHeaderOff + (outputSections.size() + 1) * sizeof(Elf_Shdr);
3 changes: 3 additions & 0 deletions lld/docs/ReleaseNotes.rst
@@ -44,6 +44,9 @@ ELF Improvements
(typical for embedded). It also makes full LTO feasible in such cases, since
IR merging currently prevents the linker script from referring to input
files. (`#90007 <https://github.com/llvm/llvm-project/pull/90007>`_)
* ``--force-group-allocation`` is implemented to discard ``SHT_GROUP`` sections
and combine relocation sections if their relocated section group members are
placed in the same output section.

Breaking changes
----------------
4 changes: 1 addition & 3 deletions lld/docs/ld.lld.1
@@ -279,9 +279,7 @@ field to the specified value.
.It Fl -fini Ns = Ns Ar symbol
Specify a finalizer function.
.It Fl -force-group-allocation
Only meaningful for -r. Section groups are discarded. If two section group members are combined into the same output section, combine their relocations as well.
.It Fl -inhibit-group-allocation
This is the default for -r. Section groups are retained. Section group members' relocations are not combined.
Only meaningful for -r. Section groups are discarded. If two section group members are placed in the same output section, combine their relocations as well.
.It Fl -format Ns = Ns Ar input-format , Fl b Ar input-format
Specify the format of the inputs following this option.
.Ar input-format
1 change: 1 addition & 0 deletions lld/test/ELF/comdat.s
@@ -4,6 +4,7 @@
// RUN: ld.lld -shared %t.o %t2.o -o %t
// RUN: llvm-objdump -d %t | FileCheck %s
// RUN: llvm-readelf -S -s %t | FileCheck --check-prefix=READ %s
// RUN: ld.lld -shared --force-group-allocation %t.o %t2.o -o - | cmp - %t

// Check that we don't crash with --gc-section and that we print a list of
// reclaimed sections on stderr.
2 changes: 0 additions & 2 deletions lld/test/ELF/relocatable-comdat.s
@@ -50,8 +50,6 @@
## If --force-group-allocation is specified, discard .group and combine .rela.* if their relocated sections are combined.
# RUN: ld.lld -r -T combine.lds a.o a.o --force-group-allocation -o combine-a.ro
# RUN: llvm-readelf -g -S combine-a.ro | FileCheck %s --check-prefix=COMBINE-A
## --inhibit-group-allocation restores the default behavior.
# RUN: ld.lld -r -T combine.lds a.o a.o --force-group-allocation --inhibit-group-allocation -o - | cmp - combine.ro

# COMBINE-A: Name Type Address Off Size ES Flg Lk Inf Al
# COMBINE-A: .rodata PROGBITS 0000000000000000 {{.*}} 000002 00 A 0 0 1
9 changes: 6 additions & 3 deletions lldb/include/lldb/Core/ValueObject.h
@@ -959,9 +959,12 @@ class ValueObject {
/// Should only be called by ValueObject::GetChildAtIndex().
///
/// \return A ValueObject managed by this ValueObject's manager.
virtual ValueObject *CreateChildAtIndex(size_t idx,
bool synthetic_array_member,
int32_t synthetic_index);
virtual ValueObject *CreateChildAtIndex(size_t idx);

/// Should only be called by ValueObject::GetSyntheticArrayMember().
///
/// \return A ValueObject managed by this ValueObject's manager.
virtual ValueObject *CreateSyntheticArrayMember(size_t idx);

/// Should only be called by ValueObject::GetNumChildren().
virtual llvm::Expected<uint32_t>
10 changes: 7 additions & 3 deletions lldb/include/lldb/Core/ValueObjectConstResult.h
@@ -79,9 +79,6 @@ class ValueObjectConstResult : public ValueObject {

lldb::ValueObjectSP Dereference(Status &error) override;

ValueObject *CreateChildAtIndex(size_t idx, bool synthetic_array_member,
int32_t synthetic_index) override;

lldb::ValueObjectSP GetSyntheticChildAtOffset(
uint32_t offset, const CompilerType &type, bool can_create,
ConstString name_const_str = ConstString()) override;
@@ -151,6 +148,13 @@
ValueObjectConstResult(ExecutionContextScope *exe_scope,
ValueObjectManager &manager, const Status &error);

ValueObject *CreateChildAtIndex(size_t idx) override {
return m_impl.CreateChildAtIndex(idx);
}
ValueObject *CreateSyntheticArrayMember(size_t idx) override {
return m_impl.CreateSyntheticArrayMember(idx);
}

ValueObjectConstResult(const ValueObjectConstResult &) = delete;
const ValueObjectConstResult &
operator=(const ValueObjectConstResult &) = delete;
10 changes: 7 additions & 3 deletions lldb/include/lldb/Core/ValueObjectConstResultCast.h
@@ -35,9 +35,6 @@ class ValueObjectConstResultCast : public ValueObjectCast {

lldb::ValueObjectSP Dereference(Status &error) override;

ValueObject *CreateChildAtIndex(size_t idx, bool synthetic_array_member,
int32_t synthetic_index) override;

virtual CompilerType GetCompilerType() {
return ValueObjectCast::GetCompilerType();
}
@@ -61,6 +58,13 @@
friend class ValueObjectConstResult;
friend class ValueObjectConstResultImpl;

ValueObject *CreateChildAtIndex(size_t idx) override {
return m_impl.CreateChildAtIndex(idx);
}
ValueObject *CreateSyntheticArrayMember(size_t idx) override {
return m_impl.CreateSyntheticArrayMember(idx);
}

ValueObjectConstResultCast(const ValueObjectConstResultCast &) = delete;
const ValueObjectConstResultCast &
operator=(const ValueObjectConstResultCast &) = delete;
10 changes: 7 additions & 3 deletions lldb/include/lldb/Core/ValueObjectConstResultChild.h
@@ -41,9 +41,6 @@ class ValueObjectConstResultChild : public ValueObjectChild {

lldb::ValueObjectSP Dereference(Status &error) override;

ValueObject *CreateChildAtIndex(size_t idx, bool synthetic_array_member,
int32_t synthetic_index) override;

virtual CompilerType GetCompilerType() {
return ValueObjectChild::GetCompilerType();
}
@@ -70,6 +67,13 @@
friend class ValueObjectConstResult;
friend class ValueObjectConstResultImpl;

ValueObject *CreateChildAtIndex(size_t idx) override {
return m_impl.CreateChildAtIndex(idx);
}
ValueObject *CreateSyntheticArrayMember(size_t idx) override {
return m_impl.CreateSyntheticArrayMember(idx);
}

ValueObjectConstResultChild(const ValueObjectConstResultChild &) = delete;
const ValueObjectConstResultChild &
operator=(const ValueObjectConstResultChild &) = delete;
4 changes: 2 additions & 2 deletions lldb/include/lldb/Core/ValueObjectConstResultImpl.h
@@ -38,8 +38,8 @@ class ValueObjectConstResultImpl {

lldb::ValueObjectSP Dereference(Status &error);

ValueObject *CreateChildAtIndex(size_t idx, bool synthetic_array_member,
int32_t synthetic_index);
ValueObject *CreateChildAtIndex(size_t idx);
ValueObject *CreateSyntheticArrayMember(size_t idx);

lldb::ValueObjectSP
GetSyntheticChildAtOffset(uint32_t offset, const CompilerType &type,
8 changes: 5 additions & 3 deletions lldb/include/lldb/Core/ValueObjectRegister.h
@@ -49,9 +49,6 @@ class ValueObjectRegisterSet : public ValueObject {

llvm::Expected<uint32_t> CalculateNumChildren(uint32_t max) override;

ValueObject *CreateChildAtIndex(size_t idx, bool synthetic_array_member,
int32_t synthetic_index) override;

lldb::ValueObjectSP GetChildMemberWithName(llvm::StringRef name,
bool can_create = true) override;

@@ -73,6 +70,11 @@
ValueObjectRegisterSet(ExecutionContextScope *exe_scope,
ValueObjectManager &manager,
lldb::RegisterContextSP &reg_ctx_sp, uint32_t set_idx);

ValueObject *CreateChildAtIndex(size_t idx) override;
ValueObject *CreateSyntheticArrayMember(size_t idx) override {
return nullptr;
}

// For ValueObject only
ValueObjectRegisterSet(const ValueObjectRegisterSet &) = delete;
const ValueObjectRegisterSet &
8 changes: 5 additions & 3 deletions lldb/include/lldb/Core/ValueObjectVTable.h
@@ -66,9 +66,6 @@ class ValueObjectVTable : public ValueObject {

llvm::Expected<uint32_t> CalculateNumChildren(uint32_t max) override;

ValueObject *CreateChildAtIndex(size_t idx, bool synthetic_array_member,
int32_t synthetic_index) override;

lldb::ValueType GetValueType() const override;

ConstString GetTypeName() override;
@@ -95,6 +92,11 @@
private:
ValueObjectVTable(ValueObject &parent);

ValueObject *CreateChildAtIndex(size_t idx) override;
ValueObject *CreateSyntheticArrayMember(size_t idx) override {
return nullptr;
}

// For ValueObject only
ValueObjectVTable(const ValueObjectVTable &) = delete;
const ValueObjectVTable &operator=(const ValueObjectVTable &) = delete;
91 changes: 55 additions & 36 deletions lldb/source/Core/ValueObject.cpp
@@ -382,7 +382,7 @@ ValueObjectSP ValueObject::GetChildAtIndex(uint32_t idx, bool can_create) {
if (can_create && !m_children.HasChildAtIndex(idx)) {
// No we haven't created the child at this index, so lets have our
// subclass do it and cache the result for quick future access.
m_children.SetChildAtIndex(idx, CreateChildAtIndex(idx, false, 0));
m_children.SetChildAtIndex(idx, CreateChildAtIndex(idx));
}

ValueObject *child = m_children.GetChildAtIndex(idx);
@@ -488,66 +488,85 @@ void ValueObject::SetNumChildren(uint32_t num_children) {
m_children.SetChildrenCount(num_children);
}

ValueObject *ValueObject::CreateChildAtIndex(size_t idx,
bool synthetic_array_member,
int32_t synthetic_index) {
ValueObject *valobj = nullptr;

ValueObject *ValueObject::CreateChildAtIndex(size_t idx) {
bool omit_empty_base_classes = true;
bool ignore_array_bounds = synthetic_array_member;
std::string child_name_str;
bool ignore_array_bounds = false;
std::string child_name;
uint32_t child_byte_size = 0;
int32_t child_byte_offset = 0;
uint32_t child_bitfield_bit_size = 0;
uint32_t child_bitfield_bit_offset = 0;
bool child_is_base_class = false;
bool child_is_deref_of_parent = false;
uint64_t language_flags = 0;

const bool transparent_pointers = !synthetic_array_member;
const bool transparent_pointers = true;

ExecutionContext exe_ctx(GetExecutionContextRef());

auto child_compiler_type_or_err =
GetCompilerType().GetChildCompilerTypeAtIndex(
&exe_ctx, idx, transparent_pointers, omit_empty_base_classes,
ignore_array_bounds, child_name_str, child_byte_size,
child_byte_offset, child_bitfield_bit_size, child_bitfield_bit_offset,
ignore_array_bounds, child_name, child_byte_size, child_byte_offset,
child_bitfield_bit_size, child_bitfield_bit_offset,
child_is_base_class, child_is_deref_of_parent, this, language_flags);
CompilerType child_compiler_type;
if (!child_compiler_type_or_err)
if (!child_compiler_type_or_err || !child_compiler_type_or_err->IsValid()) {
LLDB_LOG_ERROR(GetLog(LLDBLog::Types),
child_compiler_type_or_err.takeError(),
"could not find child: {0}");
else
child_compiler_type = *child_compiler_type_or_err;
return nullptr;
}

return new ValueObjectChild(
*this, *child_compiler_type_or_err, ConstString(child_name),
child_byte_size, child_byte_offset, child_bitfield_bit_size,
child_bitfield_bit_offset, child_is_base_class, child_is_deref_of_parent,
eAddressTypeInvalid, language_flags);
}

ValueObject *ValueObject::CreateSyntheticArrayMember(size_t idx) {
bool omit_empty_base_classes = true;
bool ignore_array_bounds = true;
std::string child_name;
uint32_t child_byte_size = 0;
int32_t child_byte_offset = 0;
uint32_t child_bitfield_bit_size = 0;
uint32_t child_bitfield_bit_offset = 0;
bool child_is_base_class = false;
bool child_is_deref_of_parent = false;
uint64_t language_flags = 0;
const bool transparent_pointers = false;

if (child_compiler_type) {
if (synthetic_index)
child_byte_offset += child_byte_size * synthetic_index;
ExecutionContext exe_ctx(GetExecutionContextRef());

ConstString child_name;
if (!child_name_str.empty())
child_name.SetCString(child_name_str.c_str());
auto child_compiler_type_or_err =
GetCompilerType().GetChildCompilerTypeAtIndex(
&exe_ctx, 0, transparent_pointers, omit_empty_base_classes,
ignore_array_bounds, child_name, child_byte_size, child_byte_offset,
child_bitfield_bit_size, child_bitfield_bit_offset,
child_is_base_class, child_is_deref_of_parent, this, language_flags);
if (!child_compiler_type_or_err) {
LLDB_LOG_ERROR(GetLog(LLDBLog::Types),
child_compiler_type_or_err.takeError(),
"could not find child: {0}");
return nullptr;
}

if (child_compiler_type_or_err->IsValid()) {
child_byte_offset += child_byte_size * idx;

valobj = new ValueObjectChild(
*this, child_compiler_type, child_name, child_byte_size,
child_byte_offset, child_bitfield_bit_size, child_bitfield_bit_offset,
child_is_base_class, child_is_deref_of_parent, eAddressTypeInvalid,
language_flags);
return new ValueObjectChild(
*this, *child_compiler_type_or_err, ConstString(child_name),
child_byte_size, child_byte_offset, child_bitfield_bit_size,
child_bitfield_bit_offset, child_is_base_class,
child_is_deref_of_parent, eAddressTypeInvalid, language_flags);
}

// In case of an incomplete type, try to use the ValueObject's
// synthetic value to create the child ValueObject.
if (!valobj && synthetic_array_member) {
if (ValueObjectSP synth_valobj_sp = GetSyntheticValue()) {
valobj = synth_valobj_sp
->GetChildAtIndex(synthetic_index, synthetic_array_member)
.get();
}
}
if (ValueObjectSP synth_valobj_sp = GetSyntheticValue())
return synth_valobj_sp->GetChildAtIndex(idx, /*can_create=*/true).get();

return valobj;
return nullptr;
}
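
Net effect of the split above: the old (idx, synthetic_array_member, synthetic_index) parameter triple is gone. GetChildAtIndex() now reaches CreateChildAtIndex(idx), while GetSyntheticArrayMember() reaches CreateSyntheticArrayMember(idx); only the synthetic path treats pointers opaquely (transparent_pointers = false), ignores array bounds, and scales the child's byte offset by the requested index.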

bool ValueObject::GetSummaryAsCString(TypeSummaryImpl *summary_ptr,
@@ -1616,7 +1635,7 @@ ValueObjectSP ValueObject::GetSyntheticArrayMember(size_t index,
ValueObject *synthetic_child;
// We haven't made a synthetic array member for INDEX yet, so lets make
// one and cache it for any future reference.
synthetic_child = CreateChildAtIndex(0, true, index);
synthetic_child = CreateSyntheticArrayMember(index);

// Cache the value if we got one back...
if (synthetic_child) {
6 changes: 0 additions & 6 deletions lldb/source/Core/ValueObjectConstResult.cpp
@@ -267,12 +267,6 @@ lldb::addr_t ValueObjectConstResult::GetAddressOf(bool scalar_is_load_address,
return m_impl.GetAddressOf(scalar_is_load_address, address_type);
}

ValueObject *ValueObjectConstResult::CreateChildAtIndex(
size_t idx, bool synthetic_array_member, int32_t synthetic_index) {
return m_impl.CreateChildAtIndex(idx, synthetic_array_member,
synthetic_index);
}

size_t ValueObjectConstResult::GetPointeeData(DataExtractor &data,
uint32_t item_idx,
uint32_t item_count) {
6 changes: 0 additions & 6 deletions lldb/source/Core/ValueObjectConstResultCast.cpp
@@ -44,12 +44,6 @@ lldb::ValueObjectSP ValueObjectConstResultCast::AddressOf(Status &error) {
return m_impl.AddressOf(error);
}

ValueObject *ValueObjectConstResultCast::CreateChildAtIndex(
size_t idx, bool synthetic_array_member, int32_t synthetic_index) {
return m_impl.CreateChildAtIndex(idx, synthetic_array_member,
synthetic_index);
}

size_t ValueObjectConstResultCast::GetPointeeData(DataExtractor &data,
uint32_t item_idx,
uint32_t item_count) {
6 changes: 0 additions & 6 deletions lldb/source/Core/ValueObjectConstResultChild.cpp
@@ -56,12 +56,6 @@ lldb::addr_t ValueObjectConstResultChild::GetAddressOf(
return m_impl.GetAddressOf(scalar_is_load_address, address_type);
}

ValueObject *ValueObjectConstResultChild::CreateChildAtIndex(
size_t idx, bool synthetic_array_member, int32_t synthetic_index) {
return m_impl.CreateChildAtIndex(idx, synthetic_array_member,
synthetic_index);
}

size_t ValueObjectConstResultChild::GetPointeeData(DataExtractor &data,
uint32_t item_idx,
uint32_t item_count) {
115 changes: 78 additions & 37 deletions lldb/source/Core/ValueObjectConstResultImpl.cpp
@@ -46,75 +46,116 @@ lldb::ValueObjectSP ValueObjectConstResultImpl::Dereference(Status &error) {
return m_impl_backend->ValueObject::Dereference(error);
}

ValueObject *ValueObjectConstResultImpl::CreateChildAtIndex(
size_t idx, bool synthetic_array_member, int32_t synthetic_index) {
ValueObject *ValueObjectConstResultImpl::CreateChildAtIndex(size_t idx) {
if (m_impl_backend == nullptr)
return nullptr;

m_impl_backend->UpdateValueIfNeeded(false);

ValueObjectConstResultChild *valobj = nullptr;

bool omit_empty_base_classes = true;
bool ignore_array_bounds = synthetic_array_member;
std::string child_name_str;
bool ignore_array_bounds = false;
std::string child_name;
uint32_t child_byte_size = 0;
int32_t child_byte_offset = 0;
uint32_t child_bitfield_bit_size = 0;
uint32_t child_bitfield_bit_offset = 0;
bool child_is_base_class = false;
bool child_is_deref_of_parent = false;
uint64_t language_flags;

const bool transparent_pointers = !synthetic_array_member;
const bool transparent_pointers = true;
CompilerType compiler_type = m_impl_backend->GetCompilerType();

ExecutionContext exe_ctx(m_impl_backend->GetExecutionContextRef());

auto child_compiler_type_or_err = compiler_type.GetChildCompilerTypeAtIndex(
&exe_ctx, idx, transparent_pointers, omit_empty_base_classes,
ignore_array_bounds, child_name_str, child_byte_size, child_byte_offset,
ignore_array_bounds, child_name, child_byte_size, child_byte_offset,
child_bitfield_bit_size, child_bitfield_bit_offset, child_is_base_class,
child_is_deref_of_parent, m_impl_backend, language_flags);
CompilerType child_compiler_type;
if (!child_compiler_type_or_err)

// One might think we should check that the size of the children
// is always strictly positive, hence we could avoid creating a
// ValueObject if that's not the case, but it turns out there
// are languages out there which allow zero-size types with
// children (e.g. Swift).
if (!child_compiler_type_or_err || !child_compiler_type_or_err->IsValid()) {
LLDB_LOG_ERROR(GetLog(LLDBLog::Types),
child_compiler_type_or_err.takeError(),
"could not find child: {0}");
else
child_compiler_type = *child_compiler_type_or_err;
return nullptr;
}

lldb::addr_t child_live_addr = LLDB_INVALID_ADDRESS;
// Transfer the live address (with offset) to the child. But if
// the parent is a pointer, the live address is where that pointer
// value lives in memory, so the children live addresses aren't
// offsets from that value, they are just other load addresses that
// are recorded in the Value of the child ValueObjects.
if (m_live_address != LLDB_INVALID_ADDRESS && !compiler_type.IsPointerType())
child_live_addr = m_live_address + child_byte_offset;

return new ValueObjectConstResultChild(
*m_impl_backend, *child_compiler_type_or_err, ConstString(child_name),
child_byte_size, child_byte_offset, child_bitfield_bit_size,
child_bitfield_bit_offset, child_is_base_class, child_is_deref_of_parent,
child_live_addr, language_flags);
}

ValueObject *
ValueObjectConstResultImpl::CreateSyntheticArrayMember(size_t idx) {
if (m_impl_backend == nullptr)
return nullptr;

m_impl_backend->UpdateValueIfNeeded(false);

bool omit_empty_base_classes = true;
bool ignore_array_bounds = true;
std::string child_name;
uint32_t child_byte_size = 0;
int32_t child_byte_offset = 0;
uint32_t child_bitfield_bit_size = 0;
uint32_t child_bitfield_bit_offset = 0;
bool child_is_base_class = false;
bool child_is_deref_of_parent = false;
uint64_t language_flags;

const bool transparent_pointers = false;
CompilerType compiler_type = m_impl_backend->GetCompilerType();

ExecutionContext exe_ctx(m_impl_backend->GetExecutionContextRef());

auto child_compiler_type_or_err = compiler_type.GetChildCompilerTypeAtIndex(
&exe_ctx, 0, transparent_pointers, omit_empty_base_classes,
ignore_array_bounds, child_name, child_byte_size, child_byte_offset,
child_bitfield_bit_size, child_bitfield_bit_offset, child_is_base_class,
child_is_deref_of_parent, m_impl_backend, language_flags);
// One might think we should check that the size of the children
// is always strictly positive, hence we could avoid creating a
// ValueObject if that's not the case, but it turns out there
// are languages out there which allow zero-size types with
// children (e.g. Swift).
if (child_compiler_type) {
if (synthetic_index)
child_byte_offset += child_byte_size * synthetic_index;

ConstString child_name;
if (!child_name_str.empty())
child_name.SetCString(child_name_str.c_str());

lldb::addr_t child_live_addr = LLDB_INVALID_ADDRESS;
// Transfer the live address (with offset) to the child. But if
// the parent is a pointer, the live address is where that pointer
// value lives in memory, so the children live addresses aren't
// offsets from that value, they are just other load addresses that
// are recorded in the Value of the child ValueObjects.
if (m_live_address != LLDB_INVALID_ADDRESS) {
if (!compiler_type.IsPointerType())
child_live_addr = m_live_address + child_byte_offset;
}
valobj = new ValueObjectConstResultChild(
*m_impl_backend, child_compiler_type, child_name, child_byte_size,
child_byte_offset, child_bitfield_bit_size, child_bitfield_bit_offset,
child_is_base_class, child_is_deref_of_parent, child_live_addr,
language_flags);
if (!child_compiler_type_or_err || !child_compiler_type_or_err->IsValid()) {
LLDB_LOG_ERROR(GetLog(LLDBLog::Types),
child_compiler_type_or_err.takeError(),
"could not find child: {0}");
return nullptr;
}

return valobj;
child_byte_offset += child_byte_size * idx;

lldb::addr_t child_live_addr = LLDB_INVALID_ADDRESS;
// Transfer the live address (with offset) to the child. But if
// the parent is a pointer, the live address is where that pointer
// value lives in memory, so the children live addresses aren't
// offsets from that value, they are just other load addresses that
// are recorded in the Value of the child ValueObjects.
if (m_live_address != LLDB_INVALID_ADDRESS && !compiler_type.IsPointerType())
child_live_addr = m_live_address + child_byte_offset;
return new ValueObjectConstResultChild(
*m_impl_backend, *child_compiler_type_or_err, ConstString(child_name),
child_byte_size, child_byte_offset, child_bitfield_bit_size,
child_bitfield_bit_offset, child_is_base_class, child_is_deref_of_parent,
child_live_addr, language_flags);
}

lldb::ValueObjectSP ValueObjectConstResultImpl::GetSyntheticChildAtOffset(
14 changes: 5 additions & 9 deletions lldb/source/Core/ValueObjectRegister.cpp
@@ -115,17 +115,13 @@ bool ValueObjectRegisterSet::UpdateValue() {
return m_error.Success();
}

ValueObject *ValueObjectRegisterSet::CreateChildAtIndex(
size_t idx, bool synthetic_array_member, int32_t synthetic_index) {
ValueObject *valobj = nullptr;
ValueObject *ValueObjectRegisterSet::CreateChildAtIndex(size_t idx) {
if (m_reg_ctx_sp && m_reg_set) {
uint32_t num_children = GetNumChildrenIgnoringErrors();
if (idx < num_children)
valobj = new ValueObjectRegister(
*this, m_reg_ctx_sp,
m_reg_ctx_sp->GetRegisterInfoAtIndex(m_reg_set->registers[idx]));
return new ValueObjectRegister(
*this, m_reg_ctx_sp,
m_reg_ctx_sp->GetRegisterInfoAtIndex(m_reg_set->registers[idx]));
}
return valobj;
return nullptr;
}

lldb::ValueObjectSP
6 changes: 1 addition & 5 deletions lldb/source/Core/ValueObjectVTable.cpp
@@ -185,11 +185,7 @@ ConstString ValueObjectVTable::GetDisplayTypeName() {

bool ValueObjectVTable::IsInScope() { return GetParent()->IsInScope(); }

ValueObject *ValueObjectVTable::CreateChildAtIndex(size_t idx,
bool synthetic_array_member,
int32_t synthetic_index) {
if (synthetic_array_member)
return nullptr;
ValueObject *ValueObjectVTable::CreateChildAtIndex(size_t idx) {
return new ValueObjectVTableChild(*this, idx, m_addr_size);
}

4 changes: 1 addition & 3 deletions llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1656,9 +1656,7 @@ class ScalarEvolution {
DenseMap<const SCEV *, ConstantRange> &Cache =
Hint == HINT_RANGE_UNSIGNED ? UnsignedRanges : SignedRanges;

auto Pair = Cache.try_emplace(S, std::move(CR));
if (!Pair.second)
Pair.first->second = std::move(CR);
auto Pair = Cache.insert_or_assign(S, std::move(CR));
return Pair.first->second;
}
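
The one-liner is behaviorally identical to the removed try_emplace-then-assign sequence; a minimal standalone illustration (std::map shown here; llvm::DenseMap provides a matching insert_or_assign member):

#include <cassert>
#include <map>
#include <string>

int main() {
  std::map<int, std::string> cache;
  auto r1 = cache.insert_or_assign(1, "a"); // key absent: inserts
  assert(r1.second && r1.first->second == "a");
  auto r2 = cache.insert_or_assign(1, "b"); // key present: overwrites
  assert(!r2.second && r2.first->second == "b");
  return 0;
}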

6 changes: 3 additions & 3 deletions llvm/include/llvm/CodeGen/MachinePassManager.h
@@ -61,7 +61,7 @@ struct MachinePassModel
#ifndef NDEBUG
if constexpr (is_detected<has_get_required_properties_t, PassT>::value) {
auto &MFProps = IR.getProperties();
auto RequiredProperties = PassT::getRequiredProperties();
auto RequiredProperties = this->Pass.getRequiredProperties();
if (!MFProps.verifyRequiredProperties(RequiredProperties)) {
errs() << "MachineFunctionProperties required by " << PassT::name()
<< " pass are not met by function " << IR.getName() << ".\n"
@@ -78,9 +78,9 @@
auto PA = this->Pass.run(IR, AM);

if constexpr (is_detected<has_get_set_properties_t, PassT>::value)
IR.getProperties().set(PassT::getSetProperties());
IR.getProperties().set(this->Pass.getSetProperties());
if constexpr (is_detected<has_get_cleared_properties_t, PassT>::value)
IR.getProperties().reset(PassT::getClearedProperties());
IR.getProperties().reset(this->Pass.getClearedProperties());
return PA;
}
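
The switch from static PassT::get*Properties() calls to this->Pass.get*Properties() matters because the result may now depend on how the pass instance was configured. A reduced sketch of the idea (toy types and names, not the real LLVM interfaces):

struct ToyProps { bool NoVRegs = false; };

// Toy mirror of RegAllocFastPass (added later in this diff): the properties it
// sets depend on a per-instance option, which a static getSetProperties()
// call could never observe.
struct ToyRegAlloc {
  bool ClearVRegs = true;
  ToyProps getSetProperties() const { return ToyProps{ClearVRegs}; }
};

int main() {
  ToyRegAlloc KeepVRegs{false};
  return KeepVRegs.getSetProperties().NoVRegs ? 1 : 0; // returns 0
}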

57 changes: 57 additions & 0 deletions llvm/include/llvm/CodeGen/RegAllocFast.h
@@ -0,0 +1,57 @@
//==- RegAllocFast.h ----------- fast register allocator ----------*-C++-*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_REGALLOCFAST_H
#define LLVM_CODEGEN_REGALLOCFAST_H

#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/CodeGen/RegAllocCommon.h"

namespace llvm {

struct RegAllocFastPassOptions {
RegClassFilterFunc Filter = allocateAllRegClasses;
StringRef FilterName = "all";
bool ClearVRegs = true;
};

class RegAllocFastPass : public PassInfoMixin<RegAllocFastPass> {
RegAllocFastPassOptions Opts;

public:
RegAllocFastPass(RegAllocFastPassOptions Opts = RegAllocFastPassOptions())
: Opts(Opts) {}

MachineFunctionProperties getRequiredProperties() {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoPHIs);
}

MachineFunctionProperties getSetProperties() {
if (Opts.ClearVRegs) {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
}

return MachineFunctionProperties();
}

MachineFunctionProperties getClearedProperties() {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::IsSSA);
}

PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &);

void printPipeline(raw_ostream &OS,
function_ref<StringRef(StringRef)> MapClassName2PassName);
};

} // namespace llvm

#endif // LLVM_CODEGEN_REGALLOCFAST_H
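
A minimal sketch of constructing the pass with non-default options, equivalent to the textual pipeline parameter "no-clear-vregs" handled later in this diff (buildFastRAPipeline is a hypothetical helper; assumes the surrounding new-pass-manager setup):

#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/CodeGen/RegAllocFast.h"

using namespace llvm;

MachineFunctionPassManager buildFastRAPipeline() {
  RegAllocFastPassOptions Opts;
  Opts.ClearVRegs = false; // corresponds to the `no-clear-vregs` parameter
  MachineFunctionPassManager MFPM;
  MFPM.addPass(RegAllocFastPass(Opts));
  return MFPM;
}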
94 changes: 56 additions & 38 deletions llvm/include/llvm/CodeGen/ScheduleDAG.h
@@ -243,8 +243,10 @@ class TargetRegisterInfo;
private:
enum : unsigned { BoundaryID = ~0u };

SDNode *Node = nullptr; ///< Representative node.
MachineInstr *Instr = nullptr; ///< Alternatively, a MachineInstr.
union {
SDNode *Node; ///< Representative node.
MachineInstr *Instr; ///< Alternatively, a MachineInstr.
};

public:
SUnit *OrigNode = nullptr; ///< If not this, the node from which this node
@@ -253,6 +255,10 @@
const MCSchedClassDesc *SchedClass =
nullptr; ///< nullptr or resolved SchedClass.

const TargetRegisterClass *CopyDstRC =
nullptr; ///< Is a special copy node if != nullptr.
const TargetRegisterClass *CopySrcRC = nullptr;

SmallVector<SDep, 4> Preds; ///< All sunit predecessors.
SmallVector<SDep, 4> Succs; ///< All sunit successors.

@@ -269,8 +275,14 @@
unsigned NumSuccsLeft = 0; ///< # of succs not scheduled.
unsigned WeakPredsLeft = 0; ///< # of weak preds not scheduled.
unsigned WeakSuccsLeft = 0; ///< # of weak succs not scheduled.
unsigned short NumRegDefsLeft = 0; ///< # of reg defs with no scheduled use.
unsigned short Latency = 0; ///< Node latency.
unsigned TopReadyCycle = 0; ///< Cycle relative to start when node is ready.
unsigned BotReadyCycle = 0; ///< Cycle relative to end when node is ready.

private:
unsigned Depth = 0; ///< Node depth.
unsigned Height = 0; ///< Node height.

public:
bool isVRegCycle : 1; ///< May use and def the same vreg.
bool isCall : 1; ///< Is a function call.
bool isCallOp : 1; ///< Is a function call operand.
@@ -287,52 +299,54 @@
bool isCloned : 1; ///< True if this node has been cloned.
bool isUnbuffered : 1; ///< Uses an unbuffered resource.
bool hasReservedResource : 1; ///< Uses a reserved resource.
Sched::Preference SchedulingPref = Sched::None; ///< Scheduling preference.
unsigned short NumRegDefsLeft = 0; ///< # of reg defs with no scheduled use.
unsigned short Latency = 0; ///< Node latency.

private:
bool isDepthCurrent : 1; ///< True if Depth is current.
bool isHeightCurrent : 1; ///< True if Height is current.
unsigned Depth = 0; ///< Node depth.
unsigned Height = 0; ///< Node height.
bool isNode : 1; ///< True if the representative is an SDNode
bool isInst : 1; ///< True if the representative is a MachineInstr

public:
unsigned TopReadyCycle = 0; ///< Cycle relative to start when node is ready.
unsigned BotReadyCycle = 0; ///< Cycle relative to end when node is ready.

const TargetRegisterClass *CopyDstRC =
nullptr; ///< Is a special copy node if != nullptr.
const TargetRegisterClass *CopySrcRC = nullptr;
Sched::Preference SchedulingPref : 4; ///< Scheduling preference.
static_assert(Sched::Preference::Last < (1 << 4),
"not enough bits in bitfield");

/// Constructs an SUnit for pre-regalloc scheduling to represent an
/// SDNode and any nodes flagged to it.
SUnit(SDNode *node, unsigned nodenum)
: Node(node), NodeNum(nodenum), isVRegCycle(false), isCall(false),
isCallOp(false), isTwoAddress(false), isCommutable(false),
hasPhysRegUses(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
isPending(false), isAvailable(false), isScheduled(false),
isScheduleHigh(false), isScheduleLow(false), isCloned(false),
isUnbuffered(false), hasReservedResource(false), isDepthCurrent(false),
isHeightCurrent(false) {}
: Node(node), NodeNum(nodenum), isVRegCycle(false), isCall(false),
isCallOp(false), isTwoAddress(false), isCommutable(false),
hasPhysRegUses(false), hasPhysRegDefs(false),
hasPhysRegClobbers(false), isPending(false), isAvailable(false),
isScheduled(false), isScheduleHigh(false), isScheduleLow(false),
isCloned(false), isUnbuffered(false), hasReservedResource(false),
isDepthCurrent(false), isHeightCurrent(false), isNode(true),
isInst(false), SchedulingPref(Sched::None) {}

/// Constructs an SUnit for post-regalloc scheduling to represent a
/// MachineInstr.
SUnit(MachineInstr *instr, unsigned nodenum)
: Instr(instr), NodeNum(nodenum), isVRegCycle(false), isCall(false),
isCallOp(false), isTwoAddress(false), isCommutable(false),
hasPhysRegUses(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
isPending(false), isAvailable(false), isScheduled(false),
isScheduleHigh(false), isScheduleLow(false), isCloned(false),
isUnbuffered(false), hasReservedResource(false), isDepthCurrent(false),
isHeightCurrent(false) {}
: Instr(instr), NodeNum(nodenum), isVRegCycle(false), isCall(false),
isCallOp(false), isTwoAddress(false), isCommutable(false),
hasPhysRegUses(false), hasPhysRegDefs(false),
hasPhysRegClobbers(false), isPending(false), isAvailable(false),
isScheduled(false), isScheduleHigh(false), isScheduleLow(false),
isCloned(false), isUnbuffered(false), hasReservedResource(false),
isDepthCurrent(false), isHeightCurrent(false), isNode(false),
isInst(true), SchedulingPref(Sched::None) {}

/// Constructs a placeholder SUnit.
SUnit()
: isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
isCommutable(false), hasPhysRegUses(false), hasPhysRegDefs(false),
hasPhysRegClobbers(false), isPending(false), isAvailable(false),
isScheduled(false), isScheduleHigh(false), isScheduleLow(false),
isCloned(false), isUnbuffered(false), hasReservedResource(false),
isDepthCurrent(false), isHeightCurrent(false) {}
: Node(nullptr), isVRegCycle(false), isCall(false), isCallOp(false),
isTwoAddress(false), isCommutable(false), hasPhysRegUses(false),
hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false),
isAvailable(false), isScheduled(false), isScheduleHigh(false),
isScheduleLow(false), isCloned(false), isUnbuffered(false),
hasReservedResource(false), isDepthCurrent(false),
isHeightCurrent(false), isNode(false), isInst(false),
SchedulingPref(Sched::None) {}

/// Boundary nodes are placeholders for the boundary of the
/// scheduling region.
@@ -346,32 +360,36 @@
/// Assigns the representative SDNode for this SUnit. This may be used
/// during pre-regalloc scheduling.
void setNode(SDNode *N) {
assert(!Instr && "Setting SDNode of SUnit with MachineInstr!");
assert(!isInst && "Setting SDNode of SUnit with MachineInstr!");
Node = N;
isNode = true;
}

/// Returns the representative SDNode for this SUnit. This may be used
/// during pre-regalloc scheduling.
SDNode *getNode() const {
assert(!Instr && "Reading SDNode of SUnit with MachineInstr!");
assert(!isInst && (isNode || !Instr) &&
"Reading SDNode of SUnit without SDNode!");
return Node;
}

/// Returns true if this SUnit refers to a machine instruction as
/// opposed to an SDNode.
bool isInstr() const { return Instr; }
bool isInstr() const { return isInst && Instr; }

/// Assigns the instruction for the SUnit. This may be used during
/// post-regalloc scheduling.
void setInstr(MachineInstr *MI) {
assert(!Node && "Setting MachineInstr of SUnit with SDNode!");
assert(!isNode && "Setting MachineInstr of SUnit with SDNode!");
Instr = MI;
isInst = true;
}

/// Returns the representative MachineInstr for this SUnit. This may be used
/// during post-regalloc scheduling.
MachineInstr *getInstr() const {
assert(!Node && "Reading MachineInstr of SUnit with SDNode!");
assert(!isNode && (isInst || !Node) &&
"Reading MachineInstr of SUnit without MachineInstr!");
return Instr;
}
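
The Node/Instr change above is the classic space optimization of folding two mutually exclusive pointers into one union plus discriminator bits; a reduced standalone sketch of the pattern (toy types, not the real SUnit):

#include <cassert>

struct SDNode;
struct MachineInstr;

struct Unit {
  union {
    SDNode *Node;
    MachineInstr *Instr;
  };
  bool isNode : 1; // discriminators replace the "which pointer is null" test
  bool isInst : 1;

  Unit() : Node(nullptr), isNode(false), isInst(false) {}
  void setNode(SDNode *N) {
    assert(!isInst && "already holds a MachineInstr");
    Node = N;
    isNode = true;
  }
};

int main() {
  Unit U;
  U.setNode(nullptr); // fine: the discriminator, not the pointer, records the kind
  assert(U.isNode && !U.isInst);
  return 0;
}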

17 changes: 9 additions & 8 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -97,14 +97,15 @@ class Value;
namespace Sched {

enum Preference : uint8_t {
None, // No preference
Source, // Follow source order.
RegPressure, // Scheduling for lowest register pressure.
Hybrid, // Scheduling for both latency and register pressure.
ILP, // Scheduling for ILP in low register pressure mode.
VLIW, // Scheduling for VLIW targets.
Fast, // Fast suboptimal list scheduling
Linearize // Linearize DAG, no scheduling
None, // No preference
Source, // Follow source order.
RegPressure, // Scheduling for lowest register pressure.
Hybrid, // Scheduling for both latency and register pressure.
ILP, // Scheduling for ILP in low register pressure mode.
VLIW, // Scheduling for VLIW targets.
Fast, // Fast suboptimal list scheduling
Linearize, // Linearize DAG, no scheduling
Last = Linearize // Marker for the last Sched::Preference
};

} // end namespace Sched
Expand Down
2 changes: 1 addition & 1 deletion llvm/include/llvm/ExecutionEngine/Orc/Core.h
Original file line number Diff line number Diff line change
Expand Up @@ -1438,7 +1438,7 @@ class ExecutionSession {

public:
/// For reporting errors.
using ErrorReporter = std::function<void(Error)>;
using ErrorReporter = unique_function<void(Error)>;

/// Send a result to the remote.
using SendResultFunction = unique_function<void(shared::WrapperFunctionResult)>;
3 changes: 2 additions & 1 deletion llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -43,6 +43,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/CodeGen/PreISelIntrinsicLowering.h"
#include "llvm/CodeGen/RegAllocFast.h"
#include "llvm/CodeGen/ReplaceWithVeclib.h"
#include "llvm/CodeGen/SafeStack.h"
#include "llvm/CodeGen/SelectOptimize.h"
@@ -1038,7 +1039,7 @@ void CodeGenPassBuilder<Derived, TargetMachineT>::addTargetRegisterAllocator(
if (Optimized)
addPass(RAGreedyPass());
else
addPass(RAFastPass());
addPass(RegAllocFastPass());
}

/// Find and instantiate the register allocation pass requested by this target
14 changes: 13 additions & 1 deletion llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -133,6 +133,19 @@ MACHINE_FUNCTION_PASS("require-all-machine-function-properties",
MACHINE_FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass())
#undef MACHINE_FUNCTION_PASS

#ifndef MACHINE_FUNCTION_PASS_WITH_PARAMS
#define MACHINE_FUNCTION_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, \
PARAMS)
#endif
MACHINE_FUNCTION_PASS_WITH_PARAMS(
"regallocfast", "RegAllocFast",
[](RegAllocFastPassOptions Opts) { return RegAllocFastPass(Opts); },
[PB = this](StringRef Params) {
return parseRegAllocFastPassOptions(*PB, Params);
},
"filter=reg-filter;no-clear-vregs")
#undef MACHINE_FUNCTION_PASS_WITH_PARAMS

// After a pass is converted to new pass manager, its entry should be moved from
// dummy table to the normal one. For example, for a machine function pass,
// DUMMY_MACHINE_FUNCTION_PASS to MACHINE_FUNCTION_PASS.
@@ -211,7 +224,6 @@ DUMMY_MACHINE_FUNCTION_PASS("processimpdefs", ProcessImplicitDefsPass)
DUMMY_MACHINE_FUNCTION_PASS("prologepilog", PrologEpilogInserterPass)
DUMMY_MACHINE_FUNCTION_PASS("prologepilog-code", PrologEpilogCodeInserterPass)
DUMMY_MACHINE_FUNCTION_PASS("ra-basic", RABasicPass)
DUMMY_MACHINE_FUNCTION_PASS("ra-fast", RAFastPass)
DUMMY_MACHINE_FUNCTION_PASS("ra-greedy", RAGreedyPass)
DUMMY_MACHINE_FUNCTION_PASS("ra-pbqp", RAPBQPPass)
DUMMY_MACHINE_FUNCTION_PASS("reg-usage-collector", RegUsageInfoCollectorPass)
15 changes: 15 additions & 0 deletions llvm/include/llvm/Passes/PassBuilder.h
@@ -17,6 +17,7 @@

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/CodeGen/RegAllocCommon.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/OptimizationLevel.h"
#include "llvm/Support/Error.h"
@@ -388,6 +389,9 @@ class PassBuilder {
/// returns false.
Error parseAAPipeline(AAManager &AA, StringRef PipelineText);

/// Parse RegClassFilterName to get RegClassFilterFunc.
RegClassFilterFunc parseRegAllocFilter(StringRef RegClassFilterName);

/// Print pass names.
void printPassNames(raw_ostream &OS);

@@ -576,6 +580,14 @@ class PassBuilder {
}
/// @}}

/// Register callbacks to parse target specific filter field if regalloc pass
/// needs it. E.g. AMDGPU requires regalloc passes can handle sgpr and vgpr
/// separately.
void registerRegClassFilterParsingCallback(
const std::function<RegClassFilterFunc(StringRef)> &C) {
RegClassFilterParsingCallbacks.push_back(C);
}

/// Register a callback for a top-level pipeline entry.
///
/// If the PassManager type is not given at the top level of the pipeline
@@ -792,6 +804,9 @@ class PassBuilder {
ArrayRef<PipelineElement>)>,
2>
MachineFunctionPipelineParsingCallbacks;
// Callbacks to parse `filter` parameter in register allocation passes
SmallVector<std::function<RegClassFilterFunc(StringRef)>, 2>
RegClassFilterParsingCallbacks;
};

/// This utility template takes care of adding require<> and invalidate<>
201 changes: 124 additions & 77 deletions llvm/lib/CodeGen/RegAllocFast.cpp

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion llvm/lib/MC/MCSection.cpp
@@ -128,7 +128,7 @@ void MCSection::flushPendingLabels() {
LLVM_DUMP_METHOD void MCSection::dump() const {
raw_ostream &OS = errs();

OS << "<MCSection";
OS << "<MCSection Name:" << getName();
OS << " Fragments:[\n ";
for (auto it = begin(), ie = end(); it != ie; ++it) {
if (it != begin())
56 changes: 56 additions & 0 deletions llvm/lib/Passes/PassBuilder.cpp
@@ -97,6 +97,7 @@
#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PreISelIntrinsicLowering.h"
#include "llvm/CodeGen/RegAllocFast.h"
#include "llvm/CodeGen/SafeStack.h"
#include "llvm/CodeGen/SelectOptimize.h"
#include "llvm/CodeGen/ShadowStackGCLowering.h"
@@ -1163,6 +1164,38 @@ Expected<SmallVector<std::string, 0>> parseInternalizeGVs(StringRef Params) {
return Expected<SmallVector<std::string, 0>>(std::move(PreservedGVs));
}

Expected<RegAllocFastPassOptions>
parseRegAllocFastPassOptions(PassBuilder &PB, StringRef Params) {
RegAllocFastPassOptions Opts;
while (!Params.empty()) {
StringRef ParamName;
std::tie(ParamName, Params) = Params.split(';');

if (ParamName.consume_front("filter=")) {
RegClassFilterFunc Filter = PB.parseRegAllocFilter(ParamName);
if (!Filter) {
return make_error<StringError>(
formatv("invalid regallocfast register filter '{0}' ", ParamName)
.str(),
inconvertibleErrorCode());
}
Opts.Filter = Filter;
Opts.FilterName = ParamName;
continue;
}

if (ParamName == "no-clear-vregs") {
Opts.ClearVRegs = false;
continue;
}

return make_error<StringError>(
formatv("invalid regallocfast pass parameter '{0}' ", ParamName).str(),
inconvertibleErrorCode());
}
return Opts;
}

} // namespace
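
For reference, a standalone sketch of the split/consume_front walk the parser above performs over a parameter string such as "filter=vgpr;no-clear-vregs":

#include "llvm/ADT/StringRef.h"
#include <cassert>
#include <tuple>

int main() {
  llvm::StringRef Params = "filter=vgpr;no-clear-vregs";
  llvm::StringRef Name;
  std::tie(Name, Params) = Params.split(';'); // -> "filter=vgpr", "no-clear-vregs"
  assert(Name.consume_front("filter=") && Name == "vgpr");
  std::tie(Name, Params) = Params.split(';'); // -> "no-clear-vregs", ""
  assert(Name == "no-clear-vregs" && Params.empty());
  return 0;
}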

/// Tests whether a pass name starts with a valid prefix for a default pipeline
@@ -1296,6 +1329,11 @@ static bool isMachineFunctionPassName(StringRef Name, CallbacksT &Callbacks) {
#define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) \
if (Name == NAME) \
return true;
#define MACHINE_FUNCTION_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, \
PARAMS) \
if (PassBuilder::checkParametrizedPassName(Name, NAME)) \
return true;

#define MACHINE_FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">") \
return true;
@@ -1925,6 +1963,15 @@ Error PassBuilder::parseMachinePass(MachineFunctionPassManager &MFPM,
MFPM.addPass(CREATE_PASS); \
return Error::success(); \
}
#define MACHINE_FUNCTION_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, \
PARAMS) \
if (checkParametrizedPassName(Name, NAME)) { \
auto Params = parsePassParameters(PARSER, Name, NAME); \
if (!Params) \
return Params.takeError(); \
MFPM.addPass(CREATE_PASS(Params.get())); \
return Error::success(); \
}
#include "llvm/Passes/MachinePassRegistry.def"

for (auto &C : MachineFunctionPipelineParsingCallbacks)
@@ -2172,6 +2219,15 @@ Error PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) {
return Error::success();
}

RegClassFilterFunc PassBuilder::parseRegAllocFilter(StringRef FilterName) {
if (FilterName == "all")
return allocateAllRegClasses;
for (auto &C : RegClassFilterParsingCallbacks)
if (auto F = C(FilterName))
return F;
return nullptr;
}

static void printPassName(StringRef PassName, raw_ostream &OS) {
OS << " " << PassName << "\n";
}
9 changes: 9 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -740,6 +740,15 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
if (EnableLowerModuleLDS)
PM.addPass(AMDGPULowerModuleLDSPass(*this));
});

PB.registerRegClassFilterParsingCallback(
[](StringRef FilterName) -> RegClassFilterFunc {
if (FilterName == "sgpr")
return onlyAllocateSGPRs;
if (FilterName == "vgpr")
return onlyAllocateVGPRs;
return nullptr;
});
}
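
With this callback registered, PassBuilder::parseRegAllocFilter("sgpr") (added earlier in this diff) resolves to onlyAllocateSGPRs when parsing a pipeline such as regallocfast<filter=sgpr>; a name no callback recognizes falls through to nullptr and surfaces as the "invalid regallocfast register filter" error.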

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
1 change: 1 addition & 0 deletions llvm/lib/Target/LoongArch/CMakeLists.txt
@@ -16,6 +16,7 @@ add_public_tablegen_target(LoongArchCommonTableGen)

add_llvm_target(LoongArchCodeGen
LoongArchAsmPrinter.cpp
LoongArchDeadRegisterDefinitions.cpp
LoongArchExpandAtomicPseudoInsts.cpp
LoongArchExpandPseudoInsts.cpp
LoongArchFrameLowering.cpp
2 changes: 2 additions & 0 deletions llvm/lib/Target/LoongArch/LoongArch.h
@@ -33,12 +33,14 @@ bool lowerLoongArchMachineOperandToMCOperand(const MachineOperand &MO,
MCOperand &MCOp,
const AsmPrinter &AP);

FunctionPass *createLoongArchDeadRegisterDefinitionsPass();
FunctionPass *createLoongArchExpandAtomicPseudoPass();
FunctionPass *createLoongArchISelDag(LoongArchTargetMachine &TM);
FunctionPass *createLoongArchOptWInstrsPass();
FunctionPass *createLoongArchPreRAExpandPseudoPass();
FunctionPass *createLoongArchExpandPseudoPass();
void initializeLoongArchDAGToDAGISelLegacyPass(PassRegistry &);
void initializeLoongArchDeadRegisterDefinitionsPass(PassRegistry &);
void initializeLoongArchExpandAtomicPseudoPass(PassRegistry &);
void initializeLoongArchOptWInstrsPass(PassRegistry &);
void initializeLoongArchPreRAExpandPseudoPass(PassRegistry &);
108 changes: 108 additions & 0 deletions llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp
@@ -0,0 +1,108 @@
//=== LoongArchDeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
// This pass rewrites the destination register (Rd) to r0 for instructions
// whose results are unused.
//
//===---------------------------------------------------------------------===//

#include "LoongArch.h"
#include "LoongArchInstrInfo.h"
#include "LoongArchSubtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveDebugVariables.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;
#define DEBUG_TYPE "loongarch-dead-defs"
#define LoongArch_DEAD_REG_DEF_NAME "LoongArch Dead register definitions"

STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced");

namespace {
class LoongArchDeadRegisterDefinitions : public MachineFunctionPass {
public:
static char ID;

LoongArchDeadRegisterDefinitions() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<LiveIntervals>();
AU.addPreserved<LiveIntervals>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveDebugVariables>();
AU.addPreserved<LiveStacks>();
MachineFunctionPass::getAnalysisUsage(AU);
}

StringRef getPassName() const override { return LoongArch_DEAD_REG_DEF_NAME; }
};
} // end anonymous namespace

char LoongArchDeadRegisterDefinitions::ID = 0;
INITIALIZE_PASS(LoongArchDeadRegisterDefinitions, DEBUG_TYPE,
LoongArch_DEAD_REG_DEF_NAME, false, false)

FunctionPass *llvm::createLoongArchDeadRegisterDefinitionsPass() {
return new LoongArchDeadRegisterDefinitions();
}

bool LoongArchDeadRegisterDefinitions::runOnMachineFunction(
MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;

const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
LiveIntervals &LIS = getAnalysis<LiveIntervals>();
LLVM_DEBUG(dbgs() << "***** LoongArchDeadRegisterDefinitions *****\n");

bool MadeChange = false;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
// We only rewrite defs of instructions that cannot be deleted outright
// (loads, stores, and instructions with unmodeled side effects); dead
// computational instructions are removed by other passes.
const MCInstrDesc &Desc = MI.getDesc();
if (!Desc.mayLoad() && !Desc.mayStore() &&
!Desc.hasUnmodeledSideEffects())
continue;
for (int I = 0, E = Desc.getNumDefs(); I != E; ++I) {
MachineOperand &MO = MI.getOperand(I);
if (!MO.isReg() || !MO.isDef() || MO.isEarlyClobber())
continue;
// Be careful not to change the register if it's a tied operand.
if (MI.isRegTiedToUseOperand(I)) {
LLVM_DEBUG(dbgs() << " Ignoring, def is tied operand.\n");
continue;
}
Register Reg = MO.getReg();
if (!Reg.isVirtual() || !MO.isDead())
continue;
LLVM_DEBUG(dbgs() << " Dead def operand #" << I << " in:\n ";
MI.print(dbgs()));
const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI, MF);
if (!(RC && RC->contains(LoongArch::R0))) {
LLVM_DEBUG(dbgs() << " Ignoring, register is not a GPR.\n");
continue;
}
assert(LIS.hasInterval(Reg));
LIS.removeInterval(Reg);
MO.setReg(LoongArch::R0);
LLVM_DEBUG(dbgs() << " Replacing with zero register. New:\n ";
MI.print(dbgs()));
++NumDeadDefsReplaced;
MadeChange = true;
}
}
}

return MadeChange;
}
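To make the rewrite concrete, a before/after sketch (illustrative LoongArch MIR; the opcode and operands are assumed rather than copied from a real test):

// Before: %2 is never read, but the load must stay for its memory access.
//   dead %2:gpr = LD_D %1:gpr, 0
// After: the dead virtual register def is folded into the zero register.
//   dead $r0 = LD_D %1:gpr, 0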
13 changes: 12 additions & 1 deletion llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -734,7 +734,9 @@ def ADD_W : ALU_3R<0x00100000>;
def SUB_W : ALU_3R<0x00110000>;
def ADDI_W : ALU_2RI12<0x02800000, simm12_addlike>;
def ALSL_W : ALU_3RI2<0x00040000, uimm2_plus1>;
let isReMaterializable = 1 in {
def LU12I_W : ALU_1RI20<0x14000000, simm20_lu12iw>;
}
def SLT : ALU_3R<0x00120000>;
def SLTU : ALU_3R<0x00128000>;
def SLTI : ALU_2RI12<0x02000000, simm12>;
@@ -749,8 +751,10 @@ def XOR : ALU_3R<0x00158000>;
def ANDN : ALU_3R<0x00168000>;
def ORN : ALU_3R<0x00160000>;
def ANDI : ALU_2RI12<0x03400000, uimm12>;
let isReMaterializable = 1 in {
def ORI : ALU_2RI12<0x03800000, uimm12_ori>;
def XORI : ALU_2RI12<0x03c00000, uimm12>;
}
def MUL_W : ALU_3R<0x001c0000>;
def MULH_W : ALU_3R<0x001c8000>;
def MULH_WU : ALU_3R<0x001d0000>;
@@ -852,17 +856,24 @@ let Predicates = [IsLA64] in {
// Arithmetic Operation Instructions for 64-bits
def ADD_D : ALU_3R<0x00108000>;
def SUB_D : ALU_3R<0x00118000>;
// ADDI_D isn't always rematerializable, but isReMaterializable will be used as
// a hint which is verified in isReallyTriviallyReMaterializable.
let isReMaterializable = 1 in {
def ADDI_D : ALU_2RI12<0x02c00000, simm12_addlike>;
}
def ADDU16I_D : ALU_2RI16<0x10000000, simm16>;
def ALSL_WU : ALU_3RI2<0x00060000, uimm2_plus1>;
def ALSL_D : ALU_3RI2<0x002c0000, uimm2_plus1>;
let Constraints = "$rd = $dst" in {
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
isReMaterializable = 1 in
def LU32I_D : Fmt1RI20<0x16000000, (outs GPR:$dst),
(ins GPR:$rd, simm20_lu32id:$imm20),
"$rd, $imm20">;
}
let isReMaterializable = 1 in {
def LU52I_D : ALU_2RI12<0x03000000, simm12_lu52id>;
}
def PCADDU18I : ALU_1RI20<0x1e000000, simm20_pcaddu18i>;
def MUL_D : ALU_3R<0x001d8000>;
def MULH_D : ALU_3R<0x001e0000>;
24 changes: 24 additions & 0 deletions llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -34,11 +34,19 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTarget() {
RegisterTargetMachine<LoongArchTargetMachine> X(getTheLoongArch32Target());
RegisterTargetMachine<LoongArchTargetMachine> Y(getTheLoongArch64Target());
auto *PR = PassRegistry::getPassRegistry();
initializeLoongArchDeadRegisterDefinitionsPass(*PR);
initializeLoongArchOptWInstrsPass(*PR);
initializeLoongArchPreRAExpandPseudoPass(*PR);
initializeLoongArchDAGToDAGISelLegacyPass(*PR);
}

static cl::opt<bool> EnableLoongArchDeadRegisterElimination(
"loongarch-enable-dead-defs", cl::Hidden,
cl::desc("Enable the pass that removes dead"
" definitions and replaces stores to"
" them with stores to r0"),
cl::init(true));

static cl::opt<bool>
EnableLoopDataPrefetch("loongarch-enable-loop-data-prefetch", cl::Hidden,
cl::desc("Enable the loop data prefetch pass"),
@@ -148,6 +156,8 @@ class LoongArchPassConfig : public TargetPassConfig {
void addPreEmitPass2() override;
void addMachineSSAOptimization() override;
void addPreRegAlloc() override;
bool addRegAssignAndRewriteFast() override;
bool addRegAssignAndRewriteOptimized() override;
};
} // end namespace

@@ -200,3 +210,17 @@ void LoongArchPassConfig::addMachineSSAOptimization() {
void LoongArchPassConfig::addPreRegAlloc() {
addPass(createLoongArchPreRAExpandPseudoPass());
}

bool LoongArchPassConfig::addRegAssignAndRewriteFast() {
if (TM->getOptLevel() != CodeGenOptLevel::None &&
EnableLoongArchDeadRegisterElimination)
addPass(createLoongArchDeadRegisterDefinitionsPass());
return TargetPassConfig::addRegAssignAndRewriteFast();
}

bool LoongArchPassConfig::addRegAssignAndRewriteOptimized() {
if (TM->getOptLevel() != CodeGenOptLevel::None &&
EnableLoongArchDeadRegisterElimination)
addPass(createLoongArchDeadRegisterDefinitionsPass());
return TargetPassConfig::addRegAssignAndRewriteOptimized();
}
2 changes: 1 addition & 1 deletion llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
@@ -106,7 +106,7 @@ class SPIRVMergeRegionExitTargets : public FunctionPass {
const std::unordered_set<BasicBlock *> ToReplace,
BasicBlock *NewTarget) {
auto *T = BB->getTerminator();
if (auto *RI = dyn_cast<ReturnInst>(T))
if (isa<ReturnInst>(T))
return;

if (auto *BI = dyn_cast<BranchInst>(T)) {
10 changes: 5 additions & 5 deletions llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -402,7 +402,7 @@ class CallsiteContextGraph {
ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
DenseSet<uint32_t> ContextIds)
: Callee(Callee), Caller(Caller), AllocTypes(AllocType),
ContextIds(ContextIds) {}
ContextIds(std::move(ContextIds)) {}

DenseSet<uint32_t> &getContextIds() { return ContextIds; }

@@ -1127,15 +1127,15 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
continue;
}
if (TowardsCallee) {
uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
auto NewEdge = std::make_shared<ContextEdge>(
Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds),
NewEdgeContextIds);
Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
NewNode->CalleeEdges.push_back(NewEdge);
NewEdge->Callee->CallerEdges.push_back(NewEdge);
} else {
uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
auto NewEdge = std::make_shared<ContextEdge>(
NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds),
NewEdgeContextIds);
NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
NewNode->CallerEdges.push_back(NewEdge);
NewEdge->Caller->CalleeEdges.push_back(NewEdge);
}
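The hoisted NewAllocType local is not merely stylistic: C++ leaves the evaluation order of function arguments unspecified, so passing computeAllocType(NewEdgeContextIds) and std::move(NewEdgeContextIds) in the same call could compute the allocation type from an already-moved-from set. A minimal standalone illustration (names hypothetical):

// Risky: use(Ids) and the move that initializes f's by-value parameter are
// unsequenced relative to each other, so use() may see a moved-from set.
//   f(use(Ids), std::move(Ids));
// Safe: the computation is sequenced before the move.
//   auto U = use(Ids);
//   f(U, std::move(Ids));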
59 changes: 38 additions & 21 deletions llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1288,38 +1288,55 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
Swapped = true;
}

// In X == Y ? f(X) : Z, try to evaluate f(Y) and replace the operand.
// Make sure Y cannot be undef though, as we might pick different values for
// undef in the icmp and in f(Y). Additionally, take care to avoid replacing
// X == Y ? X : Z with X == Y ? Y : Z, as that would lead to an infinite
// replacement cycle.
Value *CmpLHS = Cmp.getOperand(0), *CmpRHS = Cmp.getOperand(1);
if (TrueVal != CmpLHS && isGuaranteedNotToBeUndef(CmpRHS, SQ.AC, &Sel, &DT)) {
if (Value *V = simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, SQ,
/* AllowRefinement */ true))
// Require either the replacement or the simplification result to be a
// constant to avoid infinite loops.
// FIXME: Make this check more precise.
if (isa<Constant>(CmpRHS) || isa<Constant>(V))
auto ReplaceOldOpWithNewOp = [&](Value *OldOp,
Value *NewOp) -> Instruction * {
// In X == Y ? f(X) : Z, try to evaluate f(Y) and replace the operand.
// Take care to avoid replacing X == Y ? X : Z with X == Y ? Y : Z, as that
// would lead to an infinite replacement cycle.
// If f(Y) can be evaluated to a constant, we can allow undef; otherwise Y
// must not be undef, as we might pick different values for undef in the
// icmp and in f(Y).
if (TrueVal == OldOp)
return nullptr;

if (Value *V = simplifyWithOpReplaced(TrueVal, OldOp, NewOp, SQ,
/* AllowRefinement=*/true)) {
// We need some guarantees about the new simplified operand to make sure we
// don't end up in an infinite loop.
// If we simplify to a constant, replace if we aren't creating new undef.
if (match(V, m_ImmConstant()) &&
isGuaranteedNotToBeUndef(V, SQ.AC, &Sel, &DT))
return replaceOperand(Sel, Swapped ? 2 : 1, V);

// If NewOp is a constant and OldOp is not, replace iff NewOp doesn't
// contain any undef elements.
if (match(NewOp, m_ImmConstant())) {
if (isGuaranteedNotToBeUndef(NewOp, SQ.AC, &Sel, &DT))
return replaceOperand(Sel, Swapped ? 2 : 1, V);
return nullptr;
}
}

// Even if TrueVal does not simplify, we can directly replace a use of
// CmpLHS with CmpRHS, as long as the instruction is not used anywhere
// else and is safe to speculatively execute (we may end up executing it
// with different operands, which should not cause side-effects or trigger
// undefined behavior). Only do this if CmpRHS is a constant, as
// profitability is not clear for other cases.
// FIXME: Support vectors.
if (match(CmpRHS, m_ImmConstant()) && !match(CmpLHS, m_ImmConstant()) &&
!Cmp.getType()->isVectorTy())
if (replaceInInstruction(TrueVal, CmpLHS, CmpRHS))
if (OldOp == CmpLHS && match(NewOp, m_ImmConstant()) &&
!match(OldOp, m_ImmConstant()) && !Cmp.getType()->isVectorTy() &&
isGuaranteedNotToBeUndef(NewOp, SQ.AC, &Sel, &DT))
if (replaceInInstruction(TrueVal, OldOp, NewOp))
return &Sel;
}
if (TrueVal != CmpRHS && isGuaranteedNotToBeUndef(CmpLHS, SQ.AC, &Sel, &DT))
if (Value *V = simplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, SQ,
/* AllowRefinement */ true))
if (isa<Constant>(CmpLHS) || isa<Constant>(V))
return replaceOperand(Sel, Swapped ? 2 : 1, V);
return nullptr;
};

if (Instruction *R = ReplaceOldOpWithNewOp(CmpLHS, CmpRHS))
return R;
if (Instruction *R = ReplaceOldOpWithNewOp(CmpRHS, CmpLHS))
return R;
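A worked instance of the fold this helper applies (IR in comments; values hypothetical):

// %c = icmp eq i32 %x, 10
// %s = select i1 %c, i32 %x, i32 %z
// On the true arm %x must equal 10, so the select becomes:
// %s = select i1 %c, i32 10, i32 %z
// Trying the helper with both operand orders also catches the mirrored case
// where the constant sits on the left-hand side of the icmp.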

auto *FalseInst = dyn_cast<Instruction>(FalseVal);
if (!FalseInst)
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple aarch64-apple-ios -run-pass regallocfast -o - %s | FileCheck %s
# RUN: llc -mtriple aarch64-apple-ios -passes=regallocfast -o - %s | FileCheck %s
# This test used to crash the fast register alloc.
# Basically, when a basic block has liveins, the fast regalloc
# was dereferencing the begin iterator of this block. However,
107 changes: 107 additions & 0 deletions llvm/test/CodeGen/AMDGPU/build_vector-r600.ll
@@ -0,0 +1,107 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=r600-- -mcpu=redwood | FileCheck %s --check-prefixes=R600

define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
; R600-LABEL: build_vector2:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: MOV * T0.Y, literal.x,
; R600-NEXT: 6(8.407791e-45), 0(0.000000e+00)
; R600-NEXT: MOV T0.X, literal.x,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; R600-NEXT: 5(7.006492e-45), 2(2.802597e-45)
entry:
store <2 x i32> <i32 5, i32 6>, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
; R600-LABEL: build_vector4:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: MOV * T0.W, literal.x,
; R600-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; R600-NEXT: MOV * T0.Z, literal.x,
; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00)
; R600-NEXT: MOV * T0.Y, literal.x,
; R600-NEXT: 6(8.407791e-45), 0(0.000000e+00)
; R600-NEXT: MOV T0.X, literal.x,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; R600-NEXT: 5(7.006492e-45), 2(2.802597e-45)
entry:
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) {
; R600-LABEL: build_vector_v2i16:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: MOV T4.X, literal.x,
; R600-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; R600-NEXT: 393221(5.510200e-40), 2(2.802597e-45)
entry:
store <2 x i16> <i16 5, i16 6>, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 %a) {
; R600-LABEL: build_vector_v2i16_trunc:
; R600: ; %bb.0:
; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; R600-NEXT: OR_INT T4.X, PV.W, literal.x,
; R600-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; R600-NEXT: 327680(4.591775e-40), 2(2.802597e-45)
%srl = lshr i32 %a, 16
%trunc = trunc i32 %srl to i16
%ins.0 = insertelement <2 x i16> undef, i16 %trunc, i32 0
%ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
store <2 x i16> %ins.1, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, <4 x i16> %in) {
; R600-LABEL: build_v2i32_from_v4i16_shuffle:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 0, @10, KC0[], KC1[]
; R600-NEXT: TEX 1 @6
; R600-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: Fetch clause starting at 6:
; R600-NEXT: VTX_READ_16 T1.X, T0.X, 48, #3
; R600-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3
; R600-NEXT: ALU clause starting at 10:
; R600-NEXT: MOV * T0.X, 0.0,
; R600-NEXT: ALU clause starting at 11:
; R600-NEXT: LSHL * T0.Y, T1.X, literal.x,
; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; R600-NEXT: LSHL T0.X, T0.X, literal.x,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; R600-NEXT: 16(2.242078e-44), 2(2.802597e-45)
entry:
%shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
%zextended = zext <2 x i16> %shuf to <2 x i32>
%shifted = shl <2 x i32> %zextended, <i32 16, i32 16>
store <2 x i32> %shifted, ptr addrspace(1) %out
ret void
}
355 changes: 248 additions & 107 deletions llvm/test/CodeGen/AMDGPU/build_vector.ll

Large diffs are not rendered by default.

159 changes: 159 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fabs-r600.ll
@@ -0,0 +1,159 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s


; DAGCombiner will transform:
; (fabsf (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
; unless isFabsFree returns true
define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) {
; R600-LABEL: s_fabsf_fn_free:
; R600: ; %bb.0:
; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: MOV * T0.W, KC0[2].Z,
; R600-NEXT: MOV T0.X, |PV.W|,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%bc= bitcast i32 %in to float
%fabs = call float @fabsf(float %bc)
store float %fabs, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) {
; R600-LABEL: s_fabsf_free:
; R600: ; %bb.0:
; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: MOV * T0.W, KC0[2].Z,
; R600-NEXT: MOV T0.X, |PV.W|,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%bc= bitcast i32 %in to float
%fabs = call float @llvm.fabs.f32(float %bc)
store float %fabs, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) {
; R600-LABEL: s_fabsf_f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: MOV * T0.W, KC0[2].Z,
; R600-NEXT: MOV T0.X, |PV.W|,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%fabs = call float @llvm.fabs.f32(float %in)
store float %fabs, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; R600-LABEL: fabs_v2f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: MOV * T0.W, KC0[3].X,
; R600-NEXT: MOV T0.Y, |PV.W|,
; R600-NEXT: MOV * T0.W, KC0[2].W,
; R600-NEXT: MOV T0.X, |PV.W|,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
store <2 x float> %fabs, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; R600-LABEL: fabsf_v4f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: MOV T0.W, KC0[4].X,
; R600-NEXT: MOV * T1.W, KC0[3].W,
; R600-NEXT: MOV * T0.W, |PV.W|,
; R600-NEXT: MOV T0.Z, |T1.W|,
; R600-NEXT: MOV * T1.W, KC0[3].Z,
; R600-NEXT: MOV T0.Y, |PV.W|,
; R600-NEXT: MOV * T1.W, KC0[3].Y,
; R600-NEXT: MOV T0.X, |PV.W|,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
store <4 x float> %fabs, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, float %in1) {
; R600-LABEL: fabsf_fn_fold:
; R600: ; %bb.0:
; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT: MUL_IEEE * T1.X, |KC0[2].Z|, KC0[2].W,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%fabs = call float @fabsf(float %in0)
%fmul = fmul float %fabs, %in1
store float %fmul, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %in1) {
; R600-LABEL: fabs_fold:
; R600: ; %bb.0:
; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT: MUL_IEEE * T1.X, |KC0[2].Z|, KC0[2].W,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%fabs = call float @llvm.fabs.f32(float %in0)
%fmul = fmul float %fabs, %in1
store float %fmul, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @bitpreserve_fabsf_f32(ptr addrspace(1) %out, float %in) {
; R600-LABEL: bitpreserve_fabsf_f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT: ADD * T1.X, |KC0[2].Z|, 1.0,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%in.bc = bitcast float %in to i32
%int.abs = and i32 %in.bc, 2147483647
%bc = bitcast i32 %int.abs to float
%fadd = fadd float %bc, 1.0
store float %fadd, ptr addrspace(1) %out
ret void
}

declare float @fabsf(float) readnone
declare float @llvm.fabs.f32(float) readnone
declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone
declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone
252 changes: 203 additions & 49 deletions llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -1,104 +1,256 @@
; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global,-xnack -enable-misched=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -march=amdgcn -enable-misched=0 < %s | FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -enable-misched=0 < %s | FileCheck -check-prefixes=GCN,VI %s


; DAGCombiner will transform:
; (fabsf (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
; unless isFabsFree returns true

; FUNC-LABEL: {{^}}s_fabsf_fn_free:
; R600-NOT: AND
; R600: |PV.{{[XYZW]}}|

; GCN: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_fabsf_fn_free:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitset0_b32 s4, 31
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_fabsf_fn_free:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dword s2, s[2:3], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_bitset0_b32 s2, 31
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%bc= bitcast i32 %in to float
%fabs = call float @fabsf(float %bc)
store float %fabs, ptr addrspace(1) %out
ret void
}

; FUNC-LABEL: {{^}}s_fabsf_free:
; R600-NOT: AND
; R600: |PV.{{[XYZW]}}|

; GCN: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_fabsf_free:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitset0_b32 s0, 31
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_fabsf_free:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_bitset0_b32 s0, 31
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%bc= bitcast i32 %in to float
%fabs = call float @llvm.fabs.f32(float %bc)
store float %fabs, ptr addrspace(1) %out
ret void
}

; FUNC-LABEL: {{^}}s_fabsf_f32:
; R600: |{{(PV|T[0-9])\.[XYZW]}}|

; GCN: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: s_fabsf_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitset0_b32 s0, 31
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_fabsf_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_bitset0_b32 s0, 31
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in)
store float %fabs, ptr addrspace(1) %out
ret void
}

; FUNC-LABEL: {{^}}fabs_v2f32:
; R600: |{{(PV|T[0-9])\.[XYZW]}}|
; R600: |{{(PV|T[0-9])\.[XYZW]}}|

; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; SI-LABEL: fabs_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_and_b32 s0, s3, 0x7fffffff
; SI-NEXT: s_and_b32 s1, s2, 0x7fffffff
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fabs_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset0_b32 s3, 31
; VI-NEXT: s_bitset0_b32 s2, 31
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
store <2 x float> %fabs, ptr addrspace(1) %out
ret void
}

; FUNC-LABEL: {{^}}fabsf_v4f32:
; R600: |{{(PV|T[0-9])\.[XYZW]}}|
; R600: |{{(PV|T[0-9])\.[XYZW]}}|
; R600: |{{(PV|T[0-9])\.[XYZW]}}|
; R600: |{{(PV|T[0-9])\.[XYZW]}}|

; GCN: s_bitset0_b32
; GCN: s_bitset0_b32
; GCN: s_bitset0_b32
; GCN: s_bitset0_b32
define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; SI-LABEL: fabsf_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitset0_b32 s3, 31
; SI-NEXT: s_bitset0_b32 s2, 31
; SI-NEXT: s_bitset0_b32 s1, 31
; SI-NEXT: s_bitset0_b32 s0, 31
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fabsf_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_bitset0_b32 s3, 31
; VI-NEXT: s_bitset0_b32 s2, 31
; VI-NEXT: s_bitset0_b32 s1, 31
; VI-NEXT: s_bitset0_b32 s0, 31
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
store <4 x float> %fabs, ptr addrspace(1) %out
ret void
}

; GCN-LABEL: {{^}}fabsf_fn_fold:
; SI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[{{[0-9]+:[0-9]+}}], 0x9
; VI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[{{[0-9]+:[0-9]+}}], 0x24
; GCN-NOT: and
; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[#LOAD + 3]]
; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[#LOAD + 2]]|, [[V_MUL_VI]]
define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, float %in1) {
; SI-LABEL: fabsf_fn_fold:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_mul_f32_e64 v0, |s2|, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fabsf_fn_fold:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%fabs = call float @fabsf(float %in0)
%fmul = fmul float %fabs, %in1
store float %fmul, ptr addrspace(1) %out
ret void
}

; FUNC-LABEL: {{^}}fabs_fold:
; SI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[{{[0-9]+:[0-9]+}}], 0x9
; VI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[{{[0-9]+:[0-9]+}}], 0x24
; GCN-NOT: and
; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[#LOAD + 3]]
; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[#LOAD + 2]]|, [[V_MUL_VI]]
define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %in1) {
; SI-LABEL: fabs_fold:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_mul_f32_e64 v0, |s2|, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fabs_fold:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in0)
%fmul = fmul float %fabs, %in1
store float %fmul, ptr addrspace(1) %out
ret void
}

; Make sure we turn some integer operations back into fabsf
; FUNC-LABEL: {{^}}bitpreserve_fabsf_f32:
; GCN: v_add_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|, 1.0
define amdgpu_kernel void @bitpreserve_fabsf_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: bitpreserve_fabsf_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_add_f32_e64 v0, |s0|, 1.0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bitpreserve_fabsf_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_add_f32_e64 v2, |s0|, 1.0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%in.bc = bitcast float %in to i32
%int.abs = and i32 %in.bc, 2147483647
%bc = bitcast i32 %int.abs to float
@@ -111,3 +263,5 @@ declare float @fabsf(float) readnone
declare float @llvm.fabs.f32(float) readnone
declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone
declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/fast-ra-kills-vcc.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=regallocfast -o - %s | FileCheck %s
# RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=regallocfast -o - %s | FileCheck %s

# Make sure incorrect kills aren't emitted on vcc

1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/fast-regalloc-bundles.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass=regallocfast %s -o - | FileCheck --check-prefix=GCN %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx902 -verify-machineinstrs -passes=regallocfast %s -o - | FileCheck --check-prefix=GCN %s

---
name: fast_regalloc_bundle_handling
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regallocfast -o - %s | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -passes=regallocfast -o - %s | FileCheck %s

# This would hit "Illegal subregister index for physical register" verifier error since
# tied operands would skip dropping the subregister index.
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=regallocfast -o - %s | FileCheck -check-prefix=GCN %s
# RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=regallocfast -o - %s | FileCheck -check-prefix=GCN %s

---
name: self_loop_single_def_use
180 changes: 180 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fneg-fabs-r600.ll
@@ -0,0 +1,180 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 %s

define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, float %y) {
; R600-LABEL: fneg_fabsf_fadd_f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT: ADD * T1.X, KC0[2].W, -|KC0[2].Z|,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%fabs = call float @llvm.fabs.f32(float %x)
%fsub = fsub float -0.000000e+00, %fabs
%fadd = fadd float %y, %fsub
store float %fadd, ptr addrspace(1) %out, align 4
ret void
}

define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, float %y) {
; R600-LABEL: fneg_fabsf_fmul_f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT: MUL_IEEE * T1.X, KC0[2].W, -|KC0[2].Z|,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%fabs = call float @llvm.fabs.f32(float %x)
%fsub = fsub float -0.000000e+00, %fabs
%fmul = fmul float %y, %fsub
store float %fmul, ptr addrspace(1) %out, align 4
ret void
}

define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) {
; R600-LABEL: fneg_fabsf_free_f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: MOV * T0.W, KC0[2].Z,
; R600-NEXT: MOV * T0.W, |PV.W|,
; R600-NEXT: MOV T0.X, -PV.W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%bc = bitcast i32 %in to float
%fabs = call float @llvm.fabs.f32(float %bc)
%fsub = fsub float -0.000000e+00, %fabs
store float %fsub, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in) {
; R600-LABEL: fneg_fabsf_fn_free_f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: MOV * T0.W, KC0[2].Z,
; R600-NEXT: MOV * T0.W, |PV.W|,
; R600-NEXT: MOV T0.X, -PV.W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%bc = bitcast i32 %in to float
%fabs = call float @fabsf(float %bc)
%fsub = fsub float -0.000000e+00, %fabs
store float %fsub, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) {
; R600-LABEL: fneg_fabsf_f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: MOV * T0.W, KC0[2].Z,
; R600-NEXT: MOV * T0.W, |PV.W|,
; R600-NEXT: MOV T0.X, -PV.W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%fabs = call float @llvm.fabs.f32(float %in)
%fsub = fsub float -0.000000e+00, %fabs
store float %fsub, ptr addrspace(1) %out, align 4
ret void
}

define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; R600-LABEL: v_fneg_fabsf_f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; R600-NEXT: TEX 0 @6
; R600-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: Fetch clause starting at 6:
; R600-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; R600-NEXT: ALU clause starting at 8:
; R600-NEXT: MOV * T0.X, KC0[2].Z,
; R600-NEXT: ALU clause starting at 9:
; R600-NEXT: MOV * T0.W, |T0.X|,
; R600-NEXT: MOV T0.X, -PV.W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%val = load float, ptr addrspace(1) %in, align 4
%fabs = call float @llvm.fabs.f32(float %val)
%fsub = fsub float -0.000000e+00, %fabs
store float %fsub, ptr addrspace(1) %out, align 4
ret void
}

define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; R600-LABEL: fneg_fabsf_v2f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: MOV T0.W, KC0[3].X,
; R600-NEXT: MOV * T1.W, KC0[2].W,
; R600-NEXT: MOV * T0.W, |PV.W|,
; R600-NEXT: MOV T0.Y, -PV.W,
; R600-NEXT: MOV * T0.W, |T1.W|,
; R600-NEXT: MOV T0.X, -PV.W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
%fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
store <2 x float> %fsub, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; R600-LABEL: fneg_fabsf_v4f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: MOV * T0.W, KC0[4].X,
; R600-NEXT: MOV T0.W, |PV.W|,
; R600-NEXT: MOV * T1.W, KC0[3].W,
; R600-NEXT: MOV T0.Z, KC0[3].Z,
; R600-NEXT: MOV T1.W, |PS|,
; R600-NEXT: MOV * T2.W, -PV.W,
; R600-NEXT: MOV T2.Z, -PV.W,
; R600-NEXT: MOV T0.W, KC0[3].Y,
; R600-NEXT: MOV * T1.W, |PV.Z|,
; R600-NEXT: MOV T2.Y, -PS,
; R600-NEXT: MOV * T0.W, |PV.W|,
; R600-NEXT: MOV T2.X, -PV.W,
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
%fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs
store <4 x float> %fsub, ptr addrspace(1) %out
ret void
}

declare float @fabsf(float) readnone
declare float @llvm.fabs.f32(float) readnone
declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone
declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}