8 changes: 8 additions & 0 deletions clang/test/Preprocessor/init-loongarch.c
@@ -832,6 +832,14 @@
// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=loongarch64 -DTUNE=loongarch64 %s
// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx -Xclang -target-feature -Xclang +frecipe | \
// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la64v1.1 -DTUNE=loongarch64 %s
// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la664 | \
// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la664 -DTUNE=la664 %s
// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -mtune=la664 | \
// RUN: FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=la664 %s
// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -mtune=la664 | \
// RUN: FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la664 %s
// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la664 -mtune=loongarch64 | \
// RUN: FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la664 -DTUNE=loongarch64 %s

// ARCH-TUNE: #define __loongarch_arch "[[ARCH]]"
// FRECIPE: #define __loongarch_frecipe 1
18 changes: 9 additions & 9 deletions clang/test/Preprocessor/riscv-target-features.c
@@ -86,7 +86,6 @@
// CHECK-NOT: __riscv_za64rs {{.*$}}
// CHECK-NOT: __riscv_zaamo {{.*$}}
// CHECK-NOT: __riscv_zabha {{.*$}}
// CHECK-NOT: __riscv_zacas {{.*$}}
// CHECK-NOT: __riscv_zalrsc {{.*$}}
// CHECK-NOT: __riscv_zama16b {{.*$}}
// CHECK-NOT: __riscv_zawrs {{.*$}}
@@ -182,6 +181,7 @@
// CHECK-NOT: __riscv_sspm{{.*$}}
// CHECK-NOT: __riscv_ssqosid{{.*$}}
// CHECK-NOT: __riscv_supm{{.*$}}
// CHECK-NOT: __riscv_zacas {{.*$}}
// CHECK-NOT: __riscv_zalasr {{.*$}}
// CHECK-NOT: __riscv_zfbfmin {{.*$}}
// CHECK-NOT: __riscv_zicfilp {{.*$}}
@@ -747,14 +747,6 @@
// RUN: -o - | FileCheck --check-prefix=CHECK-ZABHA-EXT %s
// CHECK-ZABHA-EXT: __riscv_zabha 1000000{{$}}

// RUN: %clang --target=riscv32 \
// RUN: -march=rv32ia_zacas1p0 -E -dM %s \
// RUN: -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s
// RUN: %clang --target=riscv64 \
// RUN: -march=rv64ia_zacas1p0 -E -dM %s \
// RUN: -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s
// CHECK-ZACAS-EXT: __riscv_zacas 1000000{{$}}

// RUN: %clang --target=riscv32 \
// RUN: -march=rv32i_zalrsc1p0 -E -dM %s \
// RUN: -o - | FileCheck --check-prefix=CHECK-ZALRSC-EXT %s
@@ -1626,6 +1618,14 @@
// CHECK-ZVKT-EXT: __riscv_zvkt 1000000{{$}}

// Experimental extensions
// RUN: %clang --target=riscv32 -menable-experimental-extensions \
// RUN: -march=rv32ia_zacas1p0 -E -dM %s \
// RUN: -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s
// RUN: %clang --target=riscv64 -menable-experimental-extensions \
// RUN: -march=rv64ia_zacas1p0 -E -dM %s \
// RUN: -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s
// CHECK-ZACAS-EXT: __riscv_zacas 1000000{{$}}

// RUN: %clang --target=riscv32 -menable-experimental-extensions \
// RUN: -march=rv32i_zalasr0p1 -E -dM %s \
// RUN: -o - | FileCheck --check-prefix=CHECK-ZALASR-EXT %s
95 changes: 82 additions & 13 deletions clang/unittests/AST/ASTImporterTest.cpp
@@ -9681,37 +9681,106 @@ AST_MATCHER_P(EnumDecl, hasEnumConstName, StringRef, ConstName) {
return false;
}

TEST_P(ASTImporterOptionSpecificTestBase, ImportAnonymousEnum) {
TEST_P(ASTImporterOptionSpecificTestBase, ImportAnonymousEnums) {
const char *Code =
R"(
struct A {
enum { E1, E2 } x;
enum { E3, E4 } y;
};
)";
Decl *FromTU = getTuDecl(Code, Lang_CXX11);
auto *FromEnumE1 = FirstDeclMatcher<EnumDecl>().match(
FromTU, enumDecl(hasEnumConstName("E1")));
auto *ImportedEnumE1 = Import(FromEnumE1, Lang_CXX11);
EXPECT_TRUE(ImportedEnumE1);
auto *FromEnumE3 = FirstDeclMatcher<EnumDecl>().match(
FromTU, enumDecl(hasEnumConstName("E3")));
auto *ImportedEnumE3 = Import(FromEnumE3, Lang_CXX11);
EXPECT_TRUE(ImportedEnumE3);
EXPECT_NE(ImportedEnumE1, ImportedEnumE3);
}

TEST_P(ASTImporterOptionSpecificTestBase, ImportFreeStandingAnonymousEnums) {
const char *Code =
R"(
struct A {
enum { E1, E2 };
enum { E3, E4 };
};
)";
Decl *FromTU = getTuDecl(Code, Lang_CXX11);
auto *FromEnumE1 = FirstDeclMatcher<EnumDecl>().match(
FromTU, enumDecl(hasEnumConstName("E1")));
auto *ImportedEnumE1 = Import(FromEnumE1, Lang_CXX11);
EXPECT_TRUE(ImportedEnumE1);
auto *FromEnumE3 = FirstDeclMatcher<EnumDecl>().match(
FromTU, enumDecl(hasEnumConstName("E3")));
auto *ImportedEnumE3 = Import(FromEnumE3, Lang_CXX11);
EXPECT_TRUE(ImportedEnumE3);
EXPECT_NE(ImportedEnumE1, ImportedEnumE3);
}

TEST_P(ASTImporterOptionSpecificTestBase, ImportExistingAnonymousEnums) {
const char *ToCode =
R"(
struct A {
enum { E1, E2} x;
enum { E3, E4} y;
enum { E1, E2 } x;
enum { E3, E4 } y;
};
)";
Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11);
auto *ToE1 = FirstDeclMatcher<EnumDecl>().match(
auto *ToEnumE1 = FirstDeclMatcher<EnumDecl>().match(
ToTU, enumDecl(hasEnumConstName("E1")));
auto *ToE3 = FirstDeclMatcher<EnumDecl>().match(
auto *ToEnumE3 = FirstDeclMatcher<EnumDecl>().match(
ToTU, enumDecl(hasEnumConstName("E3")));
const char *Code =
R"(
struct A {
enum { E1, E2} x;
enum { E3, E4} y;
enum { E1, E2 } x;
enum { E3, E4 } y;
};
)";
Decl *FromTU = getTuDecl(Code, Lang_CXX11);
auto *FromE1 = FirstDeclMatcher<EnumDecl>().match(
auto *FromEnumE1 = FirstDeclMatcher<EnumDecl>().match(
FromTU, enumDecl(hasEnumConstName("E1")));
auto *ImportedEnumE1 = Import(FromEnumE1, Lang_CXX11);
ASSERT_TRUE(ImportedEnumE1);
EXPECT_EQ(ImportedEnumE1, ToEnumE1);
auto *FromEnumE3 = FirstDeclMatcher<EnumDecl>().match(
FromTU, enumDecl(hasEnumConstName("E3")));
auto *ImportedEnumE3 = Import(FromEnumE3, Lang_CXX11);
ASSERT_TRUE(ImportedEnumE3);
EXPECT_EQ(ImportedEnumE3, ToEnumE3);
}

TEST_P(ASTImporterOptionSpecificTestBase, ImportExistingEmptyAnonymousEnums) {
const char *ToCode =
R"(
struct A {
enum {};
};
)";
Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11);
auto *ToE1 = FirstDeclMatcher<EnumDecl>().match(ToTU, enumDecl());
const char *Code =
R"(
struct A {
enum {};
enum {};
};
)";
Decl *FromTU = getTuDecl(Code, Lang_CXX11);
auto *FromE1 = FirstDeclMatcher<EnumDecl>().match(FromTU, enumDecl());
auto *ImportedE1 = Import(FromE1, Lang_CXX11);
ASSERT_TRUE(ImportedE1);
EXPECT_EQ(ImportedE1, ToE1);
auto *FromE3 = FirstDeclMatcher<EnumDecl>().match(
FromTU, enumDecl(hasEnumConstName("E3")));
auto *ImportedE3 = Import(FromE3, Lang_CXX11);
ASSERT_TRUE(ImportedE3);
EXPECT_EQ(ImportedE3, ToE3);
auto *FromE2 = LastDeclMatcher<EnumDecl>().match(FromTU, enumDecl());
ASSERT_NE(FromE1, FromE2);
auto *ImportedE2 = Import(FromE2, Lang_CXX11);
ASSERT_TRUE(ImportedE2);
// FIXME: These should not be equal, or the import should fail.
EXPECT_EQ(ImportedE2, ToE1);
}

INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ASTImporterLookupTableTest,
12 changes: 6 additions & 6 deletions compiler-rt/test/asan/TestCases/Posix/ignore_free_hook.cpp
@@ -1,15 +1,15 @@
// RUN: %clangxx_asan -O2 %s -o %t -DTEST=basic_hook_works && not %run %t \
// RUN: |& FileCheck %s -check-prefix=CHECK-BASIC
// RUN: 2>&1 | FileCheck %s -check-prefix=CHECK-BASIC
// RUN: %clangxx_asan -O2 %s -o %t -DTEST=ignore && %run %t \
// RUN: |& FileCheck %s -check-prefix=CHECK-IGNORE
// RUN: 2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE
// RUN: %clangxx_asan -O2 %s -o %t -DTEST=ignore_twice && not %run %t \
// RUN: |& FileCheck %s -check-prefix=CHECK-IGNORE-2
// RUN: 2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-2
// RUN: %clangxx_asan -O2 %s -o %t -DTEST=mismatch && %env_asan_opts=alloc_dealloc_mismatch=1 not %run %t \
// RUN: |& FileCheck %s -check-prefix=CHECK-MISMATCH
// RUN: 2>&1 | FileCheck %s -check-prefix=CHECK-MISMATCH
// RUN: %clangxx_asan -O2 %s -o %t -DTEST=ignore_mismatch && %env_asan_opts=alloc_dealloc_mismatch=1 %run %t \
// RUN: |& FileCheck %s -check-prefix=CHECK-IGNORE-MISMATCH
// RUN: 2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-MISMATCH
// RUN: %clangxx_asan -O2 %s -o %t -DTEST=double_delete && not %run %t \
// RUN: |& FileCheck %s -check-prefix=CHECK-DOUBLE-DELETE
// RUN: 2>&1 | FileCheck %s -check-prefix=CHECK-DOUBLE-DELETE

#include <stdio.h>
#include <stdlib.h>
8 changes: 4 additions & 4 deletions compiler-rt/test/hwasan/TestCases/Posix/ignore_free_hook.cpp
@@ -1,11 +1,11 @@
// RUN: %clangxx_hwasan -O2 %s -o %t -DTEST=basic_hook_works && not %run %t \
// RUN: |& FileCheck %s -check-prefix=CHECK-BASIC
// RUN: 2>&1 | FileCheck %s -check-prefix=CHECK-BASIC
// RUN: %clangxx_hwasan -O2 %s -o %t -DTEST=ignore && %run %t \
// RUN: |& FileCheck %s -check-prefix=CHECK-IGNORE
// RUN: 2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE
// RUN: %clangxx_hwasan -O2 %s -o %t -DTEST=ignore_twice && not %run %t \
// RUN: |& FileCheck %s -check-prefix=CHECK-IGNORE-2
// RUN: 2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-2
// RUN: %clangxx_hwasan -O2 %s -o %t -DTEST=double_delete && not %run %t \
// RUN: |& FileCheck %s -check-prefix=CHECK-DOUBLE-DELETE
// RUN: 2>&1 | FileCheck %s -check-prefix=CHECK-DOUBLE-DELETE

#include <sanitizer/hwasan_interface.h>
#include <stdio.h>
9 changes: 3 additions & 6 deletions llvm/docs/RISCVUsage.rst
@@ -153,7 +153,6 @@ on support follow.
``Za64rs`` Supported (`See note <#riscv-profiles-extensions-note>`__)
``Zaamo`` Assembly Support
``Zabha`` Supported
``Zacas`` Supported (`See note <#riscv-zacas-note>`__)
``Zalrsc`` Assembly Support
``Zama16b`` Supported (`See note <#riscv-profiles-extensions-note>`__)
``Zawrs`` Assembly Support
@@ -281,11 +280,6 @@ Supported
``Za128rs``, ``Za64rs``, ``Zama16b``, ``Zic64b``, ``Ziccamoa``, ``Ziccif``, ``Zicclsm``, ``Ziccrse``, ``Shcounterenvw``, ``Shgatpa``, ``Shtvala``, ``Shvsatpa``, ``Shvstvala``, ``Shvstvecd``, ``Ssccptr``, ``Sscounterenw``, ``Ssstateen``, ``Ssstrict``, ``Sstvala``, ``Sstvecd``, ``Ssu64xl``, ``Svade``, ``Svbare``
These extensions are defined as part of the `RISC-V Profiles specification <https://github.com/riscv/riscv-profiles/releases/tag/v1.0>`__. They do not introduce any new features themselves, but instead describe existing hardware features.

.. _riscv-zacas-note:

``Zacas``
amocas.w will be used for i32 cmpxchg. amocas.d will be used for i64 cmpxchg on RV64. The compiler will not generate amocas.d on RV32 or amocas.q on RV64 due to ABI compatibility. These can only be used in the assembler.

Experimental Extensions
=======================

@@ -299,6 +293,9 @@ The primary goal of experimental support is to assist in the process of ratifica
``experimental-ssqosid``
LLVM implements assembler support for the `v1.0-rc1 draft specification <https://github.com/riscv/riscv-ssqosid/releases/tag/v1.0-rc1>`_.

``experimental-zacas``
LLVM implements the `1.0 release specification <https://github.com/riscvarchive/riscv-zacas/releases/tag/v1.0>`__. amocas.w will be used for i32 cmpxchg. amocas.d will be used for i64 cmpxchg on RV64. The compiler will not generate amocas.d on RV32 or amocas.q on RV64 due to ABI compatibility. These can only be used in the assembler. The extension will be left as experimental until `an ABI issue <https://github.com/riscv-non-isa/riscv-elf-psabi-doc/issues/444>`__ is resolved.
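A minimal illustration of the i32 case described above (a sketch, not part of this patch; the function name and build flags are assumptions based on the RUN lines earlier in this diff):

  // Hypothetical 32-bit CAS of the kind the Zacas note above covers.
  // Built roughly as: clang -menable-experimental-extensions -march=rv64ia_zacas1p0 -O2
  // this i32 compare-exchange is the case expected to lower to amocas.w;
  // amocas.q would still not be generated for wider CAS on RV64.
  #include <atomic>
  #include <cstdint>

  bool try_claim(std::atomic<int32_t> &slot, int32_t expected, int32_t desired) {
    return slot.compare_exchange_strong(expected, desired,
                                        std::memory_order_acq_rel,
                                        std::memory_order_acquire);
  }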

``experimental-zalasr``
LLVM implements the `0.0.5 draft specification <https://github.com/mehnadnerd/riscv-zalasr>`__.

8 changes: 7 additions & 1 deletion llvm/docs/ReleaseNotes.rst
@@ -166,6 +166,13 @@ Changes to the LoongArch Backend
* i32 is now a native type in the datalayout string. This enables
LoopStrengthReduce for loops with i32 induction variables, among other
optimizations.
* Codegen support is added for TLS descriptors.
* Interleaved vectorization and vector shuffle are supported on LoongArch and
the experimental feature ``auto-vec`` is removed.
* Allow ``f16`` codegen with expansion to libcalls.
* Clarify that emulated TLS is not supported.
* A codegen issue for ``bstrins.w`` is fixed on loongarch32.
* Assorted codegen improvements.

Changes to the MIPS Backend
---------------------------
@@ -185,7 +192,6 @@ Changes to the RISC-V Backend
* Codegen support was added for the Zimop (May-Be-Operations) extension.
* The experimental Ssnpm, Smnpm, Smmpm, Sspm, and Supm 1.0.0 Pointer Masking extensions are supported.
* The experimental Ssqosid extension is supported.
* Zacas is no longer experimental.
* Added the CSR names from the Resumable Non-Maskable Interrupts (Smrnmi) extension.
* llvm-objdump now prints disassembled opcode bytes in groups of 2 or 4 bytes to
match GNU objdump. The bytes within the groups are in big endian order.
7 changes: 0 additions & 7 deletions llvm/include/llvm/MC/MCAssembler.h
@@ -65,7 +65,6 @@ class MCAssembler {

bool HasLayout = false;
bool RelaxAll = false;
bool SubsectionsViaSymbols = false;

SectionListType Sections;

@@ -144,7 +143,6 @@ class MCAssembler {
std::unique_ptr<MCObjectWriter> Writer);
MCAssembler(const MCAssembler &) = delete;
MCAssembler &operator=(const MCAssembler &) = delete;
~MCAssembler();

/// Compute the effective fragment size.
uint64_t computeFragmentSize(const MCFragment &F) const;
@@ -194,7 +192,6 @@ class MCAssembler {
MCObjectWriter &getWriter() const { return *Writer; }

MCDwarfLineTableParams getDWARFLinetableParams() const { return LTParams; }
void setDWARFLinetableParams(MCDwarfLineTableParams P) { LTParams = P; }

/// Finish - Do final processing and write the object to the output stream.
/// \p Writer is used for custom object writer (as the MCJIT does),
@@ -204,10 +201,6 @@
// Layout all section and prepare them for emission.
void layout();

// FIXME: This does not belong here.
bool getSubsectionsViaSymbols() const { return SubsectionsViaSymbols; }
void setSubsectionsViaSymbols(bool Value) { SubsectionsViaSymbols = Value; }

bool hasLayout() const { return HasLayout; }
bool getRelaxAll() const { return RelaxAll; }
void setRelaxAll(bool Value) { RelaxAll = Value; }
34 changes: 12 additions & 22 deletions llvm/include/llvm/MC/MCELFObjectWriter.h
@@ -159,8 +159,12 @@ class ELFObjectWriter : public MCObjectWriter {

public:
std::unique_ptr<MCELFObjectTargetWriter> TargetObjectWriter;
raw_pwrite_stream &OS;
raw_pwrite_stream *DwoOS = nullptr;

DenseMap<const MCSectionELF *, std::vector<ELFRelocationEntry>> Relocations;
DenseMap<const MCSymbolELF *, const MCSymbolELF *> Renames;
bool IsLittleEndian = false;
bool SeenGnuAbi = false;
std::optional<uint8_t> OverrideABIVersion;

@@ -173,8 +177,11 @@
};
SmallVector<Symver, 0> Symvers;

ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW)
: TargetObjectWriter(std::move(MOTW)) {}
ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
raw_pwrite_stream &OS, bool IsLittleEndian);
ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS,
bool IsLittleEndian);

void reset() override;
void executePostLayoutBinding(MCAssembler &Asm) override;
@@ -185,6 +192,7 @@
const MCSymbol &SymA,
const MCFragment &FB, bool InSet,
bool IsPCRel) const override;
uint64_t writeObject(MCAssembler &Asm) override;

bool hasRelocationAddend() const;
bool usesRela(const MCTargetOptions *TO, const MCSectionELF &Sec) const;
@@ -193,11 +201,8 @@
const MCSymbolELF *Sym, uint64_t C,
unsigned Type) const;

virtual bool checkRelocation(MCContext &Ctx, SMLoc Loc,
const MCSectionELF *From,
const MCSectionELF *To) {
return true;
}
bool checkRelocation(MCContext &Ctx, SMLoc Loc, const MCSectionELF *From,
const MCSectionELF *To);

unsigned getELFHeaderEFlags() const { return ELFHeaderEFlags; }
void setELFHeaderEFlags(unsigned Flags) { ELFHeaderEFlags = Flags; }
@@ -209,21 +214,6 @@
// Override the default e_ident[EI_ABIVERSION] in the ELF header.
void setOverrideABIVersion(uint8_t V) { OverrideABIVersion = V; }
};

/// Construct a new ELF writer instance.
///
/// \param MOTW - The target specific ELF writer subclass.
/// \param OS - The stream to write to.
/// \returns The constructed object writer.
std::unique_ptr<MCObjectWriter>
createELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
raw_pwrite_stream &OS, bool IsLittleEndian);

std::unique_ptr<MCObjectWriter>
createELFDwoObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS,
bool IsLittleEndian);

} // end namespace llvm

#endif // LLVM_MC_MCELFOBJECTWRITER_H
5 changes: 5 additions & 0 deletions llvm/include/llvm/MC/MCObjectWriter.h
@@ -38,6 +38,7 @@ class MCObjectWriter {
std::string CompilerVersion;
std::vector<const MCSymbol *> AddrsigSyms;
bool EmitAddrsigSection = false;
bool SubsectionsViaSymbols = false;

struct CGProfileEntry {
const MCSymbolRefExpr *From;
@@ -114,6 +115,10 @@
std::vector<const MCSymbol *> &getAddrsigSyms() { return AddrsigSyms; }
SmallVector<CGProfileEntry, 0> &getCGProfile() { return CGProfile; }

// Mach-O specific: Whether .subsections_via_symbols is enabled.
bool getSubsectionsViaSymbols() const { return SubsectionsViaSymbols; }
void setSubsectionsViaSymbols(bool Value) { SubsectionsViaSymbols = Value; }

/// Write the object file and returns the number of bytes written.
///
/// This routine is called by the assembler after layout and relaxation is
2 changes: 2 additions & 0 deletions llvm/include/llvm/TargetParser/LoongArchTargetParser.def
@@ -10,6 +10,7 @@ LOONGARCH_FEATURE("+lasx", FK_LASX)
LOONGARCH_FEATURE("+lbt", FK_LBT)
LOONGARCH_FEATURE("+lvz", FK_LVZ)
LOONGARCH_FEATURE("+ual", FK_UAL)
LOONGARCH_FEATURE("+frecipe", FK_FRECIPE)

#undef LOONGARCH_FEATURE

@@ -19,5 +20,6 @@ LOONGARCH_FEATURE("+ual", FK_UAL)

LOONGARCH_ARCH("loongarch64", AK_LOONGARCH64, FK_64BIT | FK_FP32 | FK_FP64 | FK_UAL)
LOONGARCH_ARCH("la464", AK_LA464, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL)
LOONGARCH_ARCH("la664", AK_LA664, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL | FK_FRECIPE)

#undef LOONGARCH_ARCH
3 changes: 3 additions & 0 deletions llvm/include/llvm/TargetParser/LoongArchTargetParser.h
@@ -46,6 +46,9 @@ enum FeatureKind : uint32_t {

// Allow memory accesses to be unaligned.
FK_UAL = 1 << 8,

// Floating-point approximate reciprocal instructions are available.
FK_FRECIPE = 1 << 9,
};

struct FeatureInfo {
100 changes: 35 additions & 65 deletions llvm/lib/MC/ELFObjectWriter.cpp
@@ -198,58 +198,6 @@ struct ELFWriter {
void writeSection(uint32_t GroupSymbolIndex, uint64_t Offset, uint64_t Size,
const MCSectionELF &Section);
};

class ELFSingleObjectWriter : public ELFObjectWriter {
raw_pwrite_stream &OS;
bool IsLittleEndian;

public:
ELFSingleObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
raw_pwrite_stream &OS, bool IsLittleEndian)
: ELFObjectWriter(std::move(MOTW)), OS(OS),
IsLittleEndian(IsLittleEndian) {}

uint64_t writeObject(MCAssembler &Asm) override {
return ELFWriter(*this, OS, IsLittleEndian, ELFWriter::AllSections)
.writeObject(Asm);
}

friend struct ELFWriter;
};

class ELFDwoObjectWriter : public ELFObjectWriter {
raw_pwrite_stream &OS, &DwoOS;
bool IsLittleEndian;

public:
ELFDwoObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS,
bool IsLittleEndian)
: ELFObjectWriter(std::move(MOTW)), OS(OS), DwoOS(DwoOS),
IsLittleEndian(IsLittleEndian) {}

bool checkRelocation(MCContext &Ctx, SMLoc Loc, const MCSectionELF *From,
const MCSectionELF *To) override {
if (isDwoSection(*From)) {
Ctx.reportError(Loc, "A dwo section may not contain relocations");
return false;
}
if (To && isDwoSection(*To)) {
Ctx.reportError(Loc, "A relocation may not refer to a dwo section");
return false;
}
return true;
}

uint64_t writeObject(MCAssembler &Asm) override {
uint64_t Size = ELFWriter(*this, OS, IsLittleEndian, ELFWriter::NonDwoOnly)
.writeObject(Asm);
Size += ELFWriter(*this, DwoOS, IsLittleEndian, ELFWriter::DwoOnly)
.writeObject(Asm);
return Size;
}
};

} // end anonymous namespace

uint64_t ELFWriter::align(Align Alignment) {
@@ -1156,6 +1104,16 @@ uint64_t ELFWriter::writeObject(MCAssembler &Asm) {
return W.OS.tell() - StartOffset;
}

ELFObjectWriter::ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
raw_pwrite_stream &OS, bool IsLittleEndian)
: TargetObjectWriter(std::move(MOTW)), OS(OS),
IsLittleEndian(IsLittleEndian) {}
ELFObjectWriter::ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
raw_pwrite_stream &OS,
raw_pwrite_stream &DwoOS, bool IsLittleEndian)
: TargetObjectWriter(std::move(MOTW)), OS(OS), DwoOS(&DwoOS),
IsLittleEndian(IsLittleEndian) {}

void ELFObjectWriter::reset() {
ELFHeaderEFlags = 0;
SeenGnuAbi = false;
@@ -1357,6 +1315,22 @@ bool ELFObjectWriter::shouldRelocateWithSymbol(const MCAssembler &Asm,
return false;
}

bool ELFObjectWriter::checkRelocation(MCContext &Ctx, SMLoc Loc,
const MCSectionELF *From,
const MCSectionELF *To) {
if (DwoOS) {
if (isDwoSection(*From)) {
Ctx.reportError(Loc, "A dwo section may not contain relocations");
return false;
}
if (To && isDwoSection(*To)) {
Ctx.reportError(Loc, "A relocation may not refer to a dwo section");
return false;
}
}
return true;
}

void ELFObjectWriter::recordRelocation(MCAssembler &Asm,
const MCFragment *Fragment,
const MCFixup &Fixup, MCValue Target,
@@ -1473,17 +1447,13 @@ bool ELFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
return &SymA.getSection() == FB.getParent();
}

std::unique_ptr<MCObjectWriter>
llvm::createELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
raw_pwrite_stream &OS, bool IsLittleEndian) {
return std::make_unique<ELFSingleObjectWriter>(std::move(MOTW), OS,
IsLittleEndian);
}

std::unique_ptr<MCObjectWriter>
llvm::createELFDwoObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS,
bool IsLittleEndian) {
return std::make_unique<ELFDwoObjectWriter>(std::move(MOTW), OS, DwoOS,
IsLittleEndian);
uint64_t ELFObjectWriter::writeObject(MCAssembler &Asm) {
uint64_t Size =
ELFWriter(*this, OS, IsLittleEndian,
DwoOS ? ELFWriter::NonDwoOnly : ELFWriter::AllSections)
.writeObject(Asm);
if (DwoOS)
Size += ELFWriter(*this, *DwoOS, IsLittleEndian, ELFWriter::DwoOnly)
.writeObject(Asm);
return Size;
}
11 changes: 6 additions & 5 deletions llvm/lib/MC/MCAsmBackend.cpp
@@ -32,16 +32,17 @@ MCAsmBackend::~MCAsmBackend() = default;
std::unique_ptr<MCObjectWriter>
MCAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
auto TW = createObjectTargetWriter();
bool IsLE = Endian == llvm::endianness::little;
switch (TW->getFormat()) {
case Triple::ELF:
return createELFObjectWriter(cast<MCELFObjectTargetWriter>(std::move(TW)),
OS, Endian == llvm::endianness::little);
case Triple::MachO:
return createMachObjectWriter(cast<MCMachObjectTargetWriter>(std::move(TW)),
OS, Endian == llvm::endianness::little);
OS, IsLE);
case Triple::COFF:
return createWinCOFFObjectWriter(
cast<MCWinCOFFObjectTargetWriter>(std::move(TW)), OS);
case Triple::ELF:
return std::make_unique<ELFObjectWriter>(
cast<MCELFObjectTargetWriter>(std::move(TW)), OS, IsLE);
case Triple::SPIRV:
return createSPIRVObjectWriter(
cast<MCSPIRVObjectTargetWriter>(std::move(TW)), OS);
@@ -71,7 +72,7 @@ MCAsmBackend::createDwoObjectWriter(raw_pwrite_stream &OS,
return createWinCOFFDwoObjectWriter(
cast<MCWinCOFFObjectTargetWriter>(std::move(TW)), OS, DwoOS);
case Triple::ELF:
return createELFDwoObjectWriter(
return std::make_unique<ELFObjectWriter>(
cast<MCELFObjectTargetWriter>(std::move(TW)), OS, DwoOS,
Endian == llvm::endianness::little);
case Triple::Wasm:
5 changes: 1 addition & 4 deletions llvm/lib/MC/MCAssembler.cpp
@@ -86,11 +86,8 @@ MCAssembler::MCAssembler(MCContext &Context,
: Context(Context), Backend(std::move(Backend)),
Emitter(std::move(Emitter)), Writer(std::move(Writer)) {}

MCAssembler::~MCAssembler() = default;

void MCAssembler::reset() {
RelaxAll = false;
SubsectionsViaSymbols = false;
Sections.clear();
Symbols.clear();
ThumbFuncs.clear();
@@ -1095,7 +1092,7 @@ bool MCAssembler::relaxLEB(MCLEBFragment &LF) {
// Use evaluateKnownAbsolute for Mach-O as a hack: .subsections_via_symbols
// requires that .uleb128 A-B is foldable where A and B reside in different
// fragments. This is used by __gcc_except_table.
bool Abs = getSubsectionsViaSymbols()
bool Abs = getWriter().getSubsectionsViaSymbols()
? LF.getValue().evaluateKnownAbsolute(Value, *this)
: LF.getValue().evaluateAsAbsolute(Value, *this);
if (!Abs) {
12 changes: 0 additions & 12 deletions llvm/lib/MC/MCELFStreamer.cpp
@@ -88,18 +88,6 @@ void MCELFStreamer::emitLabelAtPos(MCSymbol *S, SMLoc Loc, MCDataFragment &F,
void MCELFStreamer::emitAssemblerFlag(MCAssemblerFlag Flag) {
// Let the target do whatever target specific stuff it needs to do.
getAssembler().getBackend().handleAssemblerFlag(Flag);
// Do any generic stuff we need to do.
switch (Flag) {
case MCAF_SyntaxUnified: return; // no-op here.
case MCAF_Code16: return; // Change parsing mode; no-op here.
case MCAF_Code32: return; // Change parsing mode; no-op here.
case MCAF_Code64: return; // Change parsing mode; no-op here.
case MCAF_SubsectionsViaSymbols:
getAssembler().setSubsectionsViaSymbols(true);
return;
}

llvm_unreachable("invalid assembler flag!");
}

// If bundle alignment is used and there are any instructions in the section, it
2 changes: 1 addition & 1 deletion llvm/lib/MC/MCMachOStreamer.cpp
@@ -220,7 +220,7 @@ void MCMachOStreamer::emitAssemblerFlag(MCAssemblerFlag Flag) {
case MCAF_Code32: return; // Change parsing mode; no-op here.
case MCAF_Code64: return; // Change parsing mode; no-op here.
case MCAF_SubsectionsViaSymbols:
getAssembler().setSubsectionsViaSymbols(true);
getWriter().setSubsectionsViaSymbols(true);
return;
}
}
2 changes: 2 additions & 0 deletions llvm/lib/MC/MCObjectWriter.cpp
@@ -22,6 +22,8 @@ MCObjectWriter::~MCObjectWriter() = default;
void MCObjectWriter::reset() {
FileNames.clear();
AddrsigSyms.clear();
EmitAddrsigSection = false;
SubsectionsViaSymbols = false;
CGProfile.clear();
}

4 changes: 2 additions & 2 deletions llvm/lib/MC/MachObjectWriter.cpp
@@ -744,7 +744,7 @@ bool MachObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
if (!hasReliableSymbolDifference) {
if (!SA.isInSection() || &SecA != &SecB ||
(!SA.isTemporary() && FB.getAtom() != SA.getFragment()->getAtom() &&
Asm.getSubsectionsViaSymbols()))
SubsectionsViaSymbols))
return false;
return true;
}
@@ -894,7 +894,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {

// Write the prolog, starting with the header and load command...
writeHeader(MachO::MH_OBJECT, NumLoadCommands, LoadCommandsSize,
Asm.getSubsectionsViaSymbols());
SubsectionsViaSymbols);
uint32_t Prot =
MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE;
writeSegmentLoadCommand("", NumSections, 0, VMSize, SectionDataStart,
18 changes: 14 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -947,6 +947,12 @@ def FeatureHasRestrictedSOffset : SubtargetFeature<"restricted-soffset",
"Has restricted SOffset (immediate not supported)."
>;

def FeatureRequiredExportPriority : SubtargetFeature<"required-export-priority",
"HasRequiredExportPriority",
"true",
"Export priority must be explicitly manipulated on GFX11.5"
>;

//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -1567,7 +1573,8 @@ def FeatureISAVersion11_Generic: FeatureSet<
FeatureUserSGPRInit16Bug,
FeatureMADIntraFwdBug,
FeaturePrivEnabledTrap2NopBug,
FeatureRequiresCOV6])>;
FeatureRequiresCOV6,
FeatureRequiredExportPriority])>;

def FeatureISAVersion11_0_Common : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
@@ -1597,20 +1604,23 @@ def FeatureISAVersion11_5_0 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureSALUFloatInsts,
FeatureDPPSrc1SGPR,
FeatureVGPRSingleUseHintInsts])>;
FeatureVGPRSingleUseHintInsts,
FeatureRequiredExportPriority])>;

def FeatureISAVersion11_5_1 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureSALUFloatInsts,
FeatureDPPSrc1SGPR,
FeatureVGPRSingleUseHintInsts,
Feature1_5xVGPRs])>;
Feature1_5xVGPRs,
FeatureRequiredExportPriority])>;

def FeatureISAVersion11_5_2 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureSALUFloatInsts,
FeatureDPPSrc1SGPR,
FeatureVGPRSingleUseHintInsts])>;
FeatureVGPRSingleUseHintInsts,
FeatureRequiredExportPriority])>;

def FeatureISAVersion12 : FeatureSet<
[FeatureGFX12,
112 changes: 112 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -14,6 +14,7 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/TargetParser/TargetParser.h"
@@ -1104,6 +1105,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixWMMAHazards(MI);
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
fixRequiredExportPriority(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
@@ -2895,3 +2897,113 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {

return true;
}

static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
const SIInstrInfo &TII) {
MachineBasicBlock &EntryMBB = MF->front();
if (EntryMBB.begin() != EntryMBB.end()) {
auto &EntryMI = *EntryMBB.begin();
if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
EntryMI.getOperand(0).getImm() >= Priority)
return false;
}

BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
.addImm(Priority);
return true;
}

bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
if (!ST.hasRequiredExportPriority())
return false;

// Assume the following shader types will never have exports,
// and avoid adding or adjusting S_SETPRIO.
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MBB->getParent();
auto CC = MF->getFunction().getCallingConv();
switch (CC) {
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_CS_Chain:
case CallingConv::AMDGPU_CS_ChainPreserve:
case CallingConv::AMDGPU_KERNEL:
return false;
default:
break;
}

const int MaxPriority = 3;
const int NormalPriority = 2;
const int PostExportPriority = 0;

auto It = MI->getIterator();
switch (MI->getOpcode()) {
case AMDGPU::S_ENDPGM:
case AMDGPU::S_ENDPGM_SAVED:
case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
case AMDGPU::SI_RETURN_TO_EPILOG:
// Ensure shader with calls raises priority at entry.
// This ensures correct priority if exports exist in callee.
if (MF->getFrameInfo().hasCalls())
return ensureEntrySetPrio(MF, NormalPriority, TII);
return false;
case AMDGPU::S_SETPRIO: {
// Raise minimum priority unless in workaround.
auto &PrioOp = MI->getOperand(0);
int Prio = PrioOp.getImm();
bool InWA = (Prio == PostExportPriority) &&
(It != MBB->begin() && TII.isEXP(*std::prev(It)));
if (InWA || Prio >= NormalPriority)
return false;
PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
return true;
}
default:
if (!TII.isEXP(*MI))
return false;
break;
}

// Check entry priority at each export (as there will only be a few).
// Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
bool Changed = false;
if (CC != CallingConv::AMDGPU_Gfx)
Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

auto NextMI = std::next(It);
bool EndOfShader = false;
if (NextMI != MBB->end()) {
// Only need WA at end of sequence of exports.
if (TII.isEXP(*NextMI))
return Changed;
// Assume appropriate S_SETPRIO after export means WA already applied.
if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
NextMI->getOperand(0).getImm() == PostExportPriority)
return Changed;
EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
}

const DebugLoc &DL = MI->getDebugLoc();

// Lower priority.
BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
.addImm(PostExportPriority);

if (!EndOfShader) {
// Wait for exports to complete.
BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
.addReg(AMDGPU::SGPR_NULL)
.addImm(0);
}

BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);

if (!EndOfShader) {
// Return to normal (higher) priority.
BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
.addImm(NormalPriority);
}

return true;
}
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -107,6 +107,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
bool fixWMMAHazards(MachineInstr *MI);
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
bool fixRequiredExportPriority(MachineInstr *MI);

int checkMAIHazards(MachineInstr *MI);
int checkMAIHazards908(MachineInstr *MI);
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -238,6 +238,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasVOPDInsts = false;
bool HasVALUTransUseHazard = false;
bool HasForceStoreSC0SC1 = false;
bool HasRequiredExportPriority = false;

bool RequiresCOV6 = false;

@@ -1282,6 +1283,8 @@

bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }

bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }

/// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
/// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
48 changes: 40 additions & 8 deletions llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -216,7 +216,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
CombineInfo &Paired, bool Modify = false);
static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
const CombineInfo &Paired);
static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
const CombineInfo &Paired);
const TargetRegisterClass *
@@ -353,6 +353,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX2:
case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX2:
@@ -363,6 +364,7 @@
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX3:
case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -373,6 +375,7 @@
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX4:
case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX4:
@@ -383,6 +386,7 @@
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
return 8;
case AMDGPU::DS_READ_B32:
case AMDGPU::DS_READ_B32_gfx9:
@@ -507,6 +511,10 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
return S_LOAD_IMM;
case AMDGPU::DS_READ_B32:
case AMDGPU::DS_READ_B32_gfx9:
@@ -591,6 +599,10 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
return AMDGPU::S_LOAD_DWORD_IMM;
case AMDGPU::GLOBAL_LOAD_DWORD:
case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -703,6 +715,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
Result.SBase = true;
return Result;
case AMDGPU::DS_READ_B32:
@@ -1212,8 +1228,14 @@ void SILoadStoreOptimizer::copyToDestRegs(

// Copy to the old destination registers.
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);

// The constrained sload instructions in the S_LOAD_IMM class have an
// early-clobber flag on the dst operand. Remove the flag before using the
// MOs in the copies.
Dest0->setIsEarlyClobber(false);
Dest1->setIsEarlyClobber(false);

BuildMI(*MBB, InsertBefore, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
@@ -1700,19 +1722,29 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
case 8:
return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
}
case S_LOAD_IMM:
case S_LOAD_IMM: {
// If XNACK is enabled, use the constrained opcodes when the first load is
// under-aligned.
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
bool NeedsConstrainedOpc =
STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
switch (Width) {
default:
return 0;
case 2:
return AMDGPU::S_LOAD_DWORDX2_IMM;
return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
: AMDGPU::S_LOAD_DWORDX2_IMM;
case 3:
return AMDGPU::S_LOAD_DWORDX3_IMM;
return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
: AMDGPU::S_LOAD_DWORDX3_IMM;
case 4:
return AMDGPU::S_LOAD_DWORDX4_IMM;
return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
: AMDGPU::S_LOAD_DWORDX4_IMM;
case 8:
return AMDGPU::S_LOAD_DWORDX8_IMM;
return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
: AMDGPU::S_LOAD_DWORDX8_IMM;
}
}
case GLOBAL_LOAD:
switch (Width) {
default:
21 changes: 17 additions & 4 deletions llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -161,12 +161,25 @@ class SM_Discard_Pseudo <string opName, OffsetMode offsets>
let has_soffset = offsets.HasSOffset;
}

multiclass SM_Load_Pseudos<string op, RegisterClass baseClass,
RegisterClass dstClass, OffsetMode offsets> {
defvar opName = !tolower(op);
def "" : SM_Load_Pseudo <opName, baseClass, dstClass, offsets>;

// The constrained multi-dword load equivalents, with an early-clobber flag
// on the dst operand. They are needed only for codegen, so no real opcodes
// are defined for them.
if !gt(dstClass.RegTypes[0].Size, 32) then
let Constraints = "@earlyclobber $sdst",
PseudoInstr = op # offsets.Variant in
def "" # _ec : SM_Load_Pseudo <opName, baseClass, dstClass, offsets>;
}

multiclass SM_Pseudo_Loads<RegisterClass baseClass,
RegisterClass dstClass> {
defvar opName = !tolower(NAME);
def _IMM : SM_Load_Pseudo <opName, baseClass, dstClass, IMM_Offset>;
def _SGPR : SM_Load_Pseudo <opName, baseClass, dstClass, SGPR_Offset>;
def _SGPR_IMM : SM_Load_Pseudo <opName, baseClass, dstClass, SGPR_IMM_Offset>;
defm _IMM : SM_Load_Pseudos <NAME, baseClass, dstClass, IMM_Offset>;
defm _SGPR : SM_Load_Pseudos <NAME, baseClass, dstClass, SGPR_Offset>;
defm _SGPR_IMM : SM_Load_Pseudos <NAME, baseClass, dstClass, SGPR_IMM_Offset>;
}

multiclass SM_Pseudo_Stores<RegisterClass baseClass,
12 changes: 7 additions & 5 deletions llvm/lib/Target/LoongArch/LoongArch.td
@@ -106,11 +106,6 @@ def FeatureRelax
: SubtargetFeature<"relax", "HasLinkerRelax", "true",
"Enable Linker relaxation">;

// Experimental auto vectorization
def FeatureAutoVec
: SubtargetFeature<"auto-vec", "HasExpAutoVec", "true",
"Experimental auto vectorization">;

// Floating point approximation operation
def FeatureFrecipe
: SubtargetFeature<"frecipe", "HasFrecipe", "true",
@@ -151,6 +146,13 @@ def : ProcessorModel<"la464", NoSchedModel, [Feature64Bit,
FeatureExtLVZ,
FeatureExtLBT]>;

def : ProcessorModel<"la664", NoSchedModel, [Feature64Bit,
FeatureUAL,
FeatureExtLASX,
FeatureExtLVZ,
FeatureExtLBT,
FeatureFrecipe]>;

//===----------------------------------------------------------------------===//
// Define the LoongArch target.
//===----------------------------------------------------------------------===//
2 changes: 0 additions & 2 deletions llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
@@ -26,8 +26,6 @@ TypeSize LoongArchTTIImpl::getRegisterBitWidth(
case TargetTransformInfo::RGK_Scalar:
return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
case TargetTransformInfo::RGK_FixedWidthVector:
if (!ST->hasExpAutoVec())
return DefSize;
if (ST->hasExtLASX())
return TypeSize::getFixed(256);
if (ST->hasExtLSX())
4 changes: 2 additions & 2 deletions llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -243,8 +243,8 @@ def HasStdExtZabha : Predicate<"Subtarget->hasStdExtZabha()">,
"'Zabha' (Byte and Halfword Atomic Memory Operations)">;

def FeatureStdExtZacas
: RISCVExtension<"zacas", 1, 0,
"'Zacas' (Atomic Compare-And-Swap Instructions)">,
: RISCVExperimentalExtension<"zacas", 1, 0,
"'Zacas' (Atomic Compare-And-Swap Instructions)">,
RISCVExtensionBitmask<0, 26>;
def HasStdExtZacas : Predicate<"Subtarget->hasStdExtZacas()">,
AssemblerPredicate<(all_of FeatureStdExtZacas),
5 changes: 4 additions & 1 deletion llvm/lib/TargetParser/Host.cpp
@@ -1562,6 +1562,8 @@ StringRef sys::getHostCPUName() {
switch (processor_id & 0xf000) {
case 0xc000: // Loongson 64bit, 4-issue
return "la464";
case 0xd000: // Loongson 64bit, 6-issue
return "la664";
// TODO: Others.
default:
break;
@@ -2067,7 +2069,8 @@ const StringMap<bool> sys::getHostCPUFeatures() {
Features["zvfhmin"] = ExtMask & (1ULL << 31); // RISCV_HWPROBE_EXT_ZVFHMIN
Features["zfa"] = ExtMask & (1ULL << 32); // RISCV_HWPROBE_EXT_ZFA
Features["ztso"] = ExtMask & (1ULL << 33); // RISCV_HWPROBE_EXT_ZTSO
Features["zacas"] = ExtMask & (1ULL << 34); // RISCV_HWPROBE_EXT_ZACAS
// TODO: Re-enable zacas when it is marked non-experimental again.
// Features["zacas"] = ExtMask & (1ULL << 34); // RISCV_HWPROBE_EXT_ZACAS
Features["zicond"] = ExtMask & (1ULL << 35); // RISCV_HWPROBE_EXT_ZICOND
Features["zihintpause"] =
ExtMask & (1ULL << 36); // RISCV_HWPROBE_EXT_ZIHINTPAUSE
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -658,17 +658,17 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
;
; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000
; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000
; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000
; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
; GFX1013-NEXT: v_mov_b32_e32 v0, s0
; GFX1013-NEXT: v_mov_b32_e32 v1, s1
; GFX1013-NEXT: v_mov_b32_e32 v2, s2
; GFX1013-NEXT: v_mov_b32_e32 v3, s3
; GFX1013-NEXT: v_mov_b32_e32 v0, s4
; GFX1013-NEXT: v_mov_b32_e32 v1, s5
; GFX1013-NEXT: v_mov_b32_e32 v2, s6
; GFX1013-NEXT: v_mov_b32_e32 v3, s7
; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6
; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
@@ -681,7 +681,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[8:11]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
@@ -769,14 +769,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
;
; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
; GFX1013-NEXT: v_mov_b32_e32 v0, s0
; GFX1013-NEXT: v_mov_b32_e32 v1, s1
; GFX1013-NEXT: v_mov_b32_e32 v2, s2
; GFX1013-NEXT: v_mov_b32_e32 v3, s3
; GFX1013-NEXT: v_mov_b32_e32 v0, s4
; GFX1013-NEXT: v_mov_b32_e32 v1, s5
; GFX1013-NEXT: v_mov_b32_e32 v2, s6
; GFX1013-NEXT: v_mov_b32_e32 v3, s7
; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6
; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
@@ -789,7 +789,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
28 changes: 14 additions & 14 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -19,13 +19,13 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
;
; GFX10-LABEL: dpp_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s6
; GFX10-NEXT: v_mov_b32_e32 v1, s7
; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: dpp_test:
@@ -176,16 +176,16 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
;
; GFX10-LABEL: update_dppv2i32_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppv2i32_test:
@@ -232,16 +232,16 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
;
; GFX10-LABEL: update_dppv2f32_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppv2f32_test:
152 changes: 76 additions & 76 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -692,148 +692,148 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
;
; GFX9-LABEL: sdivrem_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s8, s6, 31
; GFX9-NEXT: s_add_i32 s6, s6, s8
; GFX9-NEXT: s_xor_b32 s6, s6, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: s_ashr_i32 s9, s7, 31
; GFX9-NEXT: s_add_i32 s7, s7, s9
; GFX9-NEXT: s_xor_b32 s7, s7, s9
; GFX9-NEXT: s_ashr_i32 s0, s14, 31
; GFX9-NEXT: s_add_i32 s1, s14, s0
; GFX9-NEXT: s_xor_b32 s1, s1, s0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1
; GFX9-NEXT: s_ashr_i32 s2, s15, 31
; GFX9-NEXT: s_add_i32 s3, s15, s2
; GFX9-NEXT: s_xor_b32 s3, s3, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_sub_i32 s12, 0, s6
; GFX9-NEXT: s_ashr_i32 s10, s4, 31
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX9-NEXT: s_sub_i32 s6, 0, s1
; GFX9-NEXT: s_ashr_i32 s4, s12, 31
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: s_add_i32 s4, s4, s10
; GFX9-NEXT: s_xor_b32 s4, s4, s10
; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0
; GFX9-NEXT: s_sub_i32 s7, 0, s3
; GFX9-NEXT: s_ashr_i32 s5, s13, 31
; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: s_sub_i32 s12, 0, s7
; GFX9-NEXT: s_add_i32 s6, s12, s4
; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX9-NEXT: s_ashr_i32 s11, s5, 31
; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1
; GFX9-NEXT: s_add_i32 s5, s5, s11
; GFX9-NEXT: s_xor_b32 s6, s6, s4
; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1
; GFX9-NEXT: s_add_i32 s7, s13, s5
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3
; GFX9-NEXT: s_xor_b32 s5, s5, s11
; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6
; GFX9-NEXT: s_xor_b32 s7, s7, s5
; GFX9-NEXT: s_xor_b32 s0, s4, s0
; GFX9-NEXT: v_mul_lo_u32 v3, v0, s1
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1
; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3
; GFX9-NEXT: v_mul_hi_u32 v1, s7, v1
; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v3
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3
; GFX9-NEXT: v_subrev_u32_e32 v2, s1, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v2
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v2
; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s3
; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
; GFX9-NEXT: s_xor_b32 s4, s10, s8
; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0
; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3
; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX9-NEXT: v_subrev_u32_e32 v0, s0, v0
; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3
; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3
; GFX9-NEXT: s_xor_b32 s4, s11, s9
; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3
; GFX9-NEXT: s_xor_b32 s0, s5, s2
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1
; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2
; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1
; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3
; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1
; GFX9-NEXT: v_xor_b32_e32 v2, s4, v2
; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1
; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2
; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2
; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sdivrem_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_ashr_i32 s1, s10, 31
; GFX10-NEXT: s_ashr_i32 s2, s11, 31
; GFX10-NEXT: s_add_i32 s0, s10, s1
; GFX10-NEXT: s_add_i32 s3, s11, s2
; GFX10-NEXT: s_xor_b32 s10, s0, s1
; GFX10-NEXT: s_ashr_i32 s1, s14, 31
; GFX10-NEXT: s_ashr_i32 s2, s15, 31
; GFX10-NEXT: s_add_i32 s0, s14, s1
; GFX10-NEXT: s_add_i32 s3, s15, s2
; GFX10-NEXT: s_xor_b32 s4, s0, s1
; GFX10-NEXT: s_xor_b32 s3, s3, s2
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX10-NEXT: s_sub_i32 s0, 0, s10
; GFX10-NEXT: s_sub_i32 s11, 0, s3
; GFX10-NEXT: s_ashr_i32 s12, s9, 31
; GFX10-NEXT: s_sub_i32 s0, 0, s4
; GFX10-NEXT: s_sub_i32 s5, 0, s3
; GFX10-NEXT: s_ashr_i32 s6, s13, 31
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX10-NEXT: s_add_i32 s7, s13, s6
; GFX10-NEXT: s_xor_b32 s7, s7, s6
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0
; GFX10-NEXT: v_mul_lo_u32 v3, s11, v1
; GFX10-NEXT: s_ashr_i32 s11, s8, 31
; GFX10-NEXT: s_add_i32 s0, s8, s11
; GFX10-NEXT: s_add_i32 s8, s9, s12
; GFX10-NEXT: s_xor_b32 s0, s0, s11
; GFX10-NEXT: s_xor_b32 s8, s8, s12
; GFX10-NEXT: v_mul_lo_u32 v3, s5, v1
; GFX10-NEXT: s_ashr_i32 s5, s12, 31
; GFX10-NEXT: s_add_i32 s0, s12, s5
; GFX10-NEXT: s_xor_b32 s1, s5, s1
; GFX10-NEXT: s_xor_b32 s0, s0, s5
; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX10-NEXT: s_xor_b32 s1, s11, s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX10-NEXT: v_mul_hi_u32 v1, s8, v1
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10
; GFX10-NEXT: v_mul_hi_u32 v1, s7, v1
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s4
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s8, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s7, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX10-NEXT: s_xor_b32 s0, s12, s2
; GFX10-NEXT: s_xor_b32 s0, s6, s2
; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0
; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1
; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2
; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3
; GFX10-NEXT: v_xor_b32_e32 v2, s5, v2
; GFX10-NEXT: v_xor_b32_e32 v3, s6, v3
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v3
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s5, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v3
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX10-NEXT: s_endpgm
%div = sdiv <2 x i32> %x, %y
store <2 x i32> %div, ptr addrspace(1) %out0
84 changes: 42 additions & 42 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -576,12 +576,12 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
;
; GFX9-LABEL: udivrem_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11
; GFX9-NEXT: s_sub_i32 s0, 0, s10
; GFX9-NEXT: s_sub_i32 s1, 0, s11
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15
; GFX9-NEXT: s_sub_i32 s0, 0, s14
; GFX9-NEXT: s_sub_i32 s1, 0, s15
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -593,47 +593,47 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s12, v0
; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1
; GFX9-NEXT: v_mul_lo_u32 v2, v0, s10
; GFX9-NEXT: v_mul_hi_u32 v1, s13, v1
; GFX9-NEXT: v_mul_lo_u32 v2, v0, s14
; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s11
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s15
; GFX9-NEXT: v_add_u32_e32 v5, 1, v1
; GFX9-NEXT: v_sub_u32_e32 v2, s8, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2
; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3
; GFX9-NEXT: v_sub_u32_e32 v2, s12, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2
; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v2
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3
; GFX9-NEXT: v_subrev_u32_e32 v5, s15, v3
; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v2
; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2
; GFX9-NEXT: v_add_u32_e32 v5, 1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s11, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s15, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: udivrem_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s11
; GFX10-NEXT: s_sub_i32 s0, 0, s10
; GFX10-NEXT: s_sub_i32 s1, 0, s11
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15
; GFX10-NEXT: s_sub_i32 s0, 0, s14
; GFX10-NEXT: s_sub_i32 s1, 0, s15
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -646,34 +646,34 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0
; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s11
; GFX10-NEXT: v_mul_hi_u32 v0, s12, v0
; GFX10-NEXT: v_mul_hi_u32 v1, s13, v1
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s14
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s15
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s8, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s9, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s12, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s13, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v3
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s15, v3
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s15, v3
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[4:5]
; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[6:7]
; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[8:9]
; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[10:11]
; GFX10-NEXT: s_endpgm
%div = udiv <2 x i32> %x, %y
store <2 x i32> %div, ptr addrspace(1) %out0
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -245,21 +245,21 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x
;
; GFX9-LABEL: s_test_add_v2i16_kernarg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_pk_add_u16 v1, s2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_pk_add_u16 v1, s6, v1
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_add_v2i16_kernarg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v1, s2, s3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: v_pk_add_u16 v1, s6, s7
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_add_v2i16_kernarg:
408 changes: 204 additions & 204 deletions llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -316,14 +316,14 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
;
; GFX940-LABEL: build_v2i32_from_v4i16_shuffle:
; GFX940: ; %bb.0: ; %entry
; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_lshl_b32 s3, s3, 16
; GFX940-NEXT: s_lshl_b32 s2, s2, 16
; GFX940-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NEXT: v_mov_b32_e32 v1, s3
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_lshl_b32 s0, s7, 16
; GFX940-NEXT: s_lshl_b32 s1, s6, 16
; GFX940-NEXT: v_mov_b32_e32 v0, s1
; GFX940-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1
; GFX940-NEXT: s_endpgm
entry:
%shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
88 changes: 44 additions & 44 deletions llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -49,37 +49,37 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
;
; GFX10-LABEL: cluster_load_cluster_store:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s4, s0, 8
; GFX10-NEXT: s_addc_u32 s5, s1, 0
; GFX10-NEXT: s_add_u32 s6, s0, 16
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_addc_u32 s7, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: s_add_u32 s0, s0, 24
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: v_mov_b32_e32 v4, s6
; GFX10-NEXT: v_mov_b32_e32 v5, s7
; GFX10-NEXT: s_add_u32 s0, s4, 8
; GFX10-NEXT: s_addc_u32 s1, s5, 0
; GFX10-NEXT: s_add_u32 s2, s4, 16
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: s_addc_u32 s3, s5, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_add_u32 s0, s4, 24
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_addc_u32 s1, s5, 0
; GFX10-NEXT: v_mov_b32_e32 v5, s3
; GFX10-NEXT: v_mov_b32_e32 v4, s2
; GFX10-NEXT: v_mov_b32_e32 v7, s1
; GFX10-NEXT: v_mov_b32_e32 v6, s0
; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: flat_load_dword v8, v[0:1]
; GFX10-NEXT: flat_load_dword v9, v[2:3]
; GFX10-NEXT: flat_load_dword v10, v[4:5]
; GFX10-NEXT: flat_load_dword v11, v[6:7]
; GFX10-NEXT: s_add_u32 s0, s2, 8
; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: s_add_u32 s0, s6, 8
; GFX10-NEXT: s_addc_u32 s1, s7, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_add_u32 s0, s2, 16
; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: s_add_u32 s2, s2, 24
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_add_u32 s0, s6, 16
; GFX10-NEXT: s_addc_u32 s1, s7, 0
; GFX10-NEXT: s_add_u32 s2, s6, 24
; GFX10-NEXT: v_mov_b32_e32 v1, s7
; GFX10-NEXT: s_addc_u32 s3, s7, 0
; GFX10-NEXT: v_mov_b32_e32 v5, s1
; GFX10-NEXT: v_mov_b32_e32 v4, s0
; GFX10-NEXT: v_mov_b32_e32 v7, s3
@@ -175,39 +175,39 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
;
; GFX10-LABEL: cluster_load_valu_cluster_store:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s4, s0, 8
; GFX10-NEXT: s_addc_u32 s5, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: s_add_u32 s6, s0, 16
; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_addc_u32 s7, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: s_add_u32 s0, s0, 24
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v4, s6
; GFX10-NEXT: v_mov_b32_e32 v5, s7
; GFX10-NEXT: s_add_u32 s0, s4, 8
; GFX10-NEXT: s_addc_u32 s1, s5, 0
; GFX10-NEXT: s_add_u32 s2, s4, 16
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_addc_u32 s3, s5, 0
; GFX10-NEXT: s_add_u32 s0, s4, 24
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: s_addc_u32 s1, s5, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: v_mov_b32_e32 v5, s3
; GFX10-NEXT: v_mov_b32_e32 v4, s2
; GFX10-NEXT: flat_load_dword v6, v[2:3]
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: flat_load_dword v8, v[0:1]
; GFX10-NEXT: flat_load_dword v9, v[4:5]
; GFX10-NEXT: flat_load_dword v10, v[2:3]
; GFX10-NEXT: s_add_u32 s0, s2, 8
; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: s_add_u32 s4, s2, 16
; GFX10-NEXT: s_add_u32 s0, s6, 8
; GFX10-NEXT: s_addc_u32 s1, s7, 0
; GFX10-NEXT: s_add_u32 s2, s6, 16
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: s_addc_u32 s5, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: s_addc_u32 s3, s7, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s6
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_add_u32 s0, s2, 24
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v5, s5
; GFX10-NEXT: s_add_u32 s0, s6, 24
; GFX10-NEXT: v_mov_b32_e32 v1, s7
; GFX10-NEXT: v_mov_b32_e32 v5, s3
; GFX10-NEXT: s_addc_u32 s1, s7, 0
; GFX10-NEXT: v_mov_b32_e32 v4, s2
; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v6
; GFX10-NEXT: v_mov_b32_e32 v7, s1
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -665,17 +665,17 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar
;
; GFX9-LABEL: sub_zext_setcc_commute:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
; GFX9-NEXT: global_load_dword v3, v2, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3
; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: v_add_u32_e32 v0, s6, v0
; GFX9-NEXT: v_subrev_u32_e32 v0, s7, v0
; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -714,17 +714,17 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar
;
; GFX9-LABEL: sub_sext_setcc_commute:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
; GFX9-NEXT: global_load_dword v3, v2, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3
; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: v_add_u32_e32 v0, s6, v0
; GFX9-NEXT: v_subrev_u32_e32 v0, s7, v0
; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
180 changes: 90 additions & 90 deletions llvm/test/CodeGen/AMDGPU/ctlz.ll

Large diffs are not rendered by default.

158 changes: 79 additions & 79 deletions llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll

Large diffs are not rendered by default.

184 changes: 92 additions & 92 deletions llvm/test/CodeGen/AMDGPU/cttz.ll

Large diffs are not rendered by default.

126 changes: 63 additions & 63 deletions llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll

Large diffs are not rendered by default.

294 changes: 146 additions & 148 deletions llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -376,22 +376,22 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32
;
; GFX9-LABEL: uniform_vec_i16_LH:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_lh_b32_b16 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_pack_lh_b32_b16 s0, s6, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX906-LABEL: uniform_vec_i16_LH:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_pack_lh_b32_b16 s2, s2, s3
; GFX906-NEXT: v_mov_b32_e32 v1, s2
; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
; GFX906-NEXT: s_pack_lh_b32_b16 s0, s6, s7
; GFX906-NEXT: v_mov_b32_e32 v1, s0
; GFX906-NEXT: global_store_dword v0, v1, s[4:5]
; GFX906-NEXT: s_endpgm
;
; GFX11-LABEL: uniform_vec_i16_LH:
@@ -466,22 +466,22 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32
;
; GFX9-LABEL: uniform_vec_i16_HH:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_hh_b32_b16 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_pack_hh_b32_b16 s0, s6, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX906-LABEL: uniform_vec_i16_HH:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_pack_hh_b32_b16 s2, s2, s3
; GFX906-NEXT: v_mov_b32_e32 v1, s2
; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
; GFX906-NEXT: s_pack_hh_b32_b16 s0, s6, s7
; GFX906-NEXT: v_mov_b32_e32 v1, s0
; GFX906-NEXT: global_store_dword v0, v1, s[4:5]
; GFX906-NEXT: s_endpgm
;
; GFX11-LABEL: uniform_vec_i16_HH:
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1842,21 +1842,21 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half
;
; GFX9-LABEL: s_copysign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_movk_i32 s0, 0x7fff
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: s_lshr_b32 s1, s7, 16
; GFX9-NEXT: s_lshr_b32 s2, s6, 16
; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: s_copysign_v2f16:
86 changes: 43 additions & 43 deletions llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -118,10 +118,10 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
;
; GFX10-LABEL: s_fdiv_f32_ninf:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2
; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0
@@ -133,8 +133,8 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_f32_ninf:
@@ -275,21 +275,21 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa
;
; GFX10-LABEL: s_fdiv_f32_ieee:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2
; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0
; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6
; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1
; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2
; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1
; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_f32_ieee:
@@ -370,16 +370,16 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo
;
; GFX10-LABEL: s_fdiv_25ulp_f32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3|
; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s4
; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0
; GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |s7|
; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s0
; GFX10-NEXT: v_mul_f32_e32 v1, s7, v0
; GFX10-NEXT: v_rcp_f32_e32 v1, v1
; GFX10-NEXT: v_mul_f32_e32 v1, s2, v1
; GFX10-NEXT: v_mul_f32_e32 v1, s6, v1
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_25ulp_f32:
@@ -482,18 +482,18 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a
;
; GFX10-LABEL: s_fdiv_25ulp_ieee_f32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s3
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s3
; GFX10-NEXT: v_frexp_mant_f32_e32 v2, s2
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v3, s2
; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s7
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s7
; GFX10-NEXT: v_frexp_mant_f32_e32 v2, s6
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v3, s6
; GFX10-NEXT: v_rcp_f32_e32 v0, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v1, v3, v1
; GFX10-NEXT: v_mul_f32_e32 v0, v2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_25ulp_ieee_f32:
@@ -559,12 +559,12 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a,
;
; GFX10-LABEL: s_fdiv_fast_ieee_f32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_rcp_f32_e32 v0, s3
; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_rcp_f32_e32 v0, s7
; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_fast_ieee_f32:
@@ -623,12 +623,12 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a,
;
; GFX10-LABEL: s_fdiv_f32_fast_math:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_rcp_f32_e32 v0, s3
; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_rcp_f32_e32 v0, s7
; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_f32_fast_math:
@@ -687,12 +687,12 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo
;
; GFX10-LABEL: s_fdiv_ulp25_f32_fast_math:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_rcp_f32_e32 v0, s3
; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_rcp_f32_e32 v0, s7
; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math:
@@ -829,10 +829,10 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
;
; GFX10-LABEL: s_fdiv_f32_arcp_daz:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2
; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0
Expand All @@ -844,8 +844,8 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_f32_arcp_daz:
@@ -916,12 +916,12 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a,
;
; GFX10-LABEL: s_fdiv_f32_arcp_ninf:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_rcp_f32_e32 v0, s3
; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_rcp_f32_e32 v0, s7
; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_f32_arcp_ninf:
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -4315,12 +4315,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
;
; GCN3-LABEL: atomic_cmpxchg_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -4570,12 +4570,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
;
; GCN3-LABEL: atomic_cmpxchg_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
56 changes: 28 additions & 28 deletions llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -3883,21 +3883,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
;
; GCN3-LABEL: atomic_max_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_ashr_i32 s5, s3, 31
; GCN3-NEXT: s_mov_b32 s4, s3
; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: s_ashr_i32 s1, s7, 31
; GCN3-NEXT: s_mov_b32 s0, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_i32_e32 v2, s2, v3
; GCN3-NEXT: v_max_i32_e32 v2, s6, v3
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -4085,21 +4085,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
;
; GCN3-LABEL: atomic_max_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_ashr_i32 s5, s3, 31
; GCN3-NEXT: s_mov_b32 s4, s3
; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: s_ashr_i32 s1, s7, 31
; GCN3-NEXT: s_mov_b32 s0, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v3, v[0:1]
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_i32_e32 v2, s2, v3
; GCN3-NEXT: v_max_i32_e32 v2, s6, v3
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -5026,21 +5026,21 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
;
; GCN3-LABEL: atomic_umax_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_ashr_i32 s5, s3, 31
; GCN3-NEXT: s_mov_b32 s4, s3
; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: s_ashr_i32 s1, s7, 31
; GCN3-NEXT: s_mov_b32 s0, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_u32_e32 v2, s2, v3
; GCN3-NEXT: v_max_u32_e32 v2, s6, v3
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -6820,21 +6820,21 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
;
; GCN3-LABEL: atomic_min_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_ashr_i32 s5, s3, 31
; GCN3-NEXT: s_mov_b32 s4, s3
; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: s_ashr_i32 s1, s7, 31
; GCN3-NEXT: s_mov_b32 s0, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_min_i32_e32 v2, s2, v3
; GCN3-NEXT: v_min_i32_e32 v2, s6, v3
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol