diff --git a/.github/workflows/libcxx-build-containers.yml b/.github/workflows/libcxx-build-containers.yml index 530e94df976c1..eee7bb8913944 100644 --- a/.github/workflows/libcxx-build-containers.yml +++ b/.github/workflows/libcxx-build-containers.yml @@ -64,7 +64,9 @@ jobs: - name: Push the images if: github.event_name == 'push' - run: docker compose push libcxx-linux-builder-base libcxx-linux-builder libcxx-android-builder + run: docker compose --file libcxx/utils/ci/docker/docker-compose.yml push libcxx-linux-builder-base libcxx-linux-builder libcxx-android-builder + env: + TAG: ${{ github.sha }} # We create tarballs with the images and upload them as artifacts, since that's useful for testing # the images when making changes. diff --git a/libcxx/include/__memory/inout_ptr.h b/libcxx/include/__memory/inout_ptr.h index ef345fe469bca..0fa685afb2ee6 100644 --- a/libcxx/include/__memory/inout_ptr.h +++ b/libcxx/include/__memory/inout_ptr.h @@ -96,7 +96,7 @@ class inout_ptr_t { }; template -_LIBCPP_HIDE_FROM_ABI auto inout_ptr(_Smart& __s, _Args&&... __args) { +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI auto inout_ptr(_Smart& __s, _Args&&... __args) { using _Ptr = conditional_t, __pointer_of_t<_Smart>, _Pointer>; return std::inout_ptr_t<_Smart, _Ptr, _Args&&...>(__s, std::forward<_Args>(__args)...); } diff --git a/libcxx/include/__memory/out_ptr.h b/libcxx/include/__memory/out_ptr.h index e498e3307b9d6..23a77f6a0f7e9 100644 --- a/libcxx/include/__memory/out_ptr.h +++ b/libcxx/include/__memory/out_ptr.h @@ -88,7 +88,7 @@ class out_ptr_t { }; template -_LIBCPP_HIDE_FROM_ABI auto out_ptr(_Smart& __s, _Args&&... __args) { +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI auto out_ptr(_Smart& __s, _Args&&... 
__args) { using _Ptr = conditional_t, __pointer_of_t<_Smart>, _Pointer>; return std::out_ptr_t<_Smart, _Ptr, _Args&&...>(__s, std::forward<_Args>(__args)...); } diff --git a/libcxx/test/libcxx/utilities/smartptr/nodiscard.verify.cpp b/libcxx/test/libcxx/utilities/smartptr/nodiscard.verify.cpp new file mode 100644 index 0000000000000..57eb4f7cc0bf8 --- /dev/null +++ b/libcxx/test/libcxx/utilities/smartptr/nodiscard.verify.cpp @@ -0,0 +1,29 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++23 + +// + +// Check that functions are marked [[nodiscard]] + +#include + +#include "test_macros.h" + +void test() { +#if TEST_STD_VER >= 23 + { + std::unique_ptr uPtr; + // [inout.ptr] + std::inout_ptr(uPtr); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + // [out.ptr] + std::out_ptr(uPtr); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + } +#endif +} diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp index d7348254af07b..33fcd841b2ab0 100644 --- a/libunwind/src/UnwindCursor.hpp +++ b/libunwind/src/UnwindCursor.hpp @@ -2865,6 +2865,21 @@ void UnwindCursor::setInfoBasedOnIPRegister(bool isReturnAddress) { #if defined(_LIBUNWIND_CHECK_LINUX_SIGRETURN) && \ defined(_LIBUNWIND_TARGET_AARCH64) + +/* + * The linux sigreturn restorer stub will always have the form: + * + * d2801168 movz x8, #0x8b + * d4000001 svc #0x0 + */ +#if defined(__AARCH64EB__) +#define MOVZ_X8_8B 0x681180d2 +#define SVC_0 0x010000d4 +#else +#define MOVZ_X8_8B 0xd2801168 +#define SVC_0 0xd4000001 +#endif + template bool 
UnwindCursor::setInfoForSigReturn(Registers_arm64 &) { // Look for the sigreturn trampoline. The trampoline's body is two @@ -2889,7 +2904,7 @@ bool UnwindCursor::setInfoForSigReturn(Registers_arm64 &) { return false; auto *instructions = reinterpret_cast(pc); // Look for instructions: mov x8, #0x8b; svc #0x0 - if (instructions[0] != 0xd2801168 || instructions[1] != 0xd4000001) + if (instructions[0] != MOVZ_X8_8B || instructions[1] != SVC_0) return false; _info = {}; diff --git a/lldb/bindings/python/python-typemaps.swig b/lldb/bindings/python/python-typemaps.swig index 3baeaa7770e6c..072e688c4bde1 100644 --- a/lldb/bindings/python/python-typemaps.swig +++ b/lldb/bindings/python/python-typemaps.swig @@ -628,61 +628,34 @@ template <> bool SetNumberFromPyObject(double &number, PyObject *obj) { } } -// These two pybuffer macros are copied out of swig/Lib/python/pybuffer.i, -// and fixed so they will not crash if PyObject_GetBuffer fails. -// https://github.com/swig/swig/issues/1640 -// -// I've also moved the call to PyBuffer_Release to the end of the SWIG wrapper, -// doing it right away is not legal according to the python buffer protocol. 
-%inline %{ -struct Py_buffer_RAII { - Py_buffer buffer = {}; - Py_buffer_RAII(){}; - Py_buffer &operator=(const Py_buffer_RAII &) = delete; - Py_buffer_RAII(const Py_buffer_RAII &) = delete; - ~Py_buffer_RAII() { - if (buffer.obj) - PyBuffer_Release(&buffer); - } -}; -%} -%define %pybuffer_mutable_binary(TYPEMAP, SIZE) -%typemap(in) (TYPEMAP, SIZE) (Py_buffer_RAII view) { - int res; - Py_ssize_t size = 0; - void *buf = 0; - res = PyObject_GetBuffer($input, &view.buffer, PyBUF_WRITABLE); - if (res < 0) { - PyErr_Clear(); - %argument_fail(res, "(TYPEMAP, SIZE)", $symname, $argnum); - } - size = view.buffer.len; - buf = view.buffer.buf; - $1 = ($1_ltype)buf; - $2 = ($2_ltype)(size / sizeof($*1_type)); -} -%enddef - -%define %pybuffer_binary(TYPEMAP, SIZE) -%typemap(in) (TYPEMAP, SIZE) (Py_buffer_RAII view) { - int res; - Py_ssize_t size = 0; - const void *buf = 0; - res = PyObject_GetBuffer($input, &view.buffer, PyBUF_CONTIG_RO); - if (res < 0) { - PyErr_Clear(); - %argument_fail(res, "(TYPEMAP, SIZE)", $symname, $argnum); +// Typemap for SBFile::Write. +%typemap(in) (const uint8_t *buf, size_t num_bytes) { + if (PythonByteArray::Check($input)) { + PythonByteArray bytearray(PyRefType::Borrowed, $input); + $1 = (uint8_t *)bytearray.GetBytes().data(); + $2 = bytearray.GetSize(); + } else if (PythonBytes::Check($input)) { + PythonBytes bytes(PyRefType::Borrowed, $input); + $1 = (uint8_t *)bytes.GetBytes().data(); + $2 = bytes.GetSize(); + } else { + PyErr_SetString(PyExc_ValueError, "Expecting a bytes or bytearray object"); + SWIG_fail; } - size = view.buffer.len; - buf = view.buffer.buf; - $1 = ($1_ltype)buf; - $2 = ($2_ltype)(size / sizeof($*1_type)); } -%enddef -%pybuffer_binary(const uint8_t *buf, size_t num_bytes); -%pybuffer_mutable_binary(uint8_t *buf, size_t num_bytes); +// Typemap for SBFile::Read. 
+%typemap(in) (uint8_t *buf, size_t num_bytes) { + if (PythonByteArray::Check($input)) { + PythonByteArray bytearray(PyRefType::Borrowed, $input); + $1 = (uint8_t *)bytearray.GetBytes().data(); + $2 = bytearray.GetSize(); + } else { + PyErr_SetString(PyExc_ValueError, "Expecting a bytearray"); + SWIG_fail; + } +} %typemap(in) (const char **symbol_name, uint32_t num_names) { using namespace lldb_private; diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig index 3a0995e84f643..ef501fbafc947 100644 --- a/lldb/bindings/python/python-wrapper.swig +++ b/lldb/bindings/python/python-wrapper.swig @@ -128,8 +128,11 @@ bool lldb_private::python::SWIGBridge::LLDBSwigPythonCallTypeScript( PyObject *pfunc_impl = nullptr; - if (pyfunct_wrapper && *pyfunct_wrapper && - PyFunction_Check(*pyfunct_wrapper)) { + if (pyfunct_wrapper && *pyfunct_wrapper +#ifndef Py_LIMITED_API + && PyFunction_Check(*pyfunct_wrapper) +#endif + ) { pfunc_impl = (PyObject *)(*pyfunct_wrapper); if (pfunc_impl->ob_refcnt == 1) { Py_XDECREF(pfunc_impl); diff --git a/lldb/bindings/python/python.swig b/lldb/bindings/python/python.swig index 4a5a39dc4b06d..b2823f98acac8 100644 --- a/lldb/bindings/python/python.swig +++ b/lldb/bindings/python/python.swig @@ -59,6 +59,11 @@ except ImportError: // Parameter types will be used in the autodoc string. %feature("autodoc", "1"); +// Include lldb-python first as it sets Py_LIMITED_API. +%begin %{ +#include "../source/Plugins/ScriptInterpreter/Python/lldb-python.h" +%} + %pythoncode%{ import uuid import re diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h index df473dff091f8..1ef90b469c70c 100644 --- a/lldb/include/lldb/Core/ModuleList.h +++ b/lldb/include/lldb/Core/ModuleList.h @@ -511,6 +511,12 @@ class ModuleList { /// Atomically swaps the contents of this module list with \a other. void Swap(ModuleList &other); + /// For each module in this ModuleList, preload its symbols. 
+ /// + /// \param[in] parallelize + /// If true, all modules will be preloaded in parallel. + void PreloadSymbols(bool parallelize) const; + protected: // Class typedefs. typedef std::vector diff --git a/lldb/include/lldb/Target/DynamicLoader.h b/lldb/include/lldb/Target/DynamicLoader.h index 75bb6cb6bb907..4131a57df7540 100644 --- a/lldb/include/lldb/Target/DynamicLoader.h +++ b/lldb/include/lldb/Target/DynamicLoader.h @@ -352,6 +352,7 @@ class DynamicLoader : public PluginInterface { protected: // Utility methods for derived classes + /// Find a module in the target that matches the given file. lldb::ModuleSP FindModuleViaTarget(const FileSpec &file); /// Checks to see if the target module has changed, updates the target diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h index 40f9c9bea1c12..908094bfd888d 100644 --- a/lldb/include/lldb/Target/Target.h +++ b/lldb/include/lldb/Target/Target.h @@ -629,13 +629,20 @@ class Target : public std::enable_shared_from_this, /// or identify a matching Module already present in the Target, /// and return a shared pointer to it. /// + /// Note that this function previously also preloaded the module's symbols + /// depending on a setting. This function no longer does any module + /// preloading because that can potentially cause deadlocks when called in + /// parallel with this function. + /// /// \param[in] module_spec /// The criteria that must be matched for the binary being loaded. /// e.g. UUID, architecture, file path. /// /// \param[in] notify /// If notify is true, and the Module is new to this Target, - /// Target::ModulesDidLoad will be called. + /// Target::ModulesDidLoad will be called. See note in + /// Target::ModulesDidLoad about thread-safety with + /// Target::GetOrCreateModule. /// If notify is false, it is assumed that the caller is adding /// multiple Modules and will call ModulesDidLoad with the /// full list at the end. 
@@ -931,6 +938,13 @@ class Target : public std::enable_shared_from_this, // the address of its previous instruction and return that address. lldb::addr_t GetBreakableLoadAddress(lldb::addr_t addr); + /// This call may preload module symbols, and may do so in parallel depending + /// on the following target settings: + /// - TargetProperties::GetPreloadSymbols() + /// - TargetProperties::GetParallelModuleLoad() + /// + /// Warning: if preloading is active and this is called in parallel with + /// Target::GetOrCreateModule, this may result in an ABBA deadlock situation. void ModulesDidLoad(ModuleList &module_list); void ModulesDidUnload(ModuleList &module_list, bool delete_locations); diff --git a/lldb/source/Core/DynamicLoader.cpp b/lldb/source/Core/DynamicLoader.cpp index b309e0f0a72fd..31d277bc19681 100644 --- a/lldb/source/Core/DynamicLoader.cpp +++ b/lldb/source/Core/DynamicLoader.cpp @@ -165,7 +165,8 @@ ModuleSP DynamicLoader::FindModuleViaTarget(const FileSpec &file) { if (ModuleSP module_sp = target.GetImages().FindFirstModule(module_spec)) return module_sp; - if (ModuleSP module_sp = target.GetOrCreateModule(module_spec, false)) + if (ModuleSP module_sp = + target.GetOrCreateModule(module_spec, /*notify=*/false)) return module_sp; return nullptr; diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp index d9f845681e701..5444c04e74522 100644 --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "lldb/Core/ModuleList.h" +#include "lldb/Core/Debugger.h" #include "lldb/Core/Module.h" #include "lldb/Core/ModuleSpec.h" #include "lldb/Core/PluginManager.h" @@ -28,6 +29,7 @@ #include "lldb/Utility/Log.h" #include "lldb/Utility/UUID.h" #include "lldb/lldb-defines.h" +#include "llvm/Support/ThreadPool.h" #if defined(_WIN32) #include "lldb/Host/windows/PosixApi.h" @@ -1381,3 +1383,21 @@ void ModuleList::Swap(ModuleList 
&other) { m_modules_mutex, other.m_modules_mutex); m_modules.swap(other.m_modules); } + +void ModuleList::PreloadSymbols(bool parallelize) const { + std::lock_guard guard(m_modules_mutex); + + if (!parallelize) { + for (const ModuleSP &module_sp : m_modules) + module_sp->PreloadSymbols(); + return; + } + + llvm::ThreadPoolTaskGroup task_group(Debugger::GetThreadPool()); + for (const ModuleSP &module_sp : m_modules) + task_group.async([module_sp] { + if (module_sp) + module_sp->PreloadSymbols(); + }); + task_group.wait(); +} diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp index 470fc2a2fdbb9..3605e7b2c6960 100644 --- a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp @@ -469,7 +469,8 @@ void DynamicLoaderPOSIXDYLD::RefreshModules() { } ModuleSP module_sp = LoadModuleAtAddress( - so_entry.file_spec, so_entry.link_addr, so_entry.base_addr, true); + so_entry.file_spec, so_entry.link_addr, so_entry.base_addr, + /*base_addr_is_offset=*/true); if (!module_sp.get()) return; @@ -726,9 +727,8 @@ void DynamicLoaderPOSIXDYLD::LoadAllCurrentModules() { task_group.async(load_module_fn, *I); task_group.wait(); } else { - for (I = m_rendezvous.begin(), E = m_rendezvous.end(); I != E; ++I) { + for (I = m_rendezvous.begin(), E = m_rendezvous.end(); I != E; ++I) load_module_fn(*I); - } } m_process->GetTarget().ModulesDidLoad(module_list); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp index 94fc2e83e899d..b78e6ce807bca 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp @@ -348,6 +348,10 @@ void DWARFUnit::ExtractDIEsRWLocked() { void DWARFUnit::SetDwoStrOffsetsBase() { lldb::offset_t baseOffset = 0; + // Size of offset for 
.debug_str_offsets is same as DWARF offset byte size + // of the DWARFUnit as a default. We might override this below if needed. + m_str_offset_size = m_header.getDwarfOffsetByteSize(); + if (const llvm::DWARFUnitIndex::Entry *entry = m_header.getIndexEntry()) { if (const auto *contribution = entry->getContribution(llvm::DW_SECT_STR_OFFSETS)) @@ -357,14 +361,17 @@ void DWARFUnit::SetDwoStrOffsetsBase() { } if (GetVersion() >= 5) { - const DWARFDataExtractor &strOffsets = - GetSymbolFileDWARF().GetDWARFContext().getOrLoadStrOffsetsData(); - uint64_t length = strOffsets.GetU32(&baseOffset); - if (length == 0xffffffff) - length = strOffsets.GetU64(&baseOffset); - + const llvm::DWARFDataExtractor &strOffsets = GetSymbolFileDWARF() + .GetDWARFContext() + .getOrLoadStrOffsetsData() + .GetAsLLVMDWARF(); + + uint64_t length; + llvm::dwarf::DwarfFormat format; + std::tie(length, format) = strOffsets.getInitialLength(&baseOffset); + m_str_offset_size = format == llvm::dwarf::DwarfFormat::DWARF64 ? 8 : 4; // Check version. - if (strOffsets.GetU16(&baseOffset) < 5) + if (strOffsets.getU16(&baseOffset) < 5) return; // Skip padding. @@ -409,7 +416,16 @@ void DWARFUnit::AddUnitDIE(const DWARFDebugInfoEntry &cu_die) { SetRangesBase(form_value.Unsigned()); break; case DW_AT_str_offsets_base: + // When we have a DW_AT_str_offsets_base attribute, it points us to the + // first string offset for this DWARFUnit which is after the string + // offsets table header. In this case we use the DWARF32/DWARF64 of the + // DWARFUnit to determine the string offset byte size. DWO files do not + // use this attribute and they point to the start of the string offsets + // table header which can be used to determine the DWARF32/DWARF64 status + // of the string table. See SetDwoStrOffsetsBase() for how it figures out + // the m_str_offset_size value that should be used. 
SetStrOffsetsBase(form_value.Unsigned()); + m_str_offset_size = m_header.getDwarfOffsetByteSize(); break; case DW_AT_low_pc: SetBaseAddress(form_value.Address()); @@ -1079,10 +1095,9 @@ uint32_t DWARFUnit::GetHeaderByteSize() const { return m_header.getSize(); } std::optional DWARFUnit::GetStringOffsetSectionItem(uint32_t index) const { - lldb::offset_t offset = - GetStrOffsetsBase() + index * m_header.getDwarfOffsetByteSize(); + lldb::offset_t offset = GetStrOffsetsBase() + index * m_str_offset_size; return m_dwarf.GetDWARFContext().getOrLoadStrOffsetsData().GetMaxU64( - &offset, m_header.getDwarfOffsetByteSize()); + &offset, m_str_offset_size); } llvm::Expected diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h index 91a693860c55a..b5199a891e3bd 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h @@ -364,6 +364,7 @@ class DWARFUnit : public DWARFExpression::Delegate, public UserID { dw_offset_t m_line_table_offset = DW_INVALID_OFFSET; dw_offset_t m_str_offsets_base = 0; // Value of DW_AT_str_offsets_base. + dw_offset_t m_str_offset_size = 4; // Size in bytes of a string offset. 
std::optional m_rnglist_table; bool m_rnglist_table_done = false; diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 3b51e17d1c4e0..6fc8053038d10 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -1855,6 +1855,9 @@ void Target::NotifyModulesRemoved(lldb_private::ModuleList &module_list) { } void Target::ModulesDidLoad(ModuleList &module_list) { + if (GetPreloadSymbols()) + module_list.PreloadSymbols(GetParallelModuleLoad()); + const size_t num_images = module_list.GetSize(); if (m_valid && num_images) { for (size_t idx = 0; idx < num_images; ++idx) { @@ -2509,10 +2512,6 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec, if (symbol_file_spec) module_sp->SetSymbolFileFileSpec(symbol_file_spec); - // Preload symbols outside of any lock, so hopefully we can do this for - // each library in parallel. - if (GetPreloadSymbols()) - module_sp->PreloadSymbols(); llvm::SmallVector replaced_modules; for (ModuleSP &old_module_sp : old_modules) { if (m_images.GetIndexForModule(old_module_sp.get()) != diff --git a/lldb/test/Shell/SymbolFile/DWARF/Inputs/dwp-str-offsets-dwarf64-dwp.yaml b/lldb/test/Shell/SymbolFile/DWARF/Inputs/dwp-str-offsets-dwarf64-dwp.yaml new file mode 100644 index 0000000000000..d4e2ceea9c4f6 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/Inputs/dwp-str-offsets-dwarf64-dwp.yaml @@ -0,0 +1,44 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_X86_64 + SectionHeaderStringTable: .strtab +Sections: + - Name: .debug_abbrev.dwo + Type: SHT_PROGBITS + Flags: [ SHF_EXCLUDE ] + AddressAlign: 0x1 + Content: 01110125251305032576250000022E01111B1206401803253A0B3B0B49133F190000030500021803253A0B3B0B4913000004240003253E0B0B0B0000050F00491300000626004913000000 + - Name: .debug_str.dwo + Type: SHT_PROGBITS + Flags: [ SHF_EXCLUDE, SHF_MERGE, SHF_STRINGS ] + AddressAlign: 0x1 + EntSize: 0x1 + Content: 
6D61696E00696E74006172676300617267760063686172004170706C6520636C616E672076657273696F6E2031372E302E302028636C616E672D313730302E342E342E3129006D61696E2E6D696E696D616C2E637070006D61696E2E6D696E696D616C2E64776F00 + - Name: .debug_str_offsets.dwo + Type: SHT_PROGBITS + Flags: [ SHF_EXCLUDE ] + AddressAlign: 0x1 + Content: 'FFFFFFFF4400000000000000050000000000000000000000050000000000000009000000000000000E000000000000001300000000000000180000000000000046000000000000005700000000000000' + - Name: .debug_info.dwo + Type: SHT_PROGBITS + Flags: [ SHF_EXCLUDE ] + AddressAlign: 0x1 + Content: 54000000050005080000000099E97383BBC6980B0105210006070200160000000156000001400000000302917802000140000000030291700300014400000000040105040549000000054E00000006530000000404060100 + - Name: .debug_cu_index + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 05000000030000000100000002000000000000000000000099E97383BBC6980B0000000001000000010000000300000006000000000000000000000000000000580000004B00000028000000 + - Type: SectionHeaderTable + Sections: + - Name: .strtab + - Name: .debug_abbrev.dwo + - Name: .debug_str.dwo + - Name: .debug_str_offsets.dwo + - Name: .debug_info.dwo + - Name: .debug_cu_index + - Name: .symtab +Symbols: [] +... 
diff --git a/lldb/test/Shell/SymbolFile/DWARF/Inputs/dwp-str-offsets-dwarf64-exe.yaml b/lldb/test/Shell/SymbolFile/DWARF/Inputs/dwp-str-offsets-dwarf64-exe.yaml new file mode 100644 index 0000000000000..3785bbe6ecbe0 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/Inputs/dwp-str-offsets-dwarf64-exe.yaml @@ -0,0 +1,100 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_R ] + VAddr: 0x200040 + Align: 0x8 + Offset: 0x40 + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .eh_frame + LastSec: .eh_frame + VAddr: 0x200000 + Align: 0x1000 + Offset: 0x0 + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + FirstSec: .text + LastSec: .text + VAddr: 0x201160 + Align: 0x1000 + Offset: 0x160 + - Type: PT_GNU_STACK + Flags: [ PF_W, PF_R ] + Align: 0x0 + Offset: 0x0 +Sections: + - Name: .eh_frame + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x200120 + AddressAlign: 0x8 + Content: 1400000000000000017A5200017810011B0C0708900100001C0000001C000000201000001600000000410E108602430D06510C070800000000000000 + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x201160 + AddressAlign: 0x10 + Content: 554889E5C745FC00000000897DF8488975F031C05DC3 + - Name: .debug_abbrev + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 014A00101772171B25B442197625111B12067317000000 + - Name: .debug_info + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 24000000050004080000000099E97383BBC6980B0100000000080000000001001600000008000000 + - Name: .debug_str_offsets + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 0C000000050000000000000002000000 + - Name: .debug_gnu_pubnames + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 18000000020000000000280000001A000000306D61696E0000000000 + - Name: .debug_gnu_pubtypes + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: '21000000020000000000280000004000000090696E74005300000090636861720000000000' + - Name: .comment + Type: 
SHT_PROGBITS + Flags: [ SHF_MERGE, SHF_STRINGS ] + AddressAlign: 0x1 + EntSize: 0x1 + Content: 004170706C6520636C616E672076657273696F6E2031372E302E302028636C616E672D313730302E342E342E3129004C696E6B65723A204C4C442032322E302E30202868747470733A2F2F6769746875622E636F6D2F636C6179626F72672F6C6C766D2D70726F6A6563742E67697420613234333130363863303837656463303938393330303934343864343162356138336361303363392900 + - Name: .debug_line + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 5A0000000500080037000000010101FB0E0D00010101010000000100000101011F010000000003011F020F051E0102000000004E0649A10A56ED4D381C49A3DB4F6825040000090260112000000000000105030A0821060B2E0202000101 + - Name: .debug_line_str + Type: SHT_PROGBITS + Flags: [ SHF_MERGE, SHF_STRINGS ] + AddressAlign: 0x1 + EntSize: 0x1 + Content: 2E006D61696E2E6D696E696D616C2E63707000 +Symbols: + - Name: main.minimal.cpp + Type: STT_FILE + Index: SHN_ABS + - Name: main + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x201160 + Size: 0x16 +DWARF: + debug_str: + - . + - main.minimal.dwo + debug_addr: + - Length: 0xC + Version: 0x5 + AddressSize: 0x8 + Entries: + - Address: 0x201160 +... diff --git a/lldb/test/Shell/SymbolFile/DWARF/dwp-str-offsets-dwarf64.test b/lldb/test/Shell/SymbolFile/DWARF/dwp-str-offsets-dwarf64.test new file mode 100644 index 0000000000000..b010aa0995971 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/dwp-str-offsets-dwarf64.test @@ -0,0 +1,25 @@ +# This test verifies that LLDB can read a .dwp file that has a DWARF32 compile +# unit that has a DWARF64 .debug_str_offsets table. This will be needed for +# llvm-dwp changes that will start emitting 64 bit line tables for compile units +# that have strings in the .debug_str section whose string offsets exceed 4GB. +# By having a 64 bit .debug_str_offsets table, we can emit .debug_str sections +# with no size limits. 
+ +# RUN: yaml2obj %S/Inputs/dwp-str-offsets-dwarf64-exe.yaml > %t +# RUN: yaml2obj %S/Inputs/dwp-str-offsets-dwarf64-dwp.yaml > %t.dwp +# RUN: %lldb %t -b \ +# RUN: -o 'image lookup --verbose --address 0x0000000000201172' | \ +# RUN: FileCheck %s + +# CHECK: (lldb) image lookup --verbose --address 0x0000000000201172 +# CHECK-NEXT: Address: dwp-str-offsets-dwarf64.test.tmp[0x0000000000201172] (dwp-str-offsets-dwarf64.test.tmp.PT_LOAD[1]..text + 18) +# CHECK-NEXT: Summary: dwp-str-offsets-dwarf64.test.tmp`main + 18 at main.minimal.cpp:2:3 +# CHECK-NEXT: Module: file = "{{.*}}/dwp-str-offsets-dwarf64.test.tmp", arch = "x86_64" +# CHECK-NEXT: CompileUnit: id = {0x00000000}, file = "main.minimal.cpp", language = "" +# CHECK-NEXT: Function: id = {0x7fffff000000001a}, name = "main", range = [0x0000000000201160-0x0000000000201176) +# CHECK-NEXT: FuncType: id = {0x7fffff000000001a}, byte-size = 0, decl = main.minimal.cpp:1, compiler_type = "int (int, const char **)" +# CHECK-NEXT: Blocks: id = {0x7fffff000000001a}, range = [0x00201160-0x00201176) +# CHECK-NEXT: LineEntry: [0x0000000000201172-0x0000000000201174): main.minimal.cpp:2:3 +# CHECK-NEXT: Symbol: id = {0x00000002}, range = [0x0000000000201160-0x0000000000201176), name="main" +# CHECK-NEXT: Variable: id = {0x7fffff0000000029}, name = "argc", type = "int", valid ranges = , location = DW_OP_fbreg -8, decl = main.minimal.cpp:1 +# CHECK-NEXT: Variable: id = {0x7fffff0000000034}, name = "argv", type = "const char **", valid ranges = , location = DW_OP_fbreg -16, decl = main.minimal.cpp:1 diff --git a/llvm/include/llvm/CodeGen/RDFGraph.h b/llvm/include/llvm/CodeGen/RDFGraph.h index 6bb6033a8a2f2..8363d88f34c37 100644 --- a/llvm/include/llvm/CodeGen/RDFGraph.h +++ b/llvm/include/llvm/CodeGen/RDFGraph.h @@ -462,7 +462,7 @@ struct TargetOperandInfo { // Packed register reference. Only used for storage. 
struct PackedRegisterRef { - RegisterId Reg; + RegisterId Id; uint32_t MaskId; }; @@ -779,13 +779,13 @@ struct DataFlowGraph { void releaseBlock(NodeId B, DefStackMap &DefM); PackedRegisterRef pack(RegisterRef RR) { - return {RR.Reg, LMI.getIndexForLaneMask(RR.Mask)}; + return {RR.Id, LMI.getIndexForLaneMask(RR.Mask)}; } PackedRegisterRef pack(RegisterRef RR) const { - return {RR.Reg, LMI.getIndexForLaneMask(RR.Mask)}; + return {RR.Id, LMI.getIndexForLaneMask(RR.Mask)}; } RegisterRef unpack(PackedRegisterRef PR) const { - return RegisterRef(PR.Reg, LMI.getLaneMaskForIndex(PR.MaskId)); + return RegisterRef(PR.Id, LMI.getLaneMaskForIndex(PR.MaskId)); } RegisterRef makeRegRef(unsigned Reg, unsigned Sub) const; diff --git a/llvm/include/llvm/CodeGen/RDFRegisters.h b/llvm/include/llvm/CodeGen/RDFRegisters.h index bedc95a9da7f1..6583efc00cf96 100644 --- a/llvm/include/llvm/CodeGen/RDFRegisters.h +++ b/llvm/include/llvm/CodeGen/RDFRegisters.h @@ -91,45 +91,45 @@ struct RegisterRef { static constexpr RegisterId UnitFlag = 1u << 31; public: - RegisterId Reg = 0; + RegisterId Id = 0; LaneBitmask Mask = LaneBitmask::getNone(); // Only for registers. constexpr RegisterRef() = default; constexpr explicit RegisterRef(RegisterId R, LaneBitmask M = LaneBitmask::getAll()) - : Reg(R), Mask(isRegId(R) && R != 0 ? M : LaneBitmask::getNone()) {} + : Id(R), Mask(isRegId(R) && R != 0 ? M : LaneBitmask::getNone()) {} // Classify null register as a "register". 
- constexpr bool isReg() const { return Reg == 0 || isRegId(Reg); } - constexpr bool isUnit() const { return isUnitId(Reg); } - constexpr bool isMask() const { return isMaskId(Reg); } + constexpr bool isReg() const { return Id == 0 || isRegId(Id); } + constexpr bool isUnit() const { return isUnitId(Id); } + constexpr bool isMask() const { return isMaskId(Id); } constexpr MCRegister asMCReg() const { assert(isReg()); - return Reg; + return Id; } constexpr MCRegUnit asMCRegUnit() const { assert(isUnit()); - return Reg & ~UnitFlag; + return Id & ~UnitFlag; } constexpr unsigned asMaskIdx() const { assert(isMask()); - return Reg & ~MaskFlag; + return Id & ~MaskFlag; } - constexpr operator bool() const { - return !isReg() || (Reg != 0 && Mask.any()); + explicit constexpr operator bool() const { + return !isReg() || (Id != 0 && Mask.any()); } size_t hash() const { - return std::hash{}(Reg) ^ + return std::hash{}(Id) ^ std::hash{}(Mask.getAsInteger()); } static constexpr bool isRegId(RegisterId Id) { - return !(Id & UnitFlag) && !(Id & MaskFlag); + return Id != 0 && !(Id & UnitFlag) && !(Id & MaskFlag); } static constexpr bool isUnitId(RegisterId Id) { return Id & UnitFlag; } static constexpr bool isMaskId(RegisterId Id) { return Id & MaskFlag; } @@ -151,21 +151,21 @@ struct PhysicalRegisterInfo { return RegisterRef::toMaskId(RegMasks.find(RM)); } - const uint32_t *getRegMaskBits(RegisterId R) const { - return RegMasks.get(RegisterRef(R).asMaskIdx()); + const uint32_t *getRegMaskBits(RegisterRef RR) const { + return RegMasks.get(RR.asMaskIdx()); } bool alias(RegisterRef RA, RegisterRef RB) const; // Returns the set of aliased physical registers. 
- std::set getAliasSet(RegisterId Reg) const; + std::set getAliasSet(RegisterRef RR) const; RegisterRef getRefForUnit(uint32_t U) const { return RegisterRef(UnitInfos[U].Reg, UnitInfos[U].Mask); } - const BitVector &getMaskUnits(RegisterId MaskId) const { - return MaskInfos[RegisterRef(MaskId).asMaskIdx()].Units; + const BitVector &getMaskUnits(RegisterRef RR) const { + return MaskInfos[RR.asMaskIdx()].Units; } std::set getUnits(RegisterRef RR) const; diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp index 2ef96cc4400f7..f5fbfb7e5e21d 100644 --- a/llvm/lib/CodeGen/Analysis.cpp +++ b/llvm/lib/CodeGen/Analysis.cpp @@ -177,8 +177,8 @@ void llvm::computeValueLLTs(const DataLayout &DL, Type &Ty, return; // Base case: we can get an LLT for this LLVM IR type. ValueTys.push_back(getLLTForType(Ty, DL)); - if (Offsets != nullptr) - Offsets->push_back(StartingOffset * 8); + if (Offsets) + Offsets->push_back(StartingOffset); } /// ExtractTypeInfo - Returns the type info, possibly bitcast, encoded in V. diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 53c831b203cae..2ec138b6e186d 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1412,14 +1412,14 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { Regs.size() == 1 ? 
LI.getMetadata(LLVMContext::MD_range) : nullptr; for (unsigned i = 0; i < Regs.size(); ++i) { Register Addr; - MIRBuilder.materializeObjectPtrOffset(Addr, Base, OffsetTy, Offsets[i] / 8); + MIRBuilder.materializeObjectPtrOffset(Addr, Base, OffsetTy, Offsets[i]); - MachinePointerInfo Ptr(LI.getPointerOperand(), Offsets[i] / 8); + MachinePointerInfo Ptr(LI.getPointerOperand(), Offsets[i]); Align BaseAlign = getMemOpAlign(LI); - auto MMO = MF->getMachineMemOperand( - Ptr, Flags, MRI->getType(Regs[i]), - commonAlignment(BaseAlign, Offsets[i] / 8), AAInfo, Ranges, - LI.getSyncScopeID(), LI.getOrdering()); + auto MMO = + MF->getMachineMemOperand(Ptr, Flags, MRI->getType(Regs[i]), + commonAlignment(BaseAlign, Offsets[i]), AAInfo, + Ranges, LI.getSyncScopeID(), LI.getOrdering()); MIRBuilder.buildLoad(Regs[i], Addr, *MMO); } @@ -1451,14 +1451,14 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) { for (unsigned i = 0; i < Vals.size(); ++i) { Register Addr; - MIRBuilder.materializeObjectPtrOffset(Addr, Base, OffsetTy, Offsets[i] / 8); + MIRBuilder.materializeObjectPtrOffset(Addr, Base, OffsetTy, Offsets[i]); - MachinePointerInfo Ptr(SI.getPointerOperand(), Offsets[i] / 8); + MachinePointerInfo Ptr(SI.getPointerOperand(), Offsets[i]); Align BaseAlign = getMemOpAlign(SI); - auto MMO = MF->getMachineMemOperand( - Ptr, Flags, MRI->getType(Vals[i]), - commonAlignment(BaseAlign, Offsets[i] / 8), SI.getAAMetadata(), nullptr, - SI.getSyncScopeID(), SI.getOrdering()); + auto MMO = MF->getMachineMemOperand(Ptr, Flags, MRI->getType(Vals[i]), + commonAlignment(BaseAlign, Offsets[i]), + SI.getAAMetadata(), nullptr, + SI.getSyncScopeID(), SI.getOrdering()); MIRBuilder.buildStore(Vals[i], Addr, *MMO); } return true; @@ -1483,8 +1483,8 @@ static uint64_t getOffsetFromIndices(const User &U, const DataLayout &DL) { llvm::append_range(Indices, drop_begin(U.operands())); } - return 8 * static_cast( - DL.getIndexedOffsetInType(Src->getType(), Indices)); + return 
static_cast( + DL.getIndexedOffsetInType(Src->getType(), Indices)); } bool IRTranslator::translateExtractValue(const User &U, diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index d02f097fef829..cacb292acee18 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -3065,6 +3065,14 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { Observer.changedInstr(MI); return Legalized; + case TargetOpcode::G_FPEXT: + if (TypeIdx != 1) + return UnableToLegalize; + + Observer.changingInstr(MI); + widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT); + Observer.changedInstr(MI); + return Legalized; case TargetOpcode::G_FPTOSI: case TargetOpcode::G_FPTOUI: case TargetOpcode::G_INTRINSIC_LRINT: diff --git a/llvm/lib/CodeGen/RDFGraph.cpp b/llvm/lib/CodeGen/RDFGraph.cpp index 2fb3d4ed30f24..7e7407e117dee 100644 --- a/llvm/lib/CodeGen/RDFGraph.cpp +++ b/llvm/lib/CodeGen/RDFGraph.cpp @@ -1061,13 +1061,13 @@ void DataFlowGraph::pushClobbers(Instr IA, DefStackMap &DefM) { // Push the definition on the stack for the register and all aliases. // The def stack traversal in linkNodeUp will check the exact aliasing. - DefM[RR.Reg].push(DA); - Defined.insert(RR.Reg); - for (RegisterId A : getPRI().getAliasSet(RR.Reg)) { + DefM[RR.Id].push(DA); + Defined.insert(RR.Id); + for (RegisterId A : getPRI().getAliasSet(RR)) { if (RegisterRef::isRegId(A) && !isTracked(RegisterRef(A))) continue; // Check that we don't push the same def twice. - assert(A != RR.Reg); + assert(A != RR.Id); if (!Defined.count(A)) DefM[A].push(DA); } @@ -1109,7 +1109,7 @@ void DataFlowGraph::pushDefs(Instr IA, DefStackMap &DefM) { #ifndef NDEBUG // Assert if the register is defined in two or more unrelated defs. // This could happen if there are two or more def operands defining it. 
- if (!Defined.insert(RR.Reg).second) { + if (!Defined.insert(RR.Id).second) { MachineInstr *MI = Stmt(IA).Addr->getCode(); dbgs() << "Multiple definitions of register: " << Print(RR, *this) << " in\n " << *MI << "in " << printMBBReference(*MI->getParent()) @@ -1119,12 +1119,12 @@ void DataFlowGraph::pushDefs(Instr IA, DefStackMap &DefM) { #endif // Push the definition on the stack for the register and all aliases. // The def stack traversal in linkNodeUp will check the exact aliasing. - DefM[RR.Reg].push(DA); - for (RegisterId A : getPRI().getAliasSet(RR.Reg)) { + DefM[RR.Id].push(DA); + for (RegisterId A : getPRI().getAliasSet(RR)) { if (RegisterRef::isRegId(A) && !isTracked(RegisterRef(A))) continue; // Check that we don't push the same def twice. - assert(A != RR.Reg); + assert(A != RR.Id); DefM[A].push(DA); } // Mark all the related defs as visited. @@ -1465,11 +1465,11 @@ void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, Block BA, for (RegisterRef RR : Defs.refs()) { if (!DefM.empty()) { - auto F = DefM.find(RR.Reg); + auto F = DefM.find(RR.Id); // Do not create a phi for unallocatable registers, or for registers // that are never livein to BA. // If a phi exists for RR, do not create another. 
- if (!MRI.isAllocatable(RR.Reg) || PhiDefs.hasCoverOf(RR) || + if (!MRI.isAllocatable(RR.asMCReg()) || PhiDefs.hasCoverOf(RR) || F == DefM.end() || F->second.empty()) continue; // Do not create a phi, if all reaching defs are clobbering @@ -1601,7 +1601,7 @@ void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, Stmt SA, Predicate P) { Defs.insert(RR); #endif - auto F = DefM.find(RR.Reg); + auto F = DefM.find(RR.Id); if (F == DefM.end()) continue; DefStack &DS = F->second; @@ -1691,7 +1691,7 @@ void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, BlockRefsMap &PhiClobberM, for (auto U : IA.Addr->members_if(IsUseForBA, *this)) { PhiUse PUA = U; RegisterRef RR = PUA.Addr->getRegRef(*this); - linkRefUp(IA, PUA, DefM[RR.Reg]); + linkRefUp(IA, PUA, DefM[RR.Id]); } } } diff --git a/llvm/lib/CodeGen/RDFLiveness.cpp b/llvm/lib/CodeGen/RDFLiveness.cpp index 2e1cf499eab41..8dc227566fcc2 100644 --- a/llvm/lib/CodeGen/RDFLiveness.cpp +++ b/llvm/lib/CodeGen/RDFLiveness.cpp @@ -511,7 +511,7 @@ void Liveness::computePhiInfo() { uint16_t F = A.Addr->getFlags(); if ((F & (NodeAttrs::Undef | NodeAttrs::PhiRef)) == 0) { RegisterRef R = A.Addr->getRegRef(DFG); - RealUses[R.Reg].insert({A.Id, R.Mask}); + RealUses[R.Id].insert({A.Id, R.Mask}); } UN = A.Addr->getSibling(); } @@ -706,8 +706,8 @@ void Liveness::computePhiInfo() { LaneBitmask M = R.Mask & V.second; if (M.none()) continue; - if (RegisterRef SS = ClearIn(RegisterRef(R.Reg, M), MidDefs, SM)) { - NodeRefSet &RS = RealUseMap[P.first][SS.Reg]; + if (RegisterRef SS = ClearIn(RegisterRef(R.Id, M), MidDefs, SM)) { + NodeRefSet &RS = RealUseMap[P.first][SS.Id]; Changed |= RS.insert({V.first, SS.Mask}).second; } } @@ -839,7 +839,7 @@ void Liveness::computeLiveIns() { RegisterAggr TA(PRI); TA.insert(D.Addr->getRegRef(DFG)).intersect(S); LaneBitmask TM = TA.makeRegRef().Mask; - LOX[S.Reg].insert({D.Id, TM}); + LOX[S.Id].insert({D.Id, TM}); } } } @@ -899,7 +899,7 @@ void Liveness::resetLiveIns() { // Add the newly computed live-ins. 
const RegisterAggr &LiveIns = LiveMap[&B]; for (RegisterRef R : LiveIns.refs()) - B.addLiveIn({MCPhysReg(R.Reg), R.Mask}); + B.addLiveIn({R.asMCReg(), R.Mask}); } } @@ -1046,7 +1046,7 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) { for (const std::pair &LE : LiveInCopy) { RegisterRef LRef(LE.first); - NodeRefSet &NewDefs = LiveIn[LRef.Reg]; // To be filled. + NodeRefSet &NewDefs = LiveIn[LRef.Id]; // To be filled. const NodeRefSet &OldDefs = LE.second; for (NodeRef OR : OldDefs) { // R is a def node that was live-on-exit @@ -1129,7 +1129,7 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) { RegisterRef RR = UA.Addr->getRegRef(DFG); for (NodeAddr D : getAllReachingDefs(UA)) if (getBlockWithRef(D.Id) != B) - LiveIn[RR.Reg].insert({D.Id, RR.Mask}); + LiveIn[RR.Id].insert({D.Id, RR.Mask}); } } diff --git a/llvm/lib/CodeGen/RDFRegisters.cpp b/llvm/lib/CodeGen/RDFRegisters.cpp index 3821f3b791bbd..07729ebec6e51 100644 --- a/llvm/lib/CodeGen/RDFRegisters.cpp +++ b/llvm/lib/CodeGen/RDFRegisters.cpp @@ -101,13 +101,13 @@ bool PhysicalRegisterInfo::alias(RegisterRef RA, RegisterRef RB) const { return !disjoint(getUnits(RA), getUnits(RB)); } -std::set PhysicalRegisterInfo::getAliasSet(RegisterId Reg) const { +std::set PhysicalRegisterInfo::getAliasSet(RegisterRef RR) const { // Do not include Reg in the alias set. 
std::set AS; - assert(!RegisterRef::isUnitId(Reg) && "No units allowed"); - if (RegisterRef::isMaskId(Reg)) { + assert(!RR.isUnit() && "No units allowed"); + if (RR.isMask()) { // XXX SLOW - const uint32_t *MB = getRegMaskBits(Reg); + const uint32_t *MB = getRegMaskBits(RR); for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i) { if (MB[i / 32] & (1u << (i % 32))) continue; @@ -116,8 +116,8 @@ std::set PhysicalRegisterInfo::getAliasSet(RegisterId Reg) const { return AS; } - assert(RegisterRef::isRegId(Reg)); - for (MCRegAliasIterator AI(Reg, &TRI, false); AI.isValid(); ++AI) + assert(RR.isReg()); + for (MCRegAliasIterator AI(RR.asMCReg(), &TRI, false); AI.isValid(); ++AI) AS.insert(*AI); return AS; @@ -139,7 +139,7 @@ std::set PhysicalRegisterInfo::getUnits(RegisterRef RR) const { assert(RR.isMask()); unsigned NumRegs = TRI.getNumRegs(); - const uint32_t *MB = getRegMaskBits(RR.Reg); + const uint32_t *MB = getRegMaskBits(RR); for (unsigned I = 0, E = (NumRegs + 31) / 32; I != E; ++I) { uint32_t C = ~MB[I]; // Clobbered regs if (I == 0) // Reg 0 should be ignored @@ -160,7 +160,7 @@ std::set PhysicalRegisterInfo::getUnits(RegisterRef RR) const { } RegisterRef PhysicalRegisterInfo::mapTo(RegisterRef RR, RegisterId R) const { - if (RR.Reg == R) + if (RR.Id == R) return RR; if (unsigned Idx = TRI.getSubRegIndex(RegisterRef(R).asMCReg(), RR.asMCReg())) return RegisterRef(R, TRI.composeSubRegIndexLaneMask(Idx, RR.Mask)); @@ -177,11 +177,11 @@ RegisterRef PhysicalRegisterInfo::mapTo(RegisterRef RR, RegisterId R) const { bool PhysicalRegisterInfo::equal_to(RegisterRef A, RegisterRef B) const { if (!A.isReg() || !B.isReg()) { - // For non-regs, or comparing reg and non-reg, use only the Reg member. - return A.Reg == B.Reg; + // For non-regs, or comparing reg and non-reg, use only the Id member. + return A.Id == B.Id; } - if (A.Reg == B.Reg) + if (A.Id == B.Id) return A.Mask == B.Mask; // Compare reg units lexicographically. 
@@ -213,14 +213,14 @@ bool PhysicalRegisterInfo::equal_to(RegisterRef A, RegisterRef B) const { bool PhysicalRegisterInfo::less(RegisterRef A, RegisterRef B) const { if (!A.isReg() || !B.isReg()) { - // For non-regs, or comparing reg and non-reg, use only the Reg member. - return A.Reg < B.Reg; + // For non-regs, or comparing reg and non-reg, use only the Id member. + return A.Id < B.Id; } - if (A.Reg == B.Reg) + if (A.Id == B.Id) return A.Mask < B.Mask; if (A.Mask == B.Mask) - return A.Reg < B.Reg; + return A.Id < B.Id; // Compare reg units lexicographically. llvm::MCRegUnitMaskIterator AI(A.asMCReg(), &getTRI()); @@ -275,7 +275,7 @@ void PhysicalRegisterInfo::print(raw_ostream &OS, const RegisterAggr &A) const { bool RegisterAggr::hasAliasOf(RegisterRef RR) const { if (RR.isMask()) - return Units.anyCommon(PRI.getMaskUnits(RR.Reg)); + return Units.anyCommon(PRI.getMaskUnits(RR)); for (MCRegUnitMaskIterator U(RR.asMCReg(), &PRI.getTRI()); U.isValid(); ++U) { auto [Unit, LaneMask] = *U; @@ -288,7 +288,7 @@ bool RegisterAggr::hasAliasOf(RegisterRef RR) const { bool RegisterAggr::hasCoverOf(RegisterRef RR) const { if (RR.isMask()) { - BitVector T(PRI.getMaskUnits(RR.Reg)); + BitVector T(PRI.getMaskUnits(RR)); return T.reset(Units).none(); } @@ -303,7 +303,7 @@ bool RegisterAggr::hasCoverOf(RegisterRef RR) const { RegisterAggr &RegisterAggr::insert(RegisterRef RR) { if (RR.isMask()) { - Units |= PRI.getMaskUnits(RR.Reg); + Units |= PRI.getMaskUnits(RR); return *this; } @@ -392,7 +392,7 @@ RegisterAggr::ref_iterator::ref_iterator(const RegisterAggr &RG, bool End) : Owner(&RG) { for (int U = RG.Units.find_first(); U >= 0; U = RG.Units.find_next(U)) { RegisterRef R = RG.PRI.getRefForUnit(U); - Masks[R.Reg] |= R.Mask; + Masks[R.Id] |= R.Mask; } Pos = End ? Masks.end() : Masks.begin(); Index = End ? 
Masks.size() : 0; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c5f3cd29f684e..70005ba7051cc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18673,11 +18673,13 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (Flags.hasAllowReciprocal()) { // If this FDIV is part of a reciprocal square root, it may be folded // into a target-specific square root estimate instruction. + bool N1AllowReciprocal = N1->getFlags().hasAllowReciprocal(); if (N1.getOpcode() == ISD::FSQRT) { if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0))) return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); } else if (N1.getOpcode() == ISD::FP_EXTEND && - N1.getOperand(0).getOpcode() == ISD::FSQRT) { + N1.getOperand(0).getOpcode() == ISD::FSQRT && + N1AllowReciprocal) { if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0))) { RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV); AddToWorklist(RV.getNode()); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 7b3a0881feb10..4f13f3b128ea4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3976,7 +3976,10 @@ void SelectionDAGBuilder::visitFPExt(const User &I) { SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType()); - setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurSDLoc(), DestVT, N)); + SDNodeFlags Flags; + if (auto *TruncInst = dyn_cast(&I)) + Flags.copyFMF(*TruncInst); + setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurSDLoc(), DestVT, N, Flags)); } void SelectionDAGBuilder::visitFPToUI(const User &I) { diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index b3bc6ea3bcb30..9f069bebcec73 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ 
b/llvm/lib/Support/SpecialCaseList.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/GlobPattern.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" @@ -42,10 +43,9 @@ namespace { class RegexMatcher { public: Error insert(StringRef Pattern, unsigned LineNumber); - void preprocess(); - unsigned match(StringRef Query) const; +private: struct Reg { Reg(StringRef Name, unsigned LineNo, Regex &&Rg) : Name(Name), LineNo(LineNo), Rg(std::move(Rg)) {} @@ -60,10 +60,9 @@ class RegexMatcher { class GlobMatcher { public: Error insert(StringRef Pattern, unsigned LineNumber); - void preprocess(); - unsigned match(StringRef Query) const; +private: struct Glob { Glob(StringRef Name, unsigned LineNo, GlobPattern &&Pattern) : Name(Name), LineNo(LineNo), Pattern(std::move(Pattern)) {} @@ -72,15 +71,20 @@ class GlobMatcher { GlobPattern Pattern; }; + void LazyInit() const; + std::vector Globs; - RadixTree, - RadixTree, - SmallVector>> + mutable RadixTree, + RadixTree, + SmallVector>> PrefixSuffixToGlob; - RadixTree, SmallVector> + mutable RadixTree, + SmallVector> SubstrToGlob; + + mutable bool Initialized = false; }; /// Represents a set of patterns and their line numbers @@ -89,7 +93,6 @@ class Matcher { Matcher(bool UseGlobs, bool RemoveDotSlash); Error insert(StringRef Pattern, unsigned LineNumber); - void preprocess(); unsigned match(StringRef Query) const; bool matchAny(StringRef Query) const { return match(Query); } @@ -122,8 +125,6 @@ Error RegexMatcher::insert(StringRef Pattern, unsigned LineNumber) { return Error::success(); } -void RegexMatcher::preprocess() {} - unsigned RegexMatcher::match(StringRef Query) const { for (const auto &R : reverse(RegExes)) if (R.Rg.match(Query)) @@ -142,7 +143,10 @@ Error GlobMatcher::insert(StringRef Pattern, unsigned LineNumber) { return Error::success(); } -void 
GlobMatcher::preprocess() { +void GlobMatcher::LazyInit() const { + if (LLVM_LIKELY(Initialized)) + return; + Initialized = true; for (const auto &[Idx, G] : enumerate(Globs)) { StringRef Prefix = G.Pattern.prefix(); StringRef Suffix = G.Pattern.suffix(); @@ -167,6 +171,8 @@ void GlobMatcher::preprocess() { } unsigned GlobMatcher::match(StringRef Query) const { + LazyInit(); + int Best = -1; if (!PrefixSuffixToGlob.empty()) { for (const auto &[_, SToGlob] : PrefixSuffixToGlob.find_prefixes(Query)) { @@ -224,10 +230,6 @@ Error Matcher::insert(StringRef Pattern, unsigned LineNumber) { return std::visit([&](auto &V) { return V.insert(Pattern, LineNumber); }, M); } -void Matcher::preprocess() { - return std::visit([&](auto &V) { return V.preprocess(); }, M); -} - unsigned Matcher::match(StringRef Query) const { if (RemoveDotSlash) Query = llvm::sys::path::remove_leading_dotslash(Query); @@ -237,7 +239,6 @@ unsigned Matcher::match(StringRef Query) const { class SpecialCaseList::Section::SectionImpl { public: - void preprocess(); const Matcher *findMatcher(StringRef Prefix, StringRef Category) const; using SectionEntries = StringMap>; @@ -395,9 +396,6 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, } } - for (Section &S : Sections) - S.Impl->preprocess(); - return true; } @@ -448,13 +446,6 @@ SpecialCaseList::Section::SectionImpl::findMatcher(StringRef Prefix, return &II->second; } -void SpecialCaseList::Section::SectionImpl::preprocess() { - SectionMatcher.preprocess(); - for (auto &[K1, E] : Entries) - for (auto &[K2, M] : E) - M.preprocess(); -} - unsigned SpecialCaseList::Section::getLastMatch(StringRef Prefix, StringRef Query, StringRef Category) const { diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 038ad77ae69b2..a88817c9d2d19 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ 
-825,6 +825,16 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .legalFor( {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}}) .libcallFor({{s128, s64}, {s128, s32}, {s128, s16}}) + .moreElementsToNextPow2(0) + .widenScalarIf( + [](const LegalityQuery &Q) { + LLT DstTy = Q.Types[0]; + LLT SrcTy = Q.Types[1]; + return SrcTy.isVector() && DstTy.isVector() && + SrcTy.getScalarSizeInBits() == 16 && + DstTy.getScalarSizeInBits() == 64; + }, + changeElementTo(1, s32)) .clampNumElements(0, v4s32, v4s32) .clampNumElements(0, v2s64, v2s64) .scalarize(0); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b8b419d93021a..9308934c8baf8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -470,6 +470,24 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops); } +SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N, + SelectionDAG &DAG) const { + // TODO: Handle undef as zero + + assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); + uint32_t LHSVal, RHSVal; + if (getConstantValue(N->getOperand(0), LHSVal) && + getConstantValue(N->getOperand(1), RHSVal)) { + SDLoc SL(N); + uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); + return DAG.getMachineNode( + isVGPRImm(N) ? 
AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL, + N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32)); + } + + return nullptr; +} + void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 4fa0d3f72e1c7..c902b7e7f1d87 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -45,21 +45,6 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) { return false; } -// TODO: Handle undef as zero -static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); - uint32_t LHSVal, RHSVal; - if (getConstantValue(N->getOperand(0), LHSVal) && - getConstantValue(N->getOperand(1), RHSVal)) { - SDLoc SL(N); - uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); - return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0), - DAG.getTargetConstant(K, SL, MVT::i32)); - } - - return nullptr; -} - /// AMDGPU specific code to select AMDGPU machine instructions for /// SelectionDAG operations. 
class AMDGPUDAGToDAGISel : public SelectionDAGISel { @@ -115,6 +100,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const; + SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) const; + SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const; SDNode *glueCopyToM0(SDNode *N, SDValue Val) const; SDNode *glueCopyToM0LDSInit(SDNode *N) const; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index ee5f9b0019db6..2cb76a582a9b1 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -313,29 +313,22 @@ collectVirtualRegUses(SmallVectorImpl &VRegMaskOrUnits, /// Mostly copy/paste from CodeGen/RegisterPressure.cpp static LaneBitmask getLanesWithProperty( const LiveIntervals &LIS, const MachineRegisterInfo &MRI, - bool TrackLaneMasks, Register RegUnit, SlotIndex Pos, - LaneBitmask SafeDefault, + bool TrackLaneMasks, Register Reg, SlotIndex Pos, function_ref Property) { - if (RegUnit.isVirtual()) { - const LiveInterval &LI = LIS.getInterval(RegUnit); - LaneBitmask Result; - if (TrackLaneMasks && LI.hasSubRanges()) { - for (const LiveInterval::SubRange &SR : LI.subranges()) { - if (Property(SR, Pos)) - Result |= SR.LaneMask; - } - } else if (Property(LI, Pos)) { - Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit) - : LaneBitmask::getAll(); + assert(Reg.isVirtual()); + const LiveInterval &LI = LIS.getInterval(Reg); + LaneBitmask Result; + if (TrackLaneMasks && LI.hasSubRanges()) { + for (const LiveInterval::SubRange &SR : LI.subranges()) { + if (Property(SR, Pos)) + Result |= SR.LaneMask; } - - return Result; + } else if (Property(LI, Pos)) { + Result = + TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(Reg) : LaneBitmask::getAll(); } - const LiveRange *LR = LIS.getCachedRegUnit(RegUnit); - if (LR == nullptr) - return SafeDefault; - return Property(*LR, Pos) ? 
LaneBitmask::getAll() : LaneBitmask::getNone(); + return Result; } /// Mostly copy/paste from CodeGen/RegisterPressure.cpp @@ -503,10 +496,9 @@ void GCNRPTracker::reset(const MachineRegisterInfo &MRI_, } /// Mostly copy/paste from CodeGen/RegisterPressure.cpp -LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit, - SlotIndex Pos) const { +LaneBitmask GCNRPTracker::getLastUsedLanes(Register Reg, SlotIndex Pos) const { return getLanesWithProperty( - LIS, *MRI, true, RegUnit, Pos.getBaseIndex(), LaneBitmask::getNone(), + LIS, *MRI, true, Reg, Pos.getBaseIndex(), [](const LiveRange &LR, SlotIndex Pos) { const LiveRange::Segment *S = LR.getSegmentContaining(Pos); return S != nullptr && S->end == Pos.getRegSlot(); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index f54874d2a5b40..ab310bba142a3 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -292,7 +292,7 @@ class GCNRPTracker { /// Mostly copy/paste from CodeGen/RegisterPressure.cpp void bumpDeadDefs(ArrayRef DeadDefs); - LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const; + LaneBitmask getLastUsedLanes(Register Reg, SlotIndex Pos) const; public: // reset tracker and set live register set to the specified value. 
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 6dd4b1d7bd000..6cc9b3cc67530 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2941,15 +2941,25 @@ def : GCNPat < >; def : GCNPat < - (i64 (zext i32:$src)), + (i64 (UniformUnaryFrag i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) >; def : GCNPat < - (i64 (anyext i32:$src)), + (i64 (zext i32:$src)), + (REG_SEQUENCE VReg_64, $src, sub0, (V_MOV_B32_e32 (i32 0)), sub1) +>; + +def : GCNPat < + (i64 (UniformUnaryFrag i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) >; +def : GCNPat < + (i64 (anyext i32:$src)), + (REG_SEQUENCE VReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) +>; + class ZExt_i64_i1_Pat : GCNPat < (i64 (ext i1:$src)), (REG_SEQUENCE VReg_64, diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp index 2ee3b9d3b1e27..53afbc433c933 100644 --- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -107,7 +107,7 @@ class HexagonOptAddrMode : public MachineFunctionPass { bool canRemoveAddasl(NodeAddr AddAslSN, MachineInstr &MI, const NodeList &UNodeList); bool isSafeToExtLR(NodeAddr SN, MachineInstr *MI, - unsigned LRExtReg, const NodeList &UNodeList); + Register LRExtReg, const NodeList &UNodeList); void getAllRealUses(NodeAddr SN, NodeList &UNodeList); bool allValidCandidates(NodeAddr SA, NodeList &UNodeList); short getBaseWithLongOffset(const MachineInstr &MI) const; @@ -177,7 +177,7 @@ bool HexagonOptAddrMode::canRemoveAddasl(NodeAddr AddAslSN, NodeId OffsetRegRD = 0; for (NodeAddr UA : AddAslSN.Addr->members_if(DFG->IsUse, *DFG)) { RegisterRef RR = UA.Addr->getRegRef(*DFG); - if (OffsetReg == RR.Reg) { + if (OffsetReg == RR.asMCReg()) { OffsetRR = RR; OffsetRegRD = UA.Addr->getReachingDef(); } @@ -300,7 +300,7 @@ void 
HexagonOptAddrMode::getAllRealUses(NodeAddr SA, } bool HexagonOptAddrMode::isSafeToExtLR(NodeAddr SN, - MachineInstr *MI, unsigned LRExtReg, + MachineInstr *MI, Register LRExtReg, const NodeList &UNodeList) { RegisterRef LRExtRR; NodeId LRExtRegRD = 0; @@ -308,7 +308,7 @@ bool HexagonOptAddrMode::isSafeToExtLR(NodeAddr SN, // for the LRExtReg. for (NodeAddr UA : SN.Addr->members_if(DFG->IsUse, *DFG)) { RegisterRef RR = UA.Addr->getRegRef(*DFG); - if (LRExtReg == RR.Reg) { + if (LRExtReg == RR.asMCReg()) { LRExtRR = RR; LRExtRegRD = UA.Addr->getReachingDef(); } @@ -552,7 +552,7 @@ bool HexagonOptAddrMode::processAddBases(NodeAddr AddSN, // Find the UseNode that contains the base register and it's reachingDef for (NodeAddr UA : AddSN.Addr->members_if(DFG->IsUse, *DFG)) { RegisterRef URR = UA.Addr->getRegRef(*DFG); - if (BaseReg != URR.Reg) + if (BaseReg != URR.asMCReg()) continue; UAReachingDefID = UA.Addr->getReachingDef(); @@ -740,7 +740,7 @@ bool HexagonOptAddrMode::processAddUses(NodeAddr AddSN, // for the LRExtReg. 
for (NodeAddr UA : AddSN.Addr->members_if(DFG->IsUse, *DFG)) { RegisterRef RR = UA.Addr->getRegRef(*DFG); - if (BaseReg == RR.Reg) + if (BaseReg == RR.asMCReg()) LRExtRegRD = UA.Addr->getReachingDef(); } diff --git a/llvm/lib/Target/Hexagon/RDFCopy.cpp b/llvm/lib/Target/Hexagon/RDFCopy.cpp index 4cab5da7b1caf..4d0df66a7c6d7 100644 --- a/llvm/lib/Target/Hexagon/RDFCopy.cpp +++ b/llvm/lib/Target/Hexagon/RDFCopy.cpp @@ -43,11 +43,11 @@ bool CopyPropagation::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) { const MachineOperand &Src = MI->getOperand(1); RegisterRef DstR = DFG.makeRegRef(Dst.getReg(), Dst.getSubReg()); RegisterRef SrcR = DFG.makeRegRef(Src.getReg(), Src.getSubReg()); - assert(Register::isPhysicalRegister(DstR.Reg)); - assert(Register::isPhysicalRegister(SrcR.Reg)); + assert(DstR.asMCReg().isPhysical()); + assert(SrcR.asMCReg().isPhysical()); const TargetRegisterInfo &TRI = DFG.getTRI(); - if (TRI.getMinimalPhysRegClass(DstR.Reg) != - TRI.getMinimalPhysRegClass(SrcR.Reg)) + if (TRI.getMinimalPhysRegClass(DstR.asMCReg()) != + TRI.getMinimalPhysRegClass(SrcR.asMCReg())) return false; if (!DFG.isTracked(SrcR) || !DFG.isTracked(DstR)) return false; @@ -65,7 +65,7 @@ void CopyPropagation::recordCopy(NodeAddr SA, EqualityMap &EM) { Copies.push_back(SA.Id); for (auto I : EM) { - auto FS = DefM.find(I.second.Reg); + auto FS = DefM.find(I.second.Id); if (FS == DefM.end() || FS->second.empty()) continue; // Undefined source RDefMap[I.second][SA.Id] = FS->second.top()->Id; @@ -92,7 +92,7 @@ void CopyPropagation::updateMap(NodeAddr IA) { for (auto &R : RDefMap) { if (!RRs.count(R.first)) continue; - auto F = DefM.find(R.first.Reg); + auto F = DefM.find(R.first.Id); if (F == DefM.end() || F->second.empty()) continue; R.second[IA.Id] = F->second.top()->Id; @@ -154,16 +154,16 @@ bool CopyPropagation::run() { bool HasLimit = CpLimit.getNumOccurrences() > 0; #endif - auto MinPhysReg = [this] (RegisterRef RR) -> unsigned { + auto MinPhysReg = [this](RegisterRef 
RR) -> MCRegister { const TargetRegisterInfo &TRI = DFG.getTRI(); - const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.Reg); + const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.asMCReg()); if ((RC.LaneMask & RR.Mask) == RC.LaneMask) - return RR.Reg; - for (MCSubRegIndexIterator S(RR.Reg, &TRI); S.isValid(); ++S) + return RR.asMCReg(); + for (MCSubRegIndexIterator S(RR.asMCReg(), &TRI); S.isValid(); ++S) if (RR.Mask == TRI.getSubRegIndexLaneMask(S.getSubRegIndex())) return S.getSubReg(); llvm_unreachable("Should have found a register"); - return 0; + return MCRegister(); }; const PhysicalRegisterInfo &PRI = DFG.getPRI(); @@ -214,7 +214,7 @@ bool CopyPropagation::run() { << *NodeAddr(IA).Addr->getCode(); } - unsigned NewReg = MinPhysReg(SR); + MCRegister NewReg = MinPhysReg(SR); Op.setReg(NewReg); Op.setSubReg(0); DFG.unlinkUse(UA, false); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e61eb0fcfe492..938eacde7548d 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -537,7 +537,8 @@ static bool isSplat(ArrayRef VL) { /// \param I The instruction to check for commutativity /// \param ValWithUses The value whose uses are analyzed for special /// patterns -static bool isCommutative(Instruction *I, Value *ValWithUses) { +static bool isCommutative(Instruction *I, Value *ValWithUses, + bool IsCopyable = false) { if (auto *Cmp = dyn_cast(I)) return Cmp->isCommutative(); if (auto *BO = dyn_cast(I)) @@ -546,7 +547,7 @@ static bool isCommutative(Instruction *I, Value *ValWithUses) { !ValWithUses->hasNUsesOrMore(UsesLimit) && all_of( ValWithUses->uses(), - [](const Use &U) { + [&](const Use &U) { // Commutative, if icmp eq/ne sub, 0 CmpPredicate Pred; if (match(U.getUser(), @@ -555,10 +556,11 @@ static bool isCommutative(Instruction *I, Value *ValWithUses) { return true; // Commutative, if abs(sub nsw, true) or abs(sub, 
false). ConstantInt *Flag; + auto *I = dyn_cast(U.get()); return match(U.getUser(), m_Intrinsic( m_Specific(U.get()), m_ConstantInt(Flag))) && - (!cast(U.get())->hasNoSignedWrap() || + ((!IsCopyable && I && !I->hasNoSignedWrap()) || Flag->isOne()); })) || (BO->getOpcode() == Instruction::FSub && @@ -3164,7 +3166,8 @@ class BoUpSLP { bool IsInverseOperation = false; if (S.isCopyableElement(VL[Lane])) { // The value is a copyable element. - IsInverseOperation = !isCommutative(MainOp, VL[Lane]); + IsInverseOperation = + !isCommutative(MainOp, VL[Lane], /*IsCopyable=*/true); } else { assert(I && "Expected instruction"); auto [SelectedOp, Ops] = convertTo(I, S); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpext.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpext.mir index 1c10e08d54c61..50394b6bbbf99 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpext.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fpext.mir @@ -32,3 +32,133 @@ body: | RET_ReallyLR ... + +--- +name: fpext_f16_f64 +body: | + bb.0: + liveins: $h0 + ; CHECK-LABEL: name: fpext_f16_f64 + ; CHECK: liveins: $h0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 + ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s64) = G_FPEXT [[COPY]](s16) + ; CHECK-NEXT: $d0 = COPY [[FPEXT]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $d0 + %0:_(s16) = COPY $h0 + %1:_(s64) = G_FPEXT %0(s16) + $d0 = COPY %1(s64) + RET_ReallyLR implicit $d0 +... 
+ +--- +name: fpext_v2f16_v2f64 +body: | + bb.0: + liveins: $d0 + + ; CHECK-LABEL: name: fpext_v2f16_v2f64 + ; CHECK: liveins: $d0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0 + ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(<4 x s32>) = G_FPEXT [[COPY]](<4 x s16>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[FPEXT]](<4 x s32>) + ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(<2 x s64>) = G_FPEXT [[UV]](<2 x s32>) + ; CHECK-NEXT: $q0 = COPY [[FPEXT1]](<2 x s64>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %1:_(<4 x s16>) = COPY $d0 + %0:_(<2 x s16>), %2:_(<2 x s16>) = G_UNMERGE_VALUES %1(<4 x s16>) + %3:_(<2 x s64>) = G_FPEXT %0(<2 x s16>) + $q0 = COPY %3(<2 x s64>) + RET_ReallyLR implicit $q0 +... + +--- +name: fpext_v3f16_v3f64 +body: | + bb.0: + liveins: $d0 + + ; CHECK-LABEL: name: fpext_v3f16_v3f64 + ; CHECK: liveins: $d0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0 + ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(<4 x s32>) = G_FPEXT [[COPY]](<4 x s16>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[FPEXT]](<4 x s32>) + ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(<2 x s64>) = G_FPEXT [[UV]](<2 x s32>) + ; CHECK-NEXT: [[FPEXT2:%[0-9]+]]:_(<2 x s64>) = G_FPEXT [[UV1]](<2 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[FPEXT1]](<2 x s64>) + ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[FPEXT2]](<2 x s64>) + ; CHECK-NEXT: $d0 = COPY [[UV2]](s64) + ; CHECK-NEXT: $d1 = COPY [[UV3]](s64) + ; CHECK-NEXT: $d2 = COPY [[UV4]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $d0, implicit $d1, implicit $d2 + %1:_(<4 x s16>) = COPY $d0 + %2:_(s16), %3:_(s16), %4:_(s16), %5:_(s16) = G_UNMERGE_VALUES %1(<4 x s16>) + %0:_(<3 x s16>) = G_BUILD_VECTOR %2(s16), %3(s16), %4(s16) + %6:_(<3 x s64>) = G_FPEXT %0(<3 x s16>) + %7:_(s64), %8:_(s64), %9:_(s64) = G_UNMERGE_VALUES %6(<3 x s64>) + 
$d0 = COPY %7(s64) + $d1 = COPY %8(s64) + $d2 = COPY %9(s64) + RET_ReallyLR implicit $d0, implicit $d1, implicit $d2 +... + +--- +name: fpext_v4f16_v4f64 +body: | + bb.0: + liveins: $d0 + + ; CHECK-LABEL: name: fpext_v4f16_v4f64 + ; CHECK: liveins: $d0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0 + ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(<4 x s32>) = G_FPEXT [[COPY]](<4 x s16>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[FPEXT]](<4 x s32>) + ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(<2 x s64>) = G_FPEXT [[UV]](<2 x s32>) + ; CHECK-NEXT: [[FPEXT2:%[0-9]+]]:_(<2 x s64>) = G_FPEXT [[UV1]](<2 x s32>) + ; CHECK-NEXT: $q0 = COPY [[FPEXT1]](<2 x s64>) + ; CHECK-NEXT: $q1 = COPY [[FPEXT2]](<2 x s64>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(<4 x s64>) = G_FPEXT %0(<4 x s16>) + %2:_(<2 x s64>), %3:_(<2 x s64>) = G_UNMERGE_VALUES %1(<4 x s64>) + $q0 = COPY %2(<2 x s64>) + $q1 = COPY %3(<2 x s64>) + RET_ReallyLR implicit $q0, implicit $q1 +... 
+ +--- +name: fpext_v8f16_v8f64 +body: | + bb.0: + liveins: $q0 + + ; CHECK-LABEL: name: fpext_v8f16_v8f64 + ; CHECK: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s16>), [[UV1:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[COPY]](<8 x s16>) + ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(<4 x s32>) = G_FPEXT [[UV]](<4 x s16>) + ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(<4 x s32>) = G_FPEXT [[UV1]](<4 x s16>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<2 x s32>), [[UV3:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[FPEXT]](<4 x s32>) + ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<2 x s32>), [[UV5:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[FPEXT1]](<4 x s32>) + ; CHECK-NEXT: [[FPEXT2:%[0-9]+]]:_(<2 x s64>) = G_FPEXT [[UV2]](<2 x s32>) + ; CHECK-NEXT: [[FPEXT3:%[0-9]+]]:_(<2 x s64>) = G_FPEXT [[UV3]](<2 x s32>) + ; CHECK-NEXT: [[FPEXT4:%[0-9]+]]:_(<2 x s64>) = G_FPEXT [[UV4]](<2 x s32>) + ; CHECK-NEXT: [[FPEXT5:%[0-9]+]]:_(<2 x s64>) = G_FPEXT [[UV5]](<2 x s32>) + ; CHECK-NEXT: $q0 = COPY [[FPEXT2]](<2 x s64>) + ; CHECK-NEXT: $q1 = COPY [[FPEXT3]](<2 x s64>) + ; CHECK-NEXT: $q2 = COPY [[FPEXT4]](<2 x s64>) + ; CHECK-NEXT: $q3 = COPY [[FPEXT5]](<2 x s64>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3 + %0:_(<8 x s16>) = COPY $q0 + %1:_(<8 x s64>) = G_FPEXT %0(<8 x s16>) + %2:_(<2 x s64>), %3:_(<2 x s64>), %4:_(<2 x s64>), %5:_(<2 x s64>) = G_UNMERGE_VALUES %1(<8 x s64>) + $q0 = COPY %2(<2 x s64>) + $q1 = COPY %3(<2 x s64>) + $q2 = COPY %4(<2 x s64>) + $q3 = COPY %5(<2 x s64>) + RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3 +... 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 896603d6eb20d..92b273c6141d1 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -555,8 +555,8 @@ # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FPEXT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices -# DEBUG-NEXT: .. the first uncovered type index: 2, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FPTRUNC (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 2, OK # DEBUG-NEXT: .. 
the first uncovered imm index: 0, OK diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll index a37aabb0b5384..12b6562b5cf0c 100644 --- a/llvm/test/CodeGen/AArch64/fmla.ll +++ b/llvm/test/CodeGen/AArch64/fmla.ll @@ -865,22 +865,22 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) { ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v3.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h -; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[4] ; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h ; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[5] -; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v4.4s -; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s -; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6] -; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h +; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v2.h[4] +; CHECK-GI-NOFP16-NEXT: fmul v3.4s, v3.4s, v4.4s +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[5] +; CHECK-GI-NOFP16-NEXT: fcvtn v4.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v3.4s +; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v2.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v4.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v3.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1] -; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v4.h[1] +; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v2.4s, v1.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[2] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[3] ; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] ; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1] ; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2] @@ -1350,22 +1350,22 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) { ; 
CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v3.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h -; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[4] ; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h ; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[5] -; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v4.4s -; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s -; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6] -; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h +; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v2.h[4] +; CHECK-GI-NOFP16-NEXT: fmul v3.4s, v3.4s, v4.4s +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[5] +; CHECK-GI-NOFP16-NEXT: fcvtn v4.4h, v0.4s +; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v3.4s +; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v2.h[6] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v4.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v3.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1] -; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v2.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2] +; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v4.h[1] +; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v2.4s, v1.4s +; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[2] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3] +; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[3] ; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0] ; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1] ; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2] diff --git a/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll index 6233ce743b706..1e1e25c04b384 100644 --- a/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll +++ b/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll @@ -241,30 +241,16 @@ define <4 x double> @h_to_d(<4 x half> %a) { ; ; CHECK-CVT-GI-LABEL: h_to_d: ; CHECK-CVT-GI: // %bb.0: -; CHECK-CVT-GI-NEXT: // kill: def $d0 killed $d0 def $q0 
-; CHECK-CVT-GI-NEXT: mov h1, v0.h[1] -; CHECK-CVT-GI-NEXT: mov h2, v0.h[2] -; CHECK-CVT-GI-NEXT: mov h3, v0.h[3] -; CHECK-CVT-GI-NEXT: fcvt d0, h0 -; CHECK-CVT-GI-NEXT: fcvt d4, h1 -; CHECK-CVT-GI-NEXT: fcvt d1, h2 -; CHECK-CVT-GI-NEXT: fcvt d2, h3 -; CHECK-CVT-GI-NEXT: mov v0.d[1], v4.d[0] -; CHECK-CVT-GI-NEXT: mov v1.d[1], v2.d[0] +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v0.2d, v1.2s +; CHECK-CVT-GI-NEXT: fcvtl2 v1.2d, v1.4s ; CHECK-CVT-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: h_to_d: ; CHECK-FP16-GI: // %bb.0: -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-FP16-GI-NEXT: mov h2, v0.h[2] -; CHECK-FP16-GI-NEXT: mov h3, v0.h[3] -; CHECK-FP16-GI-NEXT: fcvt d0, h0 -; CHECK-FP16-GI-NEXT: fcvt d4, h1 -; CHECK-FP16-GI-NEXT: fcvt d1, h2 -; CHECK-FP16-GI-NEXT: fcvt d2, h3 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v4.d[0] -; CHECK-FP16-GI-NEXT: mov v1.d[1], v2.d[0] +; CHECK-FP16-GI-NEXT: fcvtl v1.4s, v0.4h +; CHECK-FP16-GI-NEXT: fcvtl v0.2d, v1.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v1.2d, v1.4s ; CHECK-FP16-GI-NEXT: ret %1 = fpext <4 x half> %a to <4 x double> ret <4 x double> %1 diff --git a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll index 86763eb5f9e3b..7b152bcccf1e5 100644 --- a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll +++ b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll @@ -298,48 +298,22 @@ define <8 x double> @h_to_d(<8 x half> %a) { ; ; CHECK-CVT-GI-LABEL: h_to_d: ; CHECK-CVT-GI: // %bb.0: -; CHECK-CVT-GI-NEXT: mov h1, v0.h[1] -; CHECK-CVT-GI-NEXT: mov h2, v0.h[2] -; CHECK-CVT-GI-NEXT: mov h3, v0.h[3] -; CHECK-CVT-GI-NEXT: mov h4, v0.h[4] -; CHECK-CVT-GI-NEXT: mov h5, v0.h[5] -; CHECK-CVT-GI-NEXT: mov h6, v0.h[6] -; CHECK-CVT-GI-NEXT: mov h7, v0.h[7] -; CHECK-CVT-GI-NEXT: fcvt d0, h0 -; CHECK-CVT-GI-NEXT: fcvt d16, h1 -; CHECK-CVT-GI-NEXT: fcvt d1, h2 -; CHECK-CVT-GI-NEXT: fcvt d17, h3 -; CHECK-CVT-GI-NEXT: fcvt d2, h4 -; 
CHECK-CVT-GI-NEXT: fcvt d4, h5 -; CHECK-CVT-GI-NEXT: fcvt d3, h6 -; CHECK-CVT-GI-NEXT: fcvt d5, h7 -; CHECK-CVT-GI-NEXT: mov v0.d[1], v16.d[0] -; CHECK-CVT-GI-NEXT: mov v1.d[1], v17.d[0] -; CHECK-CVT-GI-NEXT: mov v2.d[1], v4.d[0] -; CHECK-CVT-GI-NEXT: mov v3.d[1], v5.d[0] +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v3.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl v0.2d, v1.2s +; CHECK-CVT-GI-NEXT: fcvtl2 v1.2d, v1.4s +; CHECK-CVT-GI-NEXT: fcvtl v2.2d, v3.2s +; CHECK-CVT-GI-NEXT: fcvtl2 v3.2d, v3.4s ; CHECK-CVT-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: h_to_d: ; CHECK-FP16-GI: // %bb.0: -; CHECK-FP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-FP16-GI-NEXT: mov h2, v0.h[2] -; CHECK-FP16-GI-NEXT: mov h3, v0.h[3] -; CHECK-FP16-GI-NEXT: mov h4, v0.h[4] -; CHECK-FP16-GI-NEXT: mov h5, v0.h[5] -; CHECK-FP16-GI-NEXT: mov h6, v0.h[6] -; CHECK-FP16-GI-NEXT: mov h7, v0.h[7] -; CHECK-FP16-GI-NEXT: fcvt d0, h0 -; CHECK-FP16-GI-NEXT: fcvt d16, h1 -; CHECK-FP16-GI-NEXT: fcvt d1, h2 -; CHECK-FP16-GI-NEXT: fcvt d17, h3 -; CHECK-FP16-GI-NEXT: fcvt d2, h4 -; CHECK-FP16-GI-NEXT: fcvt d4, h5 -; CHECK-FP16-GI-NEXT: fcvt d3, h6 -; CHECK-FP16-GI-NEXT: fcvt d5, h7 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v16.d[0] -; CHECK-FP16-GI-NEXT: mov v1.d[1], v17.d[0] -; CHECK-FP16-GI-NEXT: mov v2.d[1], v4.d[0] -; CHECK-FP16-GI-NEXT: mov v3.d[1], v5.d[0] +; CHECK-FP16-GI-NEXT: fcvtl v1.4s, v0.4h +; CHECK-FP16-GI-NEXT: fcvtl2 v3.4s, v0.8h +; CHECK-FP16-GI-NEXT: fcvtl v0.2d, v1.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v1.2d, v1.4s +; CHECK-FP16-GI-NEXT: fcvtl v2.2d, v3.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v3.2d, v3.4s ; CHECK-FP16-GI-NEXT: ret %1 = fpext <8 x half> %a to <8 x double> ret <8 x double> %1 diff --git a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll index 21c1b82dc4404..fd7c869fe2f92 100644 --- a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll @@ -285,31 +285,24 @@ define <4 x i32> @stest_f16i32(<4 x half> 
%x) { ; ; CHECK-FP16-GI-LABEL: stest_f16i32: ; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-FP16-GI-NEXT: mov h2, v0.h[2] +; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h ; CHECK-FP16-GI-NEXT: adrp x8, .LCPI6_1 -; CHECK-FP16-GI-NEXT: mov h3, v0.h[3] -; CHECK-FP16-GI-NEXT: fcvt d0, h0 -; CHECK-FP16-GI-NEXT: fcvt d1, h1 -; CHECK-FP16-GI-NEXT: fcvt d2, h2 -; CHECK-FP16-GI-NEXT: fcvt d3, h3 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-FP16-GI-NEXT: mov v2.d[1], v3.d[0] -; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v2.2d ; CHECK-FP16-GI-NEXT: ldr q2, [x8, :lo12:.LCPI6_1] ; CHECK-FP16-GI-NEXT: adrp x8, .LCPI6_0 -; CHECK-FP16-GI-NEXT: cmgt v3.2d, v2.2d, v0.2d -; CHECK-FP16-GI-NEXT: cmgt v4.2d, v2.2d, v1.2d -; CHECK-FP16-GI-NEXT: bif v0.16b, v2.16b, v3.16b -; CHECK-FP16-GI-NEXT: bif v1.16b, v2.16b, v4.16b +; CHECK-FP16-GI-NEXT: fcvtl v1.2d, v0.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v0.2d, v0.4s +; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v1.2d +; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-FP16-GI-NEXT: cmgt v3.2d, v2.2d, v1.2d +; CHECK-FP16-GI-NEXT: cmgt v4.2d, v2.2d, v0.2d +; CHECK-FP16-GI-NEXT: bif v1.16b, v2.16b, v3.16b +; CHECK-FP16-GI-NEXT: bif v0.16b, v2.16b, v4.16b ; CHECK-FP16-GI-NEXT: ldr q2, [x8, :lo12:.LCPI6_0] -; CHECK-FP16-GI-NEXT: cmgt v3.2d, v0.2d, v2.2d -; CHECK-FP16-GI-NEXT: cmgt v4.2d, v1.2d, v2.2d -; CHECK-FP16-GI-NEXT: bif v0.16b, v2.16b, v3.16b -; CHECK-FP16-GI-NEXT: bif v1.16b, v2.16b, v4.16b -; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-FP16-GI-NEXT: cmgt v3.2d, v1.2d, v2.2d +; CHECK-FP16-GI-NEXT: cmgt v4.2d, v0.2d, v2.2d +; CHECK-FP16-GI-NEXT: bif v1.16b, v2.16b, v3.16b +; CHECK-FP16-GI-NEXT: bif v0.16b, v2.16b, v4.16b +; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v1.4s, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %conv = fptosi <4 x half> %x to <4 x i64> @@ -351,24 +344,17 @@ define <4 x i32> @utest_f16i32(<4 x half> %x) { ; ; 
CHECK-FP16-GI-LABEL: utest_f16i32: ; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: mov h2, v0.h[1] -; CHECK-FP16-GI-NEXT: mov h3, v0.h[2] -; CHECK-FP16-GI-NEXT: mov h4, v0.h[3] -; CHECK-FP16-GI-NEXT: fcvt d0, h0 +; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h ; CHECK-FP16-GI-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-FP16-GI-NEXT: fcvt d2, h2 -; CHECK-FP16-GI-NEXT: fcvt d3, h3 -; CHECK-FP16-GI-NEXT: fcvt d4, h4 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v2.d[0] -; CHECK-FP16-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-FP16-GI-NEXT: fcvtl v2.2d, v0.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v0.2d, v0.4s +; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v2.2d ; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v3.2d -; CHECK-FP16-GI-NEXT: cmhi v3.2d, v1.2d, v0.2d -; CHECK-FP16-GI-NEXT: cmhi v4.2d, v1.2d, v2.2d -; CHECK-FP16-GI-NEXT: bif v0.16b, v1.16b, v3.16b -; CHECK-FP16-GI-NEXT: bit v1.16b, v2.16b, v4.16b -; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-FP16-GI-NEXT: cmhi v3.2d, v1.2d, v2.2d +; CHECK-FP16-GI-NEXT: cmhi v4.2d, v1.2d, v0.2d +; CHECK-FP16-GI-NEXT: bif v2.16b, v1.16b, v3.16b +; CHECK-FP16-GI-NEXT: bif v0.16b, v1.16b, v4.16b +; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v2.4s, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %conv = fptoui <4 x half> %x to <4 x i64> @@ -412,28 +398,21 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; ; CHECK-FP16-GI-LABEL: ustest_f16i32: ; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: mov h2, v0.h[1] -; CHECK-FP16-GI-NEXT: mov h3, v0.h[2] -; CHECK-FP16-GI-NEXT: mov h4, v0.h[3] -; CHECK-FP16-GI-NEXT: fcvt d0, h0 +; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h ; CHECK-FP16-GI-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-FP16-GI-NEXT: fcvt d2, h2 -; CHECK-FP16-GI-NEXT: fcvt d3, h3 -; CHECK-FP16-GI-NEXT: fcvt d4, h4 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v2.d[0] -; CHECK-FP16-GI-NEXT: mov v3.d[1], v4.d[0] +; 
CHECK-FP16-GI-NEXT: fcvtl v2.2d, v0.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v0.2d, v0.4s +; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v2.2d ; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v3.2d -; CHECK-FP16-GI-NEXT: cmgt v3.2d, v1.2d, v0.2d -; CHECK-FP16-GI-NEXT: cmgt v4.2d, v1.2d, v2.2d -; CHECK-FP16-GI-NEXT: bif v0.16b, v1.16b, v3.16b -; CHECK-FP16-GI-NEXT: bit v1.16b, v2.16b, v4.16b -; CHECK-FP16-GI-NEXT: cmgt v2.2d, v0.2d, #0 -; CHECK-FP16-GI-NEXT: cmgt v3.2d, v1.2d, #0 -; CHECK-FP16-GI-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-FP16-GI-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-FP16-GI-NEXT: cmgt v3.2d, v1.2d, v2.2d +; CHECK-FP16-GI-NEXT: cmgt v4.2d, v1.2d, v0.2d +; CHECK-FP16-GI-NEXT: bif v2.16b, v1.16b, v3.16b +; CHECK-FP16-GI-NEXT: bif v0.16b, v1.16b, v4.16b +; CHECK-FP16-GI-NEXT: cmgt v1.2d, v2.2d, #0 +; CHECK-FP16-GI-NEXT: cmgt v3.2d, v0.2d, #0 +; CHECK-FP16-GI-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-FP16-GI-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v1.4s, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %conv = fptosi <4 x half> %x to <4 x i64> @@ -2273,31 +2252,24 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) { ; ; CHECK-FP16-GI-LABEL: stest_f16i32_mm: ; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-FP16-GI-NEXT: mov h2, v0.h[2] +; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h ; CHECK-FP16-GI-NEXT: adrp x8, .LCPI33_1 -; CHECK-FP16-GI-NEXT: mov h3, v0.h[3] -; CHECK-FP16-GI-NEXT: fcvt d0, h0 -; CHECK-FP16-GI-NEXT: fcvt d1, h1 -; CHECK-FP16-GI-NEXT: fcvt d2, h2 -; CHECK-FP16-GI-NEXT: fcvt d3, h3 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-FP16-GI-NEXT: mov v2.d[1], v3.d[0] -; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v2.2d ; CHECK-FP16-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_1] ; CHECK-FP16-GI-NEXT: adrp x8, .LCPI33_0 -; CHECK-FP16-GI-NEXT: 
cmgt v3.2d, v2.2d, v0.2d -; CHECK-FP16-GI-NEXT: cmgt v4.2d, v2.2d, v1.2d -; CHECK-FP16-GI-NEXT: bif v0.16b, v2.16b, v3.16b -; CHECK-FP16-GI-NEXT: bif v1.16b, v2.16b, v4.16b +; CHECK-FP16-GI-NEXT: fcvtl v1.2d, v0.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v0.2d, v0.4s +; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v1.2d +; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-FP16-GI-NEXT: cmgt v3.2d, v2.2d, v1.2d +; CHECK-FP16-GI-NEXT: cmgt v4.2d, v2.2d, v0.2d +; CHECK-FP16-GI-NEXT: bif v1.16b, v2.16b, v3.16b +; CHECK-FP16-GI-NEXT: bif v0.16b, v2.16b, v4.16b ; CHECK-FP16-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_0] -; CHECK-FP16-GI-NEXT: cmgt v3.2d, v0.2d, v2.2d -; CHECK-FP16-GI-NEXT: cmgt v4.2d, v1.2d, v2.2d -; CHECK-FP16-GI-NEXT: bif v0.16b, v2.16b, v3.16b -; CHECK-FP16-GI-NEXT: bif v1.16b, v2.16b, v4.16b -; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-FP16-GI-NEXT: cmgt v3.2d, v1.2d, v2.2d +; CHECK-FP16-GI-NEXT: cmgt v4.2d, v0.2d, v2.2d +; CHECK-FP16-GI-NEXT: bif v1.16b, v2.16b, v3.16b +; CHECK-FP16-GI-NEXT: bif v0.16b, v2.16b, v4.16b +; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v1.4s, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %conv = fptosi <4 x half> %x to <4 x i64> @@ -2337,24 +2309,17 @@ define <4 x i32> @utest_f16i32_mm(<4 x half> %x) { ; ; CHECK-FP16-GI-LABEL: utest_f16i32_mm: ; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: mov h2, v0.h[1] -; CHECK-FP16-GI-NEXT: mov h3, v0.h[2] -; CHECK-FP16-GI-NEXT: mov h4, v0.h[3] -; CHECK-FP16-GI-NEXT: fcvt d0, h0 +; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h ; CHECK-FP16-GI-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-FP16-GI-NEXT: fcvt d2, h2 -; CHECK-FP16-GI-NEXT: fcvt d3, h3 -; CHECK-FP16-GI-NEXT: fcvt d4, h4 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v2.d[0] -; CHECK-FP16-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-FP16-GI-NEXT: fcvtl v2.2d, v0.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v0.2d, v0.4s +; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v2.2d ; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-FP16-GI-NEXT: 
fcvtzu v2.2d, v3.2d -; CHECK-FP16-GI-NEXT: cmhi v3.2d, v1.2d, v0.2d -; CHECK-FP16-GI-NEXT: cmhi v4.2d, v1.2d, v2.2d -; CHECK-FP16-GI-NEXT: bif v0.16b, v1.16b, v3.16b -; CHECK-FP16-GI-NEXT: bit v1.16b, v2.16b, v4.16b -; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-FP16-GI-NEXT: cmhi v3.2d, v1.2d, v2.2d +; CHECK-FP16-GI-NEXT: cmhi v4.2d, v1.2d, v0.2d +; CHECK-FP16-GI-NEXT: bif v2.16b, v1.16b, v3.16b +; CHECK-FP16-GI-NEXT: bif v0.16b, v1.16b, v4.16b +; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v2.4s, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %conv = fptoui <4 x half> %x to <4 x i64> @@ -2397,28 +2362,21 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; ; CHECK-FP16-GI-LABEL: ustest_f16i32_mm: ; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: mov h2, v0.h[1] -; CHECK-FP16-GI-NEXT: mov h3, v0.h[2] -; CHECK-FP16-GI-NEXT: mov h4, v0.h[3] -; CHECK-FP16-GI-NEXT: fcvt d0, h0 +; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h ; CHECK-FP16-GI-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-FP16-GI-NEXT: fcvt d2, h2 -; CHECK-FP16-GI-NEXT: fcvt d3, h3 -; CHECK-FP16-GI-NEXT: fcvt d4, h4 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v2.d[0] -; CHECK-FP16-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-FP16-GI-NEXT: fcvtl v2.2d, v0.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v0.2d, v0.4s +; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v2.2d ; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v3.2d -; CHECK-FP16-GI-NEXT: cmgt v3.2d, v1.2d, v0.2d -; CHECK-FP16-GI-NEXT: cmgt v4.2d, v1.2d, v2.2d -; CHECK-FP16-GI-NEXT: bif v0.16b, v1.16b, v3.16b -; CHECK-FP16-GI-NEXT: bit v1.16b, v2.16b, v4.16b -; CHECK-FP16-GI-NEXT: cmgt v2.2d, v0.2d, #0 -; CHECK-FP16-GI-NEXT: cmgt v3.2d, v1.2d, #0 -; CHECK-FP16-GI-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-FP16-GI-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-FP16-GI-NEXT: cmgt v3.2d, v1.2d, v2.2d +; CHECK-FP16-GI-NEXT: cmgt v4.2d, v1.2d, v0.2d +; 
CHECK-FP16-GI-NEXT: bif v2.16b, v1.16b, v3.16b +; CHECK-FP16-GI-NEXT: bif v0.16b, v1.16b, v4.16b +; CHECK-FP16-GI-NEXT: cmgt v1.2d, v2.2d, #0 +; CHECK-FP16-GI-NEXT: cmgt v3.2d, v0.2d, #0 +; CHECK-FP16-GI-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-FP16-GI-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v1.4s, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %conv = fptosi <4 x half> %x to <4 x i64> diff --git a/llvm/test/CodeGen/AArch64/fpext.ll b/llvm/test/CodeGen/AArch64/fpext.ll index 549f20f99acdb..7085bfdc52f40 100644 --- a/llvm/test/CodeGen/AArch64/fpext.ll +++ b/llvm/test/CodeGen/AArch64/fpext.ll @@ -82,11 +82,12 @@ define <3 x double> @fpext_v3f32_v3f64(<3 x float> %a) { ; ; CHECK-GI-LABEL: fpext_v3f32_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[2] +; CHECK-GI-NEXT: mov v1.s[0], v0.s[2] ; CHECK-GI-NEXT: fcvtl v0.2d, v0.2s -; CHECK-GI-NEXT: fcvt d2, s1 +; CHECK-GI-NEXT: fcvtl v2.2d, v1.2s ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 ; CHECK-GI-NEXT: ret entry: %c = fpext <3 x float> %a to <3 x double> @@ -320,20 +321,11 @@ entry: } define <2 x double> @fpext_v2f16_v2f64(<2 x half> %a) { -; CHECK-SD-LABEL: fpext_v2f16_v2f64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h -; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fpext_v2f16_v2f64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: fcvt d0, h0 -; CHECK-GI-NEXT: fcvt d1, h1 -; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fpext_v2f16_v2f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtl v0.4s, v0.4h +; CHECK-NEXT: fcvtl v0.2d, v0.2s +; CHECK-NEXT: ret entry: %c = fpext <2 x half> %a to <2 x double> ret <2 x double> %c @@ -353,12 +345,12 @@ define <3 x double> @fpext_v3f16_v3f64(<3 x half> %a) { ; ; 
CHECK-GI-LABEL: fpext_v3f16_v3f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: fcvt d0, h0 -; CHECK-GI-NEXT: fcvt d1, h1 -; CHECK-GI-NEXT: fcvt d2, h2 +; CHECK-GI-NEXT: fcvtl v1.4s, v0.4h +; CHECK-GI-NEXT: fcvtl v0.2d, v1.2s +; CHECK-GI-NEXT: fcvtl2 v2.2d, v1.4s +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: %c = fpext <3 x half> %a to <3 x double> @@ -375,16 +367,9 @@ define <4 x double> @fpext_v4f16_v4f64(<4 x half> %a) { ; ; CHECK-GI-LABEL: fpext_v4f16_v4f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: fcvt d0, h0 -; CHECK-GI-NEXT: fcvt d4, h1 -; CHECK-GI-NEXT: fcvt d1, h2 -; CHECK-GI-NEXT: fcvt d2, h3 -; CHECK-GI-NEXT: mov v0.d[1], v4.d[0] -; CHECK-GI-NEXT: mov v1.d[1], v2.d[0] +; CHECK-GI-NEXT: fcvtl v1.4s, v0.4h +; CHECK-GI-NEXT: fcvtl v0.2d, v1.2s +; CHECK-GI-NEXT: fcvtl2 v1.2d, v1.4s ; CHECK-GI-NEXT: ret entry: %c = fpext <4 x half> %a to <4 x double> diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll index ae6f796e0b394..d82bcb90c8975 100644 --- a/llvm/test/CodeGen/AArch64/fptoi.ll +++ b/llvm/test/CodeGen/AArch64/fptoi.ll @@ -4610,11 +4610,8 @@ define <2 x i64> @fptos_v2f16_v2i64(<2 x half> %a) { ; ; CHECK-FP16-GI-LABEL: fptos_v2f16_v2i64: ; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-FP16-GI-NEXT: fcvt d0, h0 -; CHECK-FP16-GI-NEXT: fcvt d1, h1 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-FP16-GI-NEXT: fcvtl v0.2d, v0.2s ; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d ; 
CHECK-FP16-GI-NEXT: ret entry: @@ -4654,11 +4651,8 @@ define <2 x i64> @fptou_v2f16_v2i64(<2 x half> %a) { ; ; CHECK-FP16-GI-LABEL: fptou_v2f16_v2i64: ; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-FP16-GI-NEXT: fcvt d0, h0 -; CHECK-FP16-GI-NEXT: fcvt d1, h1 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-FP16-GI-NEXT: fcvtl v0.2d, v0.2s ; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v0.2d ; CHECK-FP16-GI-NEXT: ret entry: @@ -4710,20 +4704,14 @@ define <3 x i64> @fptos_v3f16_v3i64(<3 x half> %a) { ; ; CHECK-FP16-GI-LABEL: fptos_v3f16_v3i64: ; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: mov h2, v0.h[1] -; CHECK-FP16-GI-NEXT: fcvt d1, h0 -; CHECK-FP16-GI-NEXT: mov h3, v0.h[2] -; CHECK-FP16-GI-NEXT: fcvt d0, h0 -; CHECK-FP16-GI-NEXT: fcvt d2, h2 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v2.d[0] -; CHECK-FP16-GI-NEXT: fcvt d2, h3 -; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-FP16-GI-NEXT: mov v2.d[1], v1.d[0] -; CHECK-FP16-GI-NEXT: mov d1, v0.d[1] +; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-FP16-GI-NEXT: fcvtl v1.2d, v0.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v2.2d, v0.4s +; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v1.2d ; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v2.2d -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-FP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-FP16-GI-NEXT: mov d1, v0.d[1] +; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-FP16-GI-NEXT: ret entry: %c = fptosi <3 x half> %a to <3 x i64> @@ -4774,20 +4762,14 @@ define <3 x i64> @fptou_v3f16_v3i64(<3 x half> %a) { ; ; CHECK-FP16-GI-LABEL: fptou_v3f16_v3i64: ; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: mov h2, v0.h[1] -; CHECK-FP16-GI-NEXT: fcvt d1, h0 -; CHECK-FP16-GI-NEXT: mov h3, 
v0.h[2] -; CHECK-FP16-GI-NEXT: fcvt d0, h0 -; CHECK-FP16-GI-NEXT: fcvt d2, h2 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v2.d[0] -; CHECK-FP16-GI-NEXT: fcvt d2, h3 -; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-FP16-GI-NEXT: mov v2.d[1], v1.d[0] -; CHECK-FP16-GI-NEXT: mov d1, v0.d[1] +; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-FP16-GI-NEXT: fcvtl v1.2d, v0.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v2.2d, v0.4s +; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v1.2d ; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v2.2d -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-FP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-FP16-GI-NEXT: mov d1, v0.d[1] +; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-FP16-GI-NEXT: ret entry: %c = fptoui <3 x half> %a to <3 x i64> @@ -4842,17 +4824,10 @@ define <4 x i64> @fptos_v4f16_v4i64(<4 x half> %a) { ; ; CHECK-FP16-GI-LABEL: fptos_v4f16_v4i64: ; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-FP16-GI-NEXT: mov h2, v0.h[2] -; CHECK-FP16-GI-NEXT: mov h3, v0.h[3] -; CHECK-FP16-GI-NEXT: fcvt d0, h0 -; CHECK-FP16-GI-NEXT: fcvt d1, h1 -; CHECK-FP16-GI-NEXT: fcvt d2, h2 -; CHECK-FP16-GI-NEXT: fcvt d3, h3 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-FP16-GI-NEXT: mov v2.d[1], v3.d[0] -; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-FP16-GI-NEXT: fcvtl v1.2d, v0.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v2.2d, v0.4s +; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v1.2d ; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v2.2d ; CHECK-FP16-GI-NEXT: ret entry: @@ -4908,17 +4883,10 @@ define <4 x i64> @fptou_v4f16_v4i64(<4 x half> %a) { ; ; CHECK-FP16-GI-LABEL: fptou_v4f16_v4i64: ; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-FP16-GI-NEXT: mov h2, v0.h[2] -; CHECK-FP16-GI-NEXT: mov h3, v0.h[3] -; CHECK-FP16-GI-NEXT: 
fcvt d0, h0 -; CHECK-FP16-GI-NEXT: fcvt d1, h1 -; CHECK-FP16-GI-NEXT: fcvt d2, h2 -; CHECK-FP16-GI-NEXT: fcvt d3, h3 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-FP16-GI-NEXT: mov v2.d[1], v3.d[0] -; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v0.2d +; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-FP16-GI-NEXT: fcvtl v1.2d, v0.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v2.2d, v0.4s +; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v1.2d ; CHECK-FP16-GI-NEXT: fcvtzu v1.2d, v2.2d ; CHECK-FP16-GI-NEXT: ret entry: @@ -5005,29 +4973,16 @@ define <8 x i64> @fptos_v8f16_v8i64(<8 x half> %a) { ; ; CHECK-FP16-GI-LABEL: fptos_v8f16_v8i64: ; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-FP16-GI-NEXT: mov h2, v0.h[2] -; CHECK-FP16-GI-NEXT: mov h3, v0.h[3] -; CHECK-FP16-GI-NEXT: mov h4, v0.h[4] -; CHECK-FP16-GI-NEXT: mov h5, v0.h[5] -; CHECK-FP16-GI-NEXT: mov h6, v0.h[6] -; CHECK-FP16-GI-NEXT: mov h7, v0.h[7] -; CHECK-FP16-GI-NEXT: fcvt d0, h0 -; CHECK-FP16-GI-NEXT: fcvt d1, h1 -; CHECK-FP16-GI-NEXT: fcvt d2, h2 -; CHECK-FP16-GI-NEXT: fcvt d3, h3 -; CHECK-FP16-GI-NEXT: fcvt d4, h4 -; CHECK-FP16-GI-NEXT: fcvt d5, h5 -; CHECK-FP16-GI-NEXT: fcvt d6, h6 -; CHECK-FP16-GI-NEXT: fcvt d7, h7 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-FP16-GI-NEXT: mov v2.d[1], v3.d[0] -; CHECK-FP16-GI-NEXT: mov v4.d[1], v5.d[0] -; CHECK-FP16-GI-NEXT: mov v6.d[1], v7.d[0] -; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v2.2d -; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v4.2d -; CHECK-FP16-GI-NEXT: fcvtzs v3.2d, v6.2d +; CHECK-FP16-GI-NEXT: fcvtl v1.4s, v0.4h +; CHECK-FP16-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-FP16-GI-NEXT: fcvtl v2.2d, v1.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v1.2d, v1.4s +; CHECK-FP16-GI-NEXT: fcvtl v3.2d, v0.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v4.2d, v0.4s +; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v2.2d +; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v1.2d +; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v3.2d +; CHECK-FP16-GI-NEXT: fcvtzs v3.2d, v4.2d ; 
CHECK-FP16-GI-NEXT: ret entry: %c = fptosi <8 x half> %a to <8 x i64> @@ -5113,29 +5068,16 @@ define <8 x i64> @fptou_v8f16_v8i64(<8 x half> %a) { ; ; CHECK-FP16-GI-LABEL: fptou_v8f16_v8i64: ; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: mov h1, v0.h[1] -; CHECK-FP16-GI-NEXT: mov h2, v0.h[2] -; CHECK-FP16-GI-NEXT: mov h3, v0.h[3] -; CHECK-FP16-GI-NEXT: mov h4, v0.h[4] -; CHECK-FP16-GI-NEXT: mov h5, v0.h[5] -; CHECK-FP16-GI-NEXT: mov h6, v0.h[6] -; CHECK-FP16-GI-NEXT: mov h7, v0.h[7] -; CHECK-FP16-GI-NEXT: fcvt d0, h0 -; CHECK-FP16-GI-NEXT: fcvt d1, h1 -; CHECK-FP16-GI-NEXT: fcvt d2, h2 -; CHECK-FP16-GI-NEXT: fcvt d3, h3 -; CHECK-FP16-GI-NEXT: fcvt d4, h4 -; CHECK-FP16-GI-NEXT: fcvt d5, h5 -; CHECK-FP16-GI-NEXT: fcvt d6, h6 -; CHECK-FP16-GI-NEXT: fcvt d7, h7 -; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-FP16-GI-NEXT: mov v2.d[1], v3.d[0] -; CHECK-FP16-GI-NEXT: mov v4.d[1], v5.d[0] -; CHECK-FP16-GI-NEXT: mov v6.d[1], v7.d[0] -; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-FP16-GI-NEXT: fcvtzu v1.2d, v2.2d -; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v4.2d -; CHECK-FP16-GI-NEXT: fcvtzu v3.2d, v6.2d +; CHECK-FP16-GI-NEXT: fcvtl v1.4s, v0.4h +; CHECK-FP16-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-FP16-GI-NEXT: fcvtl v2.2d, v1.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v1.2d, v1.4s +; CHECK-FP16-GI-NEXT: fcvtl v3.2d, v0.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v4.2d, v0.4s +; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v2.2d +; CHECK-FP16-GI-NEXT: fcvtzu v1.2d, v1.2d +; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v3.2d +; CHECK-FP16-GI-NEXT: fcvtzu v3.2d, v4.2d ; CHECK-FP16-GI-NEXT: ret entry: %c = fptoui <8 x half> %a to <8 x i64> @@ -5285,52 +5227,26 @@ define <16 x i64> @fptos_v16f16_v16i64(<16 x half> %a) { ; ; CHECK-FP16-GI-LABEL: fptos_v16f16_v16i64: ; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: mov h3, v0.h[1] -; CHECK-FP16-GI-NEXT: mov h4, v0.h[2] -; CHECK-FP16-GI-NEXT: mov h5, v0.h[3] -; CHECK-FP16-GI-NEXT: fcvt d2, h0 -; CHECK-FP16-GI-NEXT: mov h6, v0.h[4] -; 
CHECK-FP16-GI-NEXT: mov h7, v0.h[5] -; CHECK-FP16-GI-NEXT: mov h16, v0.h[6] -; CHECK-FP16-GI-NEXT: mov h0, v0.h[7] -; CHECK-FP16-GI-NEXT: mov h17, v1.h[1] -; CHECK-FP16-GI-NEXT: mov h18, v1.h[2] -; CHECK-FP16-GI-NEXT: mov h19, v1.h[3] -; CHECK-FP16-GI-NEXT: mov h20, v1.h[4] -; CHECK-FP16-GI-NEXT: mov h21, v1.h[5] -; CHECK-FP16-GI-NEXT: mov h22, v1.h[6] -; CHECK-FP16-GI-NEXT: mov h23, v1.h[7] -; CHECK-FP16-GI-NEXT: fcvt d3, h3 -; CHECK-FP16-GI-NEXT: fcvt d4, h4 -; CHECK-FP16-GI-NEXT: fcvt d5, h5 -; CHECK-FP16-GI-NEXT: fcvt d6, h6 -; CHECK-FP16-GI-NEXT: fcvt d7, h7 -; CHECK-FP16-GI-NEXT: fcvt d16, h16 -; CHECK-FP16-GI-NEXT: fcvt d0, h0 -; CHECK-FP16-GI-NEXT: fcvt d24, h1 -; CHECK-FP16-GI-NEXT: fcvt d1, h17 -; CHECK-FP16-GI-NEXT: fcvt d17, h18 -; CHECK-FP16-GI-NEXT: fcvt d18, h19 -; CHECK-FP16-GI-NEXT: fcvt d19, h20 -; CHECK-FP16-GI-NEXT: fcvt d20, h21 -; CHECK-FP16-GI-NEXT: fcvt d21, h22 -; CHECK-FP16-GI-NEXT: fcvt d22, h23 -; CHECK-FP16-GI-NEXT: mov v2.d[1], v3.d[0] -; CHECK-FP16-GI-NEXT: mov v4.d[1], v5.d[0] -; CHECK-FP16-GI-NEXT: mov v6.d[1], v7.d[0] -; CHECK-FP16-GI-NEXT: mov v16.d[1], v0.d[0] -; CHECK-FP16-GI-NEXT: mov v24.d[1], v1.d[0] -; CHECK-FP16-GI-NEXT: mov v17.d[1], v18.d[0] -; CHECK-FP16-GI-NEXT: mov v19.d[1], v20.d[0] -; CHECK-FP16-GI-NEXT: mov v21.d[1], v22.d[0] -; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v2.2d -; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v4.2d -; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v6.2d -; CHECK-FP16-GI-NEXT: fcvtzs v3.2d, v16.2d -; CHECK-FP16-GI-NEXT: fcvtzs v4.2d, v24.2d -; CHECK-FP16-GI-NEXT: fcvtzs v5.2d, v17.2d -; CHECK-FP16-GI-NEXT: fcvtzs v6.2d, v19.2d -; CHECK-FP16-GI-NEXT: fcvtzs v7.2d, v21.2d +; CHECK-FP16-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-FP16-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-FP16-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-FP16-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-FP16-GI-NEXT: fcvtl v4.2d, v2.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v2.2d, v2.4s +; CHECK-FP16-GI-NEXT: fcvtl v5.2d, v0.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v6.2d, v0.4s +; 
CHECK-FP16-GI-NEXT: fcvtl v7.2d, v3.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v16.2d, v3.4s +; CHECK-FP16-GI-NEXT: fcvtl v17.2d, v1.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v18.2d, v1.4s +; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v4.2d +; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v2.2d +; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v5.2d +; CHECK-FP16-GI-NEXT: fcvtzs v3.2d, v6.2d +; CHECK-FP16-GI-NEXT: fcvtzs v4.2d, v7.2d +; CHECK-FP16-GI-NEXT: fcvtzs v5.2d, v16.2d +; CHECK-FP16-GI-NEXT: fcvtzs v6.2d, v17.2d +; CHECK-FP16-GI-NEXT: fcvtzs v7.2d, v18.2d ; CHECK-FP16-GI-NEXT: ret entry: %c = fptosi <16 x half> %a to <16 x i64> @@ -5480,52 +5396,26 @@ define <16 x i64> @fptou_v16f16_v16i64(<16 x half> %a) { ; ; CHECK-FP16-GI-LABEL: fptou_v16f16_v16i64: ; CHECK-FP16-GI: // %bb.0: // %entry -; CHECK-FP16-GI-NEXT: mov h3, v0.h[1] -; CHECK-FP16-GI-NEXT: mov h4, v0.h[2] -; CHECK-FP16-GI-NEXT: mov h5, v0.h[3] -; CHECK-FP16-GI-NEXT: fcvt d2, h0 -; CHECK-FP16-GI-NEXT: mov h6, v0.h[4] -; CHECK-FP16-GI-NEXT: mov h7, v0.h[5] -; CHECK-FP16-GI-NEXT: mov h16, v0.h[6] -; CHECK-FP16-GI-NEXT: mov h0, v0.h[7] -; CHECK-FP16-GI-NEXT: mov h17, v1.h[1] -; CHECK-FP16-GI-NEXT: mov h18, v1.h[2] -; CHECK-FP16-GI-NEXT: mov h19, v1.h[3] -; CHECK-FP16-GI-NEXT: mov h20, v1.h[4] -; CHECK-FP16-GI-NEXT: mov h21, v1.h[5] -; CHECK-FP16-GI-NEXT: mov h22, v1.h[6] -; CHECK-FP16-GI-NEXT: mov h23, v1.h[7] -; CHECK-FP16-GI-NEXT: fcvt d3, h3 -; CHECK-FP16-GI-NEXT: fcvt d4, h4 -; CHECK-FP16-GI-NEXT: fcvt d5, h5 -; CHECK-FP16-GI-NEXT: fcvt d6, h6 -; CHECK-FP16-GI-NEXT: fcvt d7, h7 -; CHECK-FP16-GI-NEXT: fcvt d16, h16 -; CHECK-FP16-GI-NEXT: fcvt d0, h0 -; CHECK-FP16-GI-NEXT: fcvt d24, h1 -; CHECK-FP16-GI-NEXT: fcvt d1, h17 -; CHECK-FP16-GI-NEXT: fcvt d17, h18 -; CHECK-FP16-GI-NEXT: fcvt d18, h19 -; CHECK-FP16-GI-NEXT: fcvt d19, h20 -; CHECK-FP16-GI-NEXT: fcvt d20, h21 -; CHECK-FP16-GI-NEXT: fcvt d21, h22 -; CHECK-FP16-GI-NEXT: fcvt d22, h23 -; CHECK-FP16-GI-NEXT: mov v2.d[1], v3.d[0] -; CHECK-FP16-GI-NEXT: mov v4.d[1], v5.d[0] -; CHECK-FP16-GI-NEXT: mov 
v6.d[1], v7.d[0] -; CHECK-FP16-GI-NEXT: mov v16.d[1], v0.d[0] -; CHECK-FP16-GI-NEXT: mov v24.d[1], v1.d[0] -; CHECK-FP16-GI-NEXT: mov v17.d[1], v18.d[0] -; CHECK-FP16-GI-NEXT: mov v19.d[1], v20.d[0] -; CHECK-FP16-GI-NEXT: mov v21.d[1], v22.d[0] -; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v2.2d -; CHECK-FP16-GI-NEXT: fcvtzu v1.2d, v4.2d -; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v6.2d -; CHECK-FP16-GI-NEXT: fcvtzu v3.2d, v16.2d -; CHECK-FP16-GI-NEXT: fcvtzu v4.2d, v24.2d -; CHECK-FP16-GI-NEXT: fcvtzu v5.2d, v17.2d -; CHECK-FP16-GI-NEXT: fcvtzu v6.2d, v19.2d -; CHECK-FP16-GI-NEXT: fcvtzu v7.2d, v21.2d +; CHECK-FP16-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-FP16-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-FP16-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-FP16-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-FP16-GI-NEXT: fcvtl v4.2d, v2.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v2.2d, v2.4s +; CHECK-FP16-GI-NEXT: fcvtl v5.2d, v0.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v6.2d, v0.4s +; CHECK-FP16-GI-NEXT: fcvtl v7.2d, v3.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v16.2d, v3.4s +; CHECK-FP16-GI-NEXT: fcvtl v17.2d, v1.2s +; CHECK-FP16-GI-NEXT: fcvtl2 v18.2d, v1.4s +; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v4.2d +; CHECK-FP16-GI-NEXT: fcvtzu v1.2d, v2.2d +; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v5.2d +; CHECK-FP16-GI-NEXT: fcvtzu v3.2d, v6.2d +; CHECK-FP16-GI-NEXT: fcvtzu v4.2d, v7.2d +; CHECK-FP16-GI-NEXT: fcvtzu v5.2d, v16.2d +; CHECK-FP16-GI-NEXT: fcvtzu v6.2d, v17.2d +; CHECK-FP16-GI-NEXT: fcvtzu v7.2d, v18.2d ; CHECK-FP16-GI-NEXT: ret entry: %c = fptoui <16 x half> %a to <16 x i64> diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll index 6eeceb2e41321..2417205759767 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -3088,30 +3088,14 @@ define <4 x i64> @test_signed_v4f16_v4i64(<4 x half> %f) { ; CHECK-SD-FP16-NEXT: mov v1.d[1], x11 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-CVT-LABEL: test_signed_v4f16_v4i64: -; CHECK-GI-CVT: 
// %bb.0: -; CHECK-GI-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-GI-CVT-NEXT: fcvtl v1.2d, v0.2s -; CHECK-GI-CVT-NEXT: fcvtl2 v2.2d, v0.4s -; CHECK-GI-CVT-NEXT: fcvtzs v0.2d, v1.2d -; CHECK-GI-CVT-NEXT: fcvtzs v1.2d, v2.2d -; CHECK-GI-CVT-NEXT: ret -; -; CHECK-GI-FP16-LABEL: test_signed_v4f16_v4i64: -; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] -; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] -; CHECK-GI-FP16-NEXT: fcvt d0, h0 -; CHECK-GI-FP16-NEXT: fcvt d1, h1 -; CHECK-GI-FP16-NEXT: fcvt d2, h2 -; CHECK-GI-FP16-NEXT: fcvt d3, h3 -; CHECK-GI-FP16-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-FP16-NEXT: mov v2.d[1], v3.d[0] -; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v2.2d -; CHECK-GI-FP16-NEXT: ret +; CHECK-GI-LABEL: test_signed_v4f16_v4i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-GI-NEXT: fcvtl v1.2d, v0.2s +; CHECK-GI-NEXT: fcvtl2 v2.2d, v0.4s +; CHECK-GI-NEXT: fcvtzs v0.2d, v1.2d +; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d +; CHECK-GI-NEXT: ret %x = call <4 x i64> @llvm.fptosi.sat.v4f16.v4i64(<4 x half> %f) ret <4 x i64> %x } @@ -3797,46 +3781,19 @@ define <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) { ; CHECK-SD-FP16-NEXT: mov v3.d[1], x14 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-CVT-LABEL: test_signed_v8f16_v8i64: -; CHECK-GI-CVT: // %bb.0: -; CHECK-GI-CVT-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-CVT-NEXT: fcvtl2 v0.4s, v0.8h -; CHECK-GI-CVT-NEXT: fcvtl v2.2d, v1.2s -; CHECK-GI-CVT-NEXT: fcvtl2 v1.2d, v1.4s -; CHECK-GI-CVT-NEXT: fcvtl v3.2d, v0.2s -; CHECK-GI-CVT-NEXT: fcvtl2 v4.2d, v0.4s -; CHECK-GI-CVT-NEXT: fcvtzs v0.2d, v2.2d -; CHECK-GI-CVT-NEXT: fcvtzs v1.2d, v1.2d -; CHECK-GI-CVT-NEXT: fcvtzs v2.2d, v3.2d -; CHECK-GI-CVT-NEXT: fcvtzs v3.2d, v4.2d -; CHECK-GI-CVT-NEXT: ret -; -; CHECK-GI-FP16-LABEL: test_signed_v8f16_v8i64: -; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] -; 
CHECK-GI-FP16-NEXT: mov h2, v0.h[2] -; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] -; CHECK-GI-FP16-NEXT: mov h4, v0.h[4] -; CHECK-GI-FP16-NEXT: mov h5, v0.h[5] -; CHECK-GI-FP16-NEXT: mov h6, v0.h[6] -; CHECK-GI-FP16-NEXT: mov h7, v0.h[7] -; CHECK-GI-FP16-NEXT: fcvt d0, h0 -; CHECK-GI-FP16-NEXT: fcvt d1, h1 -; CHECK-GI-FP16-NEXT: fcvt d2, h2 -; CHECK-GI-FP16-NEXT: fcvt d3, h3 -; CHECK-GI-FP16-NEXT: fcvt d4, h4 -; CHECK-GI-FP16-NEXT: fcvt d5, h5 -; CHECK-GI-FP16-NEXT: fcvt d6, h6 -; CHECK-GI-FP16-NEXT: fcvt d7, h7 -; CHECK-GI-FP16-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-FP16-NEXT: mov v2.d[1], v3.d[0] -; CHECK-GI-FP16-NEXT: mov v4.d[1], v5.d[0] -; CHECK-GI-FP16-NEXT: mov v6.d[1], v7.d[0] -; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v2.2d -; CHECK-GI-FP16-NEXT: fcvtzs v2.2d, v4.2d -; CHECK-GI-FP16-NEXT: fcvtzs v3.2d, v6.2d -; CHECK-GI-FP16-NEXT: ret +; CHECK-GI-LABEL: test_signed_v8f16_v8i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fcvtl v1.4s, v0.4h +; CHECK-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-GI-NEXT: fcvtl v2.2d, v1.2s +; CHECK-GI-NEXT: fcvtl2 v1.2d, v1.4s +; CHECK-GI-NEXT: fcvtl v3.2d, v0.2s +; CHECK-GI-NEXT: fcvtl2 v4.2d, v0.4s +; CHECK-GI-NEXT: fcvtzs v0.2d, v2.2d +; CHECK-GI-NEXT: fcvtzs v1.2d, v1.2d +; CHECK-GI-NEXT: fcvtzs v2.2d, v3.2d +; CHECK-GI-NEXT: fcvtzs v3.2d, v4.2d +; CHECK-GI-NEXT: ret %x = call <8 x i64> @llvm.fptosi.sat.v8f16.v8i64(<8 x half> %f) ret <8 x i64> %x } diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll index cd16988f10313..ecca1165753bf 100644 --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -2506,30 +2506,14 @@ define <4 x i64> @test_unsigned_v4f16_v4i64(<4 x half> %f) { ; CHECK-SD-FP16-NEXT: mov v1.d[1], x11 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-CVT-LABEL: test_unsigned_v4f16_v4i64: -; CHECK-GI-CVT: // %bb.0: -; CHECK-GI-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-GI-CVT-NEXT: fcvtl v1.2d, 
v0.2s -; CHECK-GI-CVT-NEXT: fcvtl2 v2.2d, v0.4s -; CHECK-GI-CVT-NEXT: fcvtzu v0.2d, v1.2d -; CHECK-GI-CVT-NEXT: fcvtzu v1.2d, v2.2d -; CHECK-GI-CVT-NEXT: ret -; -; CHECK-GI-FP16-LABEL: test_unsigned_v4f16_v4i64: -; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] -; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] -; CHECK-GI-FP16-NEXT: fcvt d0, h0 -; CHECK-GI-FP16-NEXT: fcvt d1, h1 -; CHECK-GI-FP16-NEXT: fcvt d2, h2 -; CHECK-GI-FP16-NEXT: fcvt d3, h3 -; CHECK-GI-FP16-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-FP16-NEXT: mov v2.d[1], v3.d[0] -; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v2.2d -; CHECK-GI-FP16-NEXT: ret +; CHECK-GI-LABEL: test_unsigned_v4f16_v4i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-GI-NEXT: fcvtl v1.2d, v0.2s +; CHECK-GI-NEXT: fcvtl2 v2.2d, v0.4s +; CHECK-GI-NEXT: fcvtzu v0.2d, v1.2d +; CHECK-GI-NEXT: fcvtzu v1.2d, v2.2d +; CHECK-GI-NEXT: ret %x = call <4 x i64> @llvm.fptoui.sat.v4f16.v4i64(<4 x half> %f) ret <4 x i64> %x } @@ -3114,46 +3098,19 @@ define <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) { ; CHECK-SD-FP16-NEXT: mov v3.d[1], x14 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-CVT-LABEL: test_unsigned_v8f16_v8i64: -; CHECK-GI-CVT: // %bb.0: -; CHECK-GI-CVT-NEXT: fcvtl v1.4s, v0.4h -; CHECK-GI-CVT-NEXT: fcvtl2 v0.4s, v0.8h -; CHECK-GI-CVT-NEXT: fcvtl v2.2d, v1.2s -; CHECK-GI-CVT-NEXT: fcvtl2 v1.2d, v1.4s -; CHECK-GI-CVT-NEXT: fcvtl v3.2d, v0.2s -; CHECK-GI-CVT-NEXT: fcvtl2 v4.2d, v0.4s -; CHECK-GI-CVT-NEXT: fcvtzu v0.2d, v2.2d -; CHECK-GI-CVT-NEXT: fcvtzu v1.2d, v1.2d -; CHECK-GI-CVT-NEXT: fcvtzu v2.2d, v3.2d -; CHECK-GI-CVT-NEXT: fcvtzu v3.2d, v4.2d -; CHECK-GI-CVT-NEXT: ret -; -; CHECK-GI-FP16-LABEL: test_unsigned_v8f16_v8i64: -; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] -; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] -; 
CHECK-GI-FP16-NEXT: mov h4, v0.h[4] -; CHECK-GI-FP16-NEXT: mov h5, v0.h[5] -; CHECK-GI-FP16-NEXT: mov h6, v0.h[6] -; CHECK-GI-FP16-NEXT: mov h7, v0.h[7] -; CHECK-GI-FP16-NEXT: fcvt d0, h0 -; CHECK-GI-FP16-NEXT: fcvt d1, h1 -; CHECK-GI-FP16-NEXT: fcvt d2, h2 -; CHECK-GI-FP16-NEXT: fcvt d3, h3 -; CHECK-GI-FP16-NEXT: fcvt d4, h4 -; CHECK-GI-FP16-NEXT: fcvt d5, h5 -; CHECK-GI-FP16-NEXT: fcvt d6, h6 -; CHECK-GI-FP16-NEXT: fcvt d7, h7 -; CHECK-GI-FP16-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-FP16-NEXT: mov v2.d[1], v3.d[0] -; CHECK-GI-FP16-NEXT: mov v4.d[1], v5.d[0] -; CHECK-GI-FP16-NEXT: mov v6.d[1], v7.d[0] -; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v0.2d -; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v2.2d -; CHECK-GI-FP16-NEXT: fcvtzu v2.2d, v4.2d -; CHECK-GI-FP16-NEXT: fcvtzu v3.2d, v6.2d -; CHECK-GI-FP16-NEXT: ret +; CHECK-GI-LABEL: test_unsigned_v8f16_v8i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fcvtl v1.4s, v0.4h +; CHECK-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-GI-NEXT: fcvtl v2.2d, v1.2s +; CHECK-GI-NEXT: fcvtl2 v1.2d, v1.4s +; CHECK-GI-NEXT: fcvtl v3.2d, v0.2s +; CHECK-GI-NEXT: fcvtl2 v4.2d, v0.4s +; CHECK-GI-NEXT: fcvtzu v0.2d, v2.2d +; CHECK-GI-NEXT: fcvtzu v1.2d, v1.2d +; CHECK-GI-NEXT: fcvtzu v2.2d, v3.2d +; CHECK-GI-NEXT: fcvtzu v3.2d, v4.2d +; CHECK-GI-NEXT: ret %x = call <8 x i64> @llvm.fptoui.sat.v8f16.v8i64(<8 x half> %f) ret <8 x i64> %x } diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index ff74d1f71616d..88e3c86c791de 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2549,17 +2549,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164_ITERATIVE-LABEL: add_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v2, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 @@ -2606,7 +2606,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1132_ITERATIVE-LABEL: add_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2614,8 +2614,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s1 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; 
GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 @@ -2659,8 +2659,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264_ITERATIVE-LABEL: add_i64_varying: ; GFX1264_ITERATIVE: ; %bb.0: ; %entry -; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2668,8 +2668,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1] ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s8 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s8 ; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8 ; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 @@ -2714,7 +2714,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232_ITERATIVE-LABEL: add_i64_varying: ; GFX1232_ITERATIVE: ; %bb.0: ; %entry -; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2723,8 +2723,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 
0xfffe -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s1 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 @@ -6930,15 +6930,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164_ITERATIVE-LABEL: sub_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[8:9], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2 @@ -7087,8 +7087,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264_ITERATIVE-LABEL: sub_i64_varying: ; GFX1264_ITERATIVE: ; %bb.0: ; %entry -; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -7096,8 +7096,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; 
GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1] ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s8 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s8 ; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8 ; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 @@ -7142,7 +7142,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232_ITERATIVE-LABEL: sub_i64_varying: ; GFX1232_ITERATIVE: ; %bb.0: ; %entry -; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -7151,8 +7151,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s1 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index f5ca24f59a286..12517c2bc1b5d 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -2181,17 +2181,17 @@ define amdgpu_kernel void 
@add_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: add_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 @@ -2233,7 +2233,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: add_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2241,8 +2241,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: 
v_readlane_b32 s7, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 @@ -2982,19 +2982,20 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; ; GFX1164_ITERATIVE-LABEL: add_i64_varying_nouse: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s4, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s4 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s5 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -5594,17 +5595,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: sub_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 
s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 @@ -5646,7 +5647,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: sub_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5654,8 +5655,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 @@ -7063,17 +7064,17 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: 
and_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s8 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 @@ -7113,7 +7114,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: and_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -7121,8 +7122,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: 
v_readlane_b32 s7, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 @@ -8411,17 +8412,17 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: or_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s8 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 @@ -8461,7 +8462,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: or_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -8469,8 +8470,8 @@ define amdgpu_kernel void @or_i64_varying(ptr 
addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 @@ -9759,17 +9760,17 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: xor_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s8 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 @@ -9809,7 +9810,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: xor_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: 
v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -9817,8 +9818,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 @@ -11380,8 +11381,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: max_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s0, 0 @@ -11391,8 +11392,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s10 ; GFX1164_ITERATIVE-NEXT: 
v_writelane_b32 v1, s1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 ; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[0:1], s[6:7] @@ -11437,7 +11438,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: max_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 @@ -11447,8 +11448,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[0:1], s[6:7] @@ -13205,8 +13206,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: min_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s0, -1 @@ -13216,8 +13217,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: 
s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 ; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], s[6:7] @@ -13262,7 +13263,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: min_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 @@ -13272,8 +13273,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[0:1], s[6:7] @@ -15021,8 +15022,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: umax_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -15030,17 +15031,17 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -15077,7 +15078,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: umax_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 
s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -15086,8 +15087,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) @@ -16832,8 +16833,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: umin_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -16841,17 +16842,17 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s10 ; GFX1164_ITERATIVE-NEXT: 
v_writelane_b32 v1, s1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -16888,7 +16889,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: umin_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -16897,8 +16898,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 
2b63a8cf69476..b1557b33110b9 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -436,7 +436,6 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] ; GCN-O0-NEXT: s_mov_b32 s4, 2 ; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s4, v0 -; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 ; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -666,7 +665,6 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 2 ; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s0, v0 -; GCN-O0-NEXT: s_mov_b32 s1, 0 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 ; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index df7f8c6f39b3f..676f11b99f922 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -220,27 +220,24 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v20, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v0 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v11 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 @@ -253,14 +250,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v0, v2, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v0, v13, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v0, v14, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v19, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[9:10], s[4:5] +; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], s[4:5] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] ; GFX9-O0-NEXT: 
v_mov_b32_e32 v2, v5 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] @@ -271,140 +268,136 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v19, v0, s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v13, v0, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v21 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v17, vcc, s10, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v7, v8, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v7, v12, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v14, vcc +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v14, v0, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v21 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v17, vcc, s10, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v4, v8, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v4, v13, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v9, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v18 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[20:21], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[4:5] +; GFX9-O0-NEXT: 
v_cndmask_b32_e64 v10, v8, v10, s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v14, v7, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v12, v7, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-O0-NEXT: v_xor_b32_e64 v14, v14, v19 -; GFX9-O0-NEXT: v_xor_b32_e64 v12, v12, v13 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v9, v4, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v13, v4, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: v_xor_b32_e64 v9, v9, v19 +; GFX9-O0-NEXT: v_xor_b32_e64 v13, v13, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_ashrrev_i64 v[12:13], s4, v[12:13] -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[13:14] +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: 
buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v18 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 -; 
GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v18 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v12, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[12:13], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v12, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[13:14], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[13:14], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[10:11], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[11:12], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 ; GFX9-O0-NEXT: s_mov_b32 s12, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s12 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 -; GFX9-O0-NEXT: v_min_u32_e64 v7, v7, v8 -; GFX9-O0-NEXT: s_mov_b32 s13, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s12 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 -; GFX9-O0-NEXT: v_min_u32_e64 v12, v6, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: v_min_u32_e64 v8, v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s12 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v10 +; GFX9-O0-NEXT: v_min_u32_e64 v13, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v4 ; GFX9-O0-NEXT: s_mov_b64 s[14:15], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 ; GFX9-O0-NEXT: s_mov_b32 s16, s14 -; GFX9-O0-NEXT: 
v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_mov_b32 s13, s15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v9, s[16:17], v9, s16 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s13 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[16:17], v6, v10, s[16:17] -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v7, v8, s[8:9] +; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[16:17], v10, s16 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s13 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[16:17], v7, v11, s[16:17] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[8:9] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[5:6], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s12 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 +; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 -; GFX9-O0-NEXT: 
v_add_u32_e64 v4, v4, s12 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 -; GFX9-O0-NEXT: v_min_u32_e64 v11, v4, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v2 +; GFX9-O0-NEXT: v_add_u32_e64 v10, v10, s12 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v3 +; GFX9-O0-NEXT: v_min_u32_e64 v11, v10, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v11 @@ -515,30 +508,30 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 
4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -553,22 +546,22 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, 
i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_9 ; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] @@ -608,27 +601,27 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte 
Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 @@ 
-638,30 +631,30 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 ; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload 
+; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 ; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] @@ -792,24 +785,24 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; 
GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 @@ -819,30 +812,30 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded 
Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 @@ -850,14 +843,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; 
GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload @@ -933,41 +926,41 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 ; GFX9-O0-NEXT: 
v_mov_b32_e32 v14, s4 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte 
Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -1006,14 +999,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 ; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] @@ -1055,12 +1048,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5] ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, 
s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 @@ -1074,18 +1067,18 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] @@ -1098,12 +1091,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_7 ; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -1143,7 +1136,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, 
off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -2422,21 +2415,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -2451,17 +2444,17 @@ define i128 
@v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v5, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 @@ -2470,64 +2463,60 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v8, v2, v0 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v2, v0 +; GFX9-O0-NEXT: ; kill: def $vgpr9 
killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 -; GFX9-O0-NEXT: s_mov_b32 s8, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 -; GFX9-O0-NEXT: s_mov_b32 s9, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 +; GFX9-O0-NEXT: s_mov_b32 s8, 32 ; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v4, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v16, v5, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v4 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-O0-NEXT: s_mov_b32 s12, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 ; GFX9-O0-NEXT: s_mov_b32 s9, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v7, s[12:13], v7, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[12:13], v4, v8, s[12:13] -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; 
GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[10:11], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v5, v6, s[12:13] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[14:15], s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v6, v7, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 +; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 -; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v2 +; GFX9-O0-NEXT: v_add_u32_e64 v10, v10, s8 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v3 +; GFX9-O0-NEXT: v_min_u32_e64 v14, v10, v11 ; GFX9-O0-NEXT: ; kill: 
def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 5cc68451d5ab7..22a26f373927e 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -13,8 +13,8 @@ define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr ad ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_mov_b32 m0, -1 @@ -60,8 +60,8 @@ define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr ad ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -115,8 +115,8 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[2:3] @@ -176,8 +176,8 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) 
; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[2:3] @@ -403,8 +403,8 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -458,8 +458,8 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[2:3] @@ -520,8 +520,8 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[2:3] @@ -591,8 +591,8 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[2:3] @@ -740,8 +740,8 @@ define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr ad ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: 
v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_mov_b32 m0, -1 @@ -788,8 +788,8 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 @@ -844,8 +844,8 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 @@ -942,8 +942,8 @@ define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr ad ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index f5d7bb3a45fe1..8fcf1ad3fbc95 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -5028,7 +5028,6 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: s_mov_b32 s4, 2 ; NOOPT-NEXT: s_waitcnt 
expcnt(0) ; NOOPT-NEXT: v_lshlrev_b32_e64 v0, s4, v0 -; NOOPT-NEXT: s_mov_b32 s4, 0 ; NOOPT-NEXT: v_mov_b32_e32 v2, 0 ; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v1, v2 @@ -5795,7 +5794,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_mov_b32 s20, 2 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: v_lshlrev_b32_e64 v0, s20, v0 -; NOOPT-NEXT: s_mov_b32 s20, 0 ; NOOPT-NEXT: v_mov_b32_e32 v2, 0 ; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 5b6fc6ae2cb91..69ed10c7c02a9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -497,10 +497,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 -; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 -; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1030-NEXT: v_mov_b32_e32 v7, 0x48004700 +; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 +; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 +; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_add_co_u32 v0, s0, s0, v2 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 @@ -810,11 +810,11 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 +; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700 +; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500 +; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 ; 
GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200 -; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500 -; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_add_co_u32 v0, s4, s6, v0 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll index 447a5f20748f3..dc9e23b0a457d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll @@ -270,25 +270,45 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) { ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: raw_atomic_buffer_load_i64: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB5_1: ; %bb1 -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: ; %bb2 -; GFX12-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_i64: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-SDAG-TRUE16-NEXT: .LBB5_1: ; %bb1 +; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT +; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_i64: +; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-GISEL-TRUE16-NEXT: .LBB5_1: ; %bb1 +; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT +; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2 +; GFX12-GISEL-TRUE16-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() %id.zext = zext i32 %id to i64 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll index 2e0e42026d75c..7e8f490e6a7f9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll @@ -270,25 +270,45 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr) ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: raw_ptr_atomic_buffer_load_i64: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB5_1: ; %bb1 -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: ; %bb2 -; GFX12-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i64: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-SDAG-TRUE16-NEXT: .LBB5_1: ; %bb1 +; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT +; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2 +; 
GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-GISEL-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i64: +; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-GISEL-TRUE16-NEXT: .LBB5_1: ; %bb1 +; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT +; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2 +; GFX12-GISEL-TRUE16-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() %id.zext = zext i32 %id to i64 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll index ebb3368414ef2..1070e95cda783 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll @@ -352,28 +352,51 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: struct_atomic_buffer_load_i64: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 
-; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: .LBB6_1: ; %bb1 -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 -; GFX12-NEXT: ; %bb.2: ; %bb2 -; GFX12-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: struct_atomic_buffer_load_i64: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1 +; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-SDAG-TRUE16-NEXT: .LBB6_1: ; %bb1 +; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT +; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-GISEL-TRUE16-LABEL: struct_atomic_buffer_load_i64: +; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX12-GISEL-TRUE16-NEXT: s_clause 0x1 +; GFX12-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; 
GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-GISEL-TRUE16-NEXT: .LBB6_1: ; %bb1 +; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT +; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2 +; GFX12-GISEL-TRUE16-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() %id.zext = zext i32 %id to i64 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll index 40be5673543fb..1e4b43d1f4fce 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll @@ -352,28 +352,51 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: struct_ptr_atomic_buffer_load_i64: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: .LBB6_1: ; %bb1 -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen 
offset:4 th:TH_LOAD_NT -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 -; GFX12-NEXT: ; %bb.2: ; %bb2 -; GFX12-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: struct_ptr_atomic_buffer_load_i64: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1 +; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-SDAG-TRUE16-NEXT: .LBB6_1: ; %bb1 +; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT +; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-GISEL-TRUE16-LABEL: struct_ptr_atomic_buffer_load_i64: +; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX12-GISEL-TRUE16-NEXT: s_clause 0x1 +; GFX12-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 
0x0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-GISEL-TRUE16-NEXT: .LBB6_1: ; %bb1 +; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT +; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2 +; GFX12-GISEL-TRUE16-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() %id.zext = zext i32 %id to i64 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll index e8eccb0e408c1..fd59fddda0c07 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -171,7 +171,6 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { ; GFX8-NOOPT-NEXT: s_nop 1 ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf ; GFX8-NOOPT-NEXT: v_add_u32_e64 v2, s[0:1], v0, v1 -; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, v0 @@ -264,7 +263,6 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 @@ -358,7 +356,6 @@ define amdgpu_kernel void 
@update_dppf64_test(ptr addrspace(1) %arg, double %in1 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 @@ -452,7 +449,6 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 @@ -546,7 +542,6 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 @@ -640,7 +635,6 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 @@ -882,7 +876,6 @@ define amdgpu_kernel void @update_dppi64_imm_old_test(ptr addrspace(1) %arg, i64 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3 ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s0, v0 -; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0 ; GFX8-NOOPT-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 @@ -978,7 +971,6 @@ define amdgpu_kernel void @update_dppf64_imm_old_test(ptr addrspace(1) %arg, dou ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3 ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s0, v0 -; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll index 7d3b316915923..c98feeb96232d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -746,9 +746,9 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX9-NEXT: s_mov_b64 exec, 0 ; GFX9-NEXT: .LBB6_6: ; %.continue1 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX9-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB6_7: ; GFX9-NEXT: s_mov_b64 exec, 0 @@ -792,9 +792,9 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-32-NEXT: .LBB6_6: ; %.continue1 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX10-32-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX10-32-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX10-32-NEXT: s_endpgm ; GFX10-32-NEXT: .LBB6_7: ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 @@ -838,9 +838,9 
@@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-64-NEXT: s_mov_b64 exec, 0 ; GFX10-64-NEXT: .LBB6_6: ; %.continue1 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX10-64-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX10-64-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX10-64-NEXT: s_endpgm ; GFX10-64-NEXT: .LBB6_7: ; GFX10-64-NEXT: s_mov_b64 exec, 0 @@ -1005,9 +1005,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: .LBB7_8: ; %.return ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX9-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB7_9: ; GFX9-NEXT: s_mov_b64 exec, 0 @@ -1068,9 +1068,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: .LBB7_8: ; %.return ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 -; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX10-32-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX10-32-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX10-32-NEXT: s_endpgm ; GFX10-32-NEXT: .LBB7_9: ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 @@ -1131,9 +1131,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: .LBB7_8: ; %.return ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] -; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 
-; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX10-64-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX10-64-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX10-64-NEXT: s_endpgm ; GFX10-64-NEXT: .LBB7_9: ; GFX10-64-NEXT: s_mov_b64 exec, 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index c9e61c6ab5d42..bdde7c0975425 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -45,7 +45,6 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) { ; GFX12-NEXT: s_mov_b32 s2, 2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX12-NEXT: s_mov_b32 s2, 0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-NEXT: v_mov_b32_e32 v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index ec6ec4328db87..a2a8ce75d7fb4 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -217,7 +217,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v2, v0 @@ -250,7 +249,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed 
$vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v0 @@ -282,7 +280,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX10-CU-NEXT: s_mov_b32 s6, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v0 @@ -309,7 +306,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 @@ -342,7 +338,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2 ; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v2, s6, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 @@ -374,7 +369,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2 ; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e64 v2, s6, v0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 @@ -402,7 +396,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s4, 2 ; GFX942-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s4, v0 -; 
GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s4, 0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v2 @@ -422,7 +415,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s4, 2 ; GFX942-TGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s4, v0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s4, 0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2 @@ -443,7 +435,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX11-WGP-NEXT: s_mov_b32 s2, 2 ; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v0 @@ -472,7 +463,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX11-CU-NEXT: s_mov_b32 s2, 2 ; GFX11-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v0 @@ -502,7 +492,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX12-WGP-NEXT: s_mov_b32 s2, 0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0 @@ -534,7 +523,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: s_mov_b32 s2, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; 
GFX12-CU-NEXT: s_mov_b32 s2, 0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0 @@ -786,7 +774,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX7-NEXT: flat_load_dword v2, v[1:2] ; GFX7-NEXT: s_mov_b32 s4, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX7-NEXT: s_mov_b32 s4, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v4, v0 @@ -817,7 +804,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2] ; GFX10-WGP-NEXT: s_mov_b32 s4, 2 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX10-WGP-NEXT: s_mov_b32 s4, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v4, v0 @@ -847,7 +833,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX10-CU-NEXT: flat_load_dword v2, v[1:2] ; GFX10-CU-NEXT: s_mov_b32 s4, 2 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX10-CU-NEXT: s_mov_b32 s4, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v4, v0 @@ -873,7 +858,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[1:2] ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v4, v0 @@ -903,7 +887,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s4, 2 ; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v4, s4, v0 -; GFX90A-NOTTGSPLIT-NEXT: 
s_mov_b32 s4, 0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v5, v0 @@ -933,7 +916,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s4, 2 ; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e64 v4, s4, v0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s4, 0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v5, v0 @@ -961,7 +943,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2 ; GFX942-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s2, v0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3 @@ -981,7 +962,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 2 ; GFX942-TGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s2, v0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3 @@ -1002,7 +982,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s0 ; GFX11-WGP-NEXT: s_mov_b32 s0, 2 ; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX11-WGP-NEXT: s_mov_b32 s0, 0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v4, v0 @@ -1030,7 +1009,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, 
s0 ; GFX11-CU-NEXT: s_mov_b32 s0, 2 ; GFX11-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX11-CU-NEXT: s_mov_b32 s0, 0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v4, v0 @@ -1060,7 +1038,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-WGP-NEXT: s_mov_b32 s0, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX12-WGP-NEXT: s_mov_b32 s0, 0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0 @@ -1092,7 +1069,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: s_mov_b32 s0, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX12-CU-NEXT: s_mov_b32 s0, 0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index 2afa5779c7522..c59b0ee83e955 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -173,7 +173,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v2, v0 @@ -207,7 +206,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: 
; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v0 @@ -240,7 +238,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX10-CU-NEXT: s_mov_b32 s6, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v0 @@ -268,7 +265,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 @@ -299,7 +295,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX11-WGP-NEXT: s_mov_b32 s2, 2 ; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v0 @@ -329,7 +324,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX11-CU-NEXT: s_mov_b32 s2, 2 ; GFX11-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v0 @@ -360,7 +354,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX12-WGP-NEXT: s_mov_b32 s2, 0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 
killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0 @@ -395,7 +388,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: s_mov_b32 s2, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX12-CU-NEXT: s_mov_b32 s2, 0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0 @@ -615,7 +607,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX7-NEXT: flat_load_dword v2, v[1:2] ; GFX7-NEXT: s_mov_b32 s4, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX7-NEXT: s_mov_b32 s4, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v4, v0 @@ -647,7 +638,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2] ; GFX10-WGP-NEXT: s_mov_b32 s4, 2 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX10-WGP-NEXT: s_mov_b32 s4, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v4, v0 @@ -678,7 +668,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX10-CU-NEXT: flat_load_dword v2, v[1:2] ; GFX10-CU-NEXT: s_mov_b32 s4, 2 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX10-CU-NEXT: s_mov_b32 s4, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v4, v0 @@ -705,7 +694,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[1:2] ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v4, v0 @@ -735,7 +723,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s0 ; GFX11-WGP-NEXT: s_mov_b32 s0, 2 ; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX11-WGP-NEXT: s_mov_b32 s0, 0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v4, v0 @@ -764,7 +751,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s0 ; GFX11-CU-NEXT: s_mov_b32 s0, 2 ; GFX11-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX11-CU-NEXT: s_mov_b32 s0, 0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v4, v0 @@ -795,7 +781,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-WGP-NEXT: s_mov_b32 s0, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX12-WGP-NEXT: s_mov_b32 s0, 0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0 @@ -832,7 +817,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: s_mov_b32 s0, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX12-CU-NEXT: s_mov_b32 s0, 0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index d74c230488ea2..72cbbc0283545 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -231,7 +231,6 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX6-NEXT: s_mov_b64 s[10:11], s[12:13] ; 
GFX6-NEXT: s_mov_b32 s12, 2 ; GFX6-NEXT: v_lshlrev_b32_e64 v0, s12, v0 -; GFX6-NEXT: s_mov_b32 s12, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 @@ -250,7 +249,6 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v2, v0 @@ -320,7 +318,6 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], s[8:9] ; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v0, s8, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, 0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 @@ -692,7 +689,6 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX6-NEXT: s_mov_b32 s9, 2 ; GFX6-NEXT: v_lshlrev_b32_e64 v1, s9, v0 -; GFX6-NEXT: s_mov_b32 s9, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0 ; GFX6-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v2, v0 @@ -712,7 +708,6 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s5, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s5, v0 -; GFX7-NEXT: s_mov_b32 s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v2, v0 @@ -770,7 +765,6 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[6:7] ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s5, v0 -; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s5, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index c326edfdd490e..2a40ee532be98 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -187,7 +187,6 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_mov_b32 s8, 2 ; GFX6-NEXT: v_lshlrev_b32_e64 v0, s8, v0 -; GFX6-NEXT: s_mov_b32 s8, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 @@ -206,7 +205,6 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v2, v0 @@ -276,7 +274,6 @@ define amdgpu_kernel void @global_volatile_load_1( ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], s[8:9] ; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v0, s8, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, 0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 @@ -560,7 +557,6 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_mov_b32 s5, 2 ; GFX6-NEXT: v_lshlrev_b32_e64 v1, s5, v0 -; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0 ; GFX6-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v2, v0 @@ 
-581,7 +577,6 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s5, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s5, v0 -; GFX7-NEXT: s_mov_b32 s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v2, v0 @@ -642,7 +637,6 @@ define amdgpu_kernel void @global_volatile_store_1( ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[6:7] ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s5, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index bbc04aa46adc5..32862f73d2f29 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -239,26 +239,23 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v18, 
v1 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v3 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 ; GFX9-O0-NEXT: v_ashrrev_i64 v[2:3], s4, v[2:3] ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -268,8 +265,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 @@ -278,20 +275,20 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_writelane_b32 v30, s10, 2 ; GFX9-O0-NEXT: s_mov_b32 s11, s7 ; GFX9-O0-NEXT: v_writelane_b32 v30, s11, 3 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v2 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v10, vcc, s10, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v4, v3, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, 
v4, v0, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v1, vcc -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v11 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[11:12], s[4:5] +; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[12:13], s[4:5] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v3 @@ -301,147 +298,143 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v17 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, s10, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v10, v9, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v7, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v8, vcc -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v12, vcc, s10, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v14, vcc, v9, v10, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v9, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v8, vcc +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v13 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[17:18], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[4:5] -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v10, v14, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v14, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v18 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; 
GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v18 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v18 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v12, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[12:13], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v12, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[13:14], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[13:14], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: 
v_cmp_ne_u64_e64 s[8:9], v[10:11], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[11:12], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 ; GFX9-O0-NEXT: s_mov_b32 s12, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s12 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 -; GFX9-O0-NEXT: v_min_u32_e64 v7, v7, v8 -; GFX9-O0-NEXT: s_mov_b32 s13, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s12 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 -; GFX9-O0-NEXT: v_min_u32_e64 v12, v6, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: v_min_u32_e64 v8, v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s12 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v10 +; GFX9-O0-NEXT: v_min_u32_e64 v13, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v4 ; GFX9-O0-NEXT: s_mov_b64 s[14:15], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 ; GFX9-O0-NEXT: s_mov_b32 s16, s14 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_mov_b32 s13, s15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v9, s[16:17], v9, s16 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s13 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[16:17], v6, v10, s[16:17] -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 
v10, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v7, v8, s[8:9] +; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[16:17], v10, s16 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s13 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[16:17], v7, v11, s[16:17] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[8:9] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[5:6], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s12 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 +; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 -; GFX9-O0-NEXT: v_min_u32_e64 v11, v4, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v2 +; GFX9-O0-NEXT: v_add_u32_e64 v10, v10, s12 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v3 +; GFX9-O0-NEXT: v_min_u32_e64 v11, v10, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def 
$vgpr11_vgpr12 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v11 @@ -552,30 +545,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; 
GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -590,22 +583,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_9 ; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] @@ -645,27 +638,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v6, 
off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -675,30 +668,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 ; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte 
Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded 
Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 ; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] @@ -829,24 +822,24 @@ define i128 @v_srem_i128_vv(i128 %lhs, 
i128 %rhs) { ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX9-O0-NEXT: v_writelane_b32 v30, 
s6, 6 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 @@ -856,30 +849,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 
4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 @@ -887,14 +880,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte 
Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload @@ -970,41 +963,41 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s4 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 
offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 
4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -1043,14 +1036,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; 
GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 ; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] @@ -1092,12 +1085,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5] ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword 
v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 @@ -1111,18 +1104,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: 
s_xor_b64 s[6:7], s[4:5], s[6:7] @@ -1143,205 +1136,193 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[16:17] +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[17:18] ; 
GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v21 ; GFX9-O0-NEXT: v_mul_lo_u32 v8, v1, v0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], s4, v[20:21] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 -; GFX9-O0-NEXT: v_mul_lo_u32 v5, v2, v5 -; GFX9-O0-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v0, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v17 -; GFX9-O0-NEXT: v_add3_u32 v8, v0, v5, v8 +; GFX9-O0-NEXT: v_lshrrev_b64 v[21:22], s4, v[21:22] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 +; GFX9-O0-NEXT: v_mul_lo_u32 v2, v5, v2 +; GFX9-O0-NEXT: v_mad_u64_u32 v[17:18], s[6:7], v5, v0, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-O0-NEXT: v_add3_u32 v8, v0, v2, v8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s5 ; GFX9-O0-NEXT: v_lshlrev_b64 v[8:9], s4, v[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: s_mov_b32 s5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v17 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v16, v5, v8 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-O0-NEXT: v_lshrrev_b64 v[8:9], s4, v[18:19] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14 -; 
GFX9-O0-NEXT: v_mul_lo_u32 v9, v8, v5 -; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], s4, v[14:15] -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 killed $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v18 -; GFX9-O0-NEXT: v_mul_lo_u32 v14, v14, v0 -; GFX9-O0-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v8, v0, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v19 -; GFX9-O0-NEXT: v_add3_u32 v8, v8, v9, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v17, v8, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v0 +; GFX9-O0-NEXT: v_lshrrev_b64 v[8:9], s4, v[19:20] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 +; GFX9-O0-NEXT: v_mul_lo_u32 v14, v9, v8 +; GFX9-O0-NEXT: v_lshrrev_b64 v[15:16], s4, v[15:16] +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v19 +; GFX9-O0-NEXT: v_mul_lo_u32 v15, v15, v0 +; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v9, v0, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v20 +; GFX9-O0-NEXT: v_add3_u32 v14, v9, v14, v15 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 -; GFX9-O0-NEXT: v_lshlrev_b64 v[8:9], s4, v[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 killed $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 
v14, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 killed $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v19, v14, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v18 +; GFX9-O0-NEXT: v_add_co_u32_e64 v17, s[6:7], v15, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v9, s[6:7], v9, v14, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v8, v1, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v19, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v14 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v19 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 
v9, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 -; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17 -; GFX9-O0-NEXT: v_add_co_u32_e64 v16, s[6:7], v14, v15 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v8, s[6:7], v8, v9, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v5, v1, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v21, v14, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v9 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v8, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v20 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 ; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 ; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v19 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed 
$vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_or_b32_e64 v20, v9, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v8 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v5, v2, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-O0-NEXT: v_or_b32_e64 v23, v9, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v0, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v20 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[6:7], v8, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v9, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 -; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v18 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v22, v8, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v5 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v0, v2, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, 0 -; 
GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 -; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[6:7], v8, v9 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v5, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0xffffffff ; GFX9-O0-NEXT: s_mov_b32 s5, s7 -; GFX9-O0-NEXT: v_and_b32_e64 v2, v2, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: v_and_b32_e64 v5, v5, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v8 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: v_and_b32_e64 v18, v5, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 -; GFX9-O0-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v0, v1, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v19, v16, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v0, v1, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v23 +; GFX9-O0-NEXT: ; kill: def $vgpr25 killed $vgpr25 def $vgpr25_vgpr26 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v26 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v24 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec -; 
GFX9-O0-NEXT: v_mov_b32_e32 v23, v5 -; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s4, v[22:23] +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, s5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], s4, v[23:24] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v25 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v22 -; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 -; GFX9-O0-NEXT: v_add_co_u32_e64 v0, s[6:7], v0, v5 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v1, v2, s[6:7] +; GFX9-O0-NEXT: v_or_b32_e64 v23, v1, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v20 +; GFX9-O0-NEXT: v_add_co_u32_e64 v0, s[6:7], v0, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v1, v5, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_lshrrev_b64 v[18:19], s4, v[0:1] -; GFX9-O0-NEXT: v_lshrrev_b64 v[22:23], s4, v[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 -; GFX9-O0-NEXT: v_add_co_u32_e64 v18, s[6:7], v8, v9 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v5, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 
def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[19:20], s4, v[0:1] +; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], s4, v[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-O0-NEXT: v_add_co_u32_e64 v19, s[6:7], v9, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v8, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-O0-NEXT: v_add_co_u32_e64 v19, s[6:7], v9, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v8, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v20 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 -; GFX9-O0-NEXT: v_add_co_u32_e64 v18, s[6:7], v8, v9 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v5, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 -; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[6:7], v8, v9 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v5, s[6:7] +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[6:7], v8, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v9, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 @@ -1402,7 +1383,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -1617,31 +1598,31 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; 
GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill @@ -1651,8 +1632,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -1666,17 +1647,17 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v5, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 @@ -1685,64 +1666,60 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v8, v2, v0 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v2, v0 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; 
GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 -; GFX9-O0-NEXT: s_mov_b32 s8, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 -; GFX9-O0-NEXT: s_mov_b32 s9, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 +; GFX9-O0-NEXT: s_mov_b32 s8, 32 ; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v4, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v16, v5, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v4 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-O0-NEXT: s_mov_b32 s12, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 ; GFX9-O0-NEXT: s_mov_b32 s9, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v7, s[12:13], v7, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[12:13], v4, v8, s[12:13] -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 -; 
GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[10:11], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v5, v6, s[12:13] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[14:15], s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v6, v7, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 +; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 -; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v2 +; GFX9-O0-NEXT: v_add_u32_e64 v10, v10, s8 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v3 +; GFX9-O0-NEXT: v_min_u32_e64 v14, v10, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def 
$vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 @@ -2440,205 +2417,193 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], 
s4, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 -; GFX9-O0-NEXT: v_mul_lo_u32 v4, v5, v2 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshrrev_b64 v[12:13], s4, v[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mul_lo_u32 v3, v6, v3 -; GFX9-O0-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v6, v2, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v13 -; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v4 +; GFX9-O0-NEXT: v_mul_lo_u32 v10, v6, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], s4, v[13:14] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v7, v3 +; GFX9-O0-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v2, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[3:4], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: s_mov_b32 s5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v13 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v3, v4 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 -; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, 
v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 -; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v7 -; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s4, v[10:11] -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-O0-NEXT: v_mul_lo_u32 v10, v10, v4 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v2, v4, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 -; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[17:18], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v3, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v10 +; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s4, v[11:12] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v15 +; GFX9-O0-NEXT: v_mul_lo_u32 v11, v11, v5 +; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v2, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v11 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, 
v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_add_co_u32_e64 v12, s[6:7], v10, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-O0-NEXT: v_add_co_u32_e64 v13, s[6:7], v11, v12 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v7, v5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v10, v6, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 
def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11 -; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v7, v6, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v10, v7, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; 
kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 -; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v16 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v18, v10, v11 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v7 -; GFX9-O0-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v4, v6, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 -; GFX9-O0-NEXT: v_add_co_u32_e64 v6, s[6:7], v6, v15 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v14, s[6:7], v7, v14, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v17 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v19, v11, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v10 +; GFX9-O0-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v5, v7, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v11 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 
v18, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v18 +; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v15, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v7, v12, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v16 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0xffffffff ; GFX9-O0-NEXT: s_mov_b32 s5, s7 -; GFX9-O0-NEXT: v_and_b32_e64 v14, v14, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 +; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: v_and_b32_e64 v16, v15, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v4, v5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: v_and_b32_e64 v17, v12, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v7 +; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v5, v6, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v20 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 -; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] -; 
GFX9-O0-NEXT: v_mov_b32_e32 v5, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v18 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_or_b32_e64 v18, v5, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 -; GFX9-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v4, v15 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v14, s[6:7], v5, v14, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 -; GFX9-O0-NEXT: v_lshrrev_b64 v[16:17], s4, v[4:5] -; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[19:20], s4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-O0-NEXT: v_add_co_u32_e64 v14, s[6:7], v14, v15 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v6, v7, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18 +; 
GFX9-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v5, v12 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v6, v7, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_lshrrev_b64 v[17:18], s4, v[5:6] +; GFX9-O0-NEXT: v_lshrrev_b64 v[19:20], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v18 +; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v15, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v7, v12, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v14, s[6:7], v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 +; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v7, v12 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 -; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v7 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v3, v6, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v12 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v3, v7, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: 
v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], s4, v[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 +; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], s4, v[5:6] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 371ae0384302f..183e4267e582e 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -473,7 +473,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: successors: %bb.7(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: early-clobber %34:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4) - ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %34.sub0, killed %55, 0, implicit $exec + ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %34.sub0, killed %52, 0, implicit $exec ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed %34.sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1 ; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1) @@ -501,14 +501,14 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: 
bb.5.Flow: ; SI-NEXT: successors: %bb.1(0x40000000), %bb.7(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %56:vgpr_32, %bb.6 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %53:vgpr_32, %bb.6 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.6.sw.bb18: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %42:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %41:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) ; SI-NEXT: S_BRANCH %bb.5 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 74e9ab718c3d2..81e17400973a4 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -475,13 +475,11 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline { ; GFX9-O0-NEXT: v_add3_u32 v0, v0, v1, v2 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s35 ; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s34, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: s_mov_b32 s35, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, 
v0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 6347a3783c9c6..72672c8b6efad 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -479,13 +479,11 @@ define i64 @called_i64(i64 %a) noinline { ; GFX9-O0-NEXT: v_add3_u32 v0, v0, v1, v2 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: s_mov_b32 s5, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 @@ -1311,13 +1309,11 @@ define i64 @strict_wwm_called_i64(i64 %a) noinline { ; GFX9-O0-NEXT: v_add3_u32 v0, v0, v1, v2 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: s_mov_b32 s5, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 diff --git a/llvm/test/CodeGen/PowerPC/recipest.ll b/llvm/test/CodeGen/PowerPC/recipest.ll index 4bf572bb02942..504216921110b 100644 --- a/llvm/test/CodeGen/PowerPC/recipest.ll +++ b/llvm/test/CodeGen/PowerPC/recipest.ll @@ -164,7 +164,7 @@ define double @foof_fmf(double %a, float %b) 
nounwind { ; CHECK-P9-NEXT: xsmuldp f1, f1, f0 ; CHECK-P9-NEXT: blr %x = call contract reassoc arcp float @llvm.sqrt.f32(float %b) - %y = fpext float %x to double + %y = fpext arcp float %x to double %r = fdiv contract reassoc arcp double %a, %y ret double %r } diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/non-inst-abs-sub-copyable-value.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/non-inst-abs-sub-copyable-value.ll new file mode 100644 index 0000000000000..aedd9703fcddc --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/non-inst-abs-sub-copyable-value.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=arm64-apple-macosx15.0.0 < %s | FileCheck %s + +define i1 @test(i32 %shr.i.i90, i32 %x) { +; CHECK-LABEL: define i1 @test( +; CHECK-SAME: i32 [[SHR_I_I90:%.*]], i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[SHR_I_I90]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = sub <2 x i32> [[TMP6]], +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[TMP1]], i1 true) +; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt <2 x i64> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: ret i1 [[TMP5]] +; +entry: + %cond.i.i = tail call i32 @llvm.abs.i32(i32 %shr.i.i90, i1 true) + %conv.i.i91 = zext i32 %cond.i.i to i64 + %sub32.i.i = sub i32 %x, 2 + %cond41.i.i = tail call i32 @llvm.abs.i32(i32 %sub32.i.i, i1 true) + %conv42.i.i = zext i32 %cond41.i.i to i64 + %cmp.not.i.2.i.i = icmp ugt i64 %conv.i.i91, 300 + %cmp.not.i.3.i.i = icmp ugt i64 %conv42.i.i, 100 + ret i1 %cmp.not.i.3.i.i +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) 
+declare i32 @llvm.abs.i32(i32, i1 immarg) #0 diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 345bb4d7b91ec..09d2f1ed92554 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -19,6 +19,7 @@ if (current_toolchain == default_toolchain) { "_LIBCPP_ABI_FORCE_ITANIUM=", "_LIBCPP_ABI_FORCE_MICROSOFT=", "_LIBCPP_ABI_VERSION=$libcxx_abi_version", + "_LIBCPP_ASSERTION_SEMANTIC_DEFAULT=_LIBCPP_ASSERTION_SEMANTIC_HARDENING_DEPENDENT", "_LIBCPP_EXTRA_SITE_DEFINES=", "_LIBCPP_HAS_FILESYSTEM=1", "_LIBCPP_HAS_THREADS=1", diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/tablegen.gni b/llvm/utils/gn/secondary/llvm/utils/TableGen/tablegen.gni index 21433642946c1..47f5f4e76219a 100644 --- a/llvm/utils/gn/secondary/llvm/utils/TableGen/tablegen.gni +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/tablegen.gni @@ -32,7 +32,10 @@ template("tablegen") { config_name = "${target_name}_config" config(config_name) { visibility = [ ":$target_name" ] - include_dirs = [ target_gen_dir ] + include_dirs = [ + target_gen_dir, + root_build_dir, + ] } compiled_action(target_name) {